# guoneimeitishujucaiji/国内党媒/CrawlZhongguoziranziyuanbao.py
# _*_ coding : UTF-8 _*_
# @Time : 2024/11/23 03:39
# @UpdateTime : 2024/11/23 03:39
# @Author : haochen zhong
# @File : CrawlZhongguoziranziyuanbao.py
# @Software : PyCharm
# @Comment : 本程序中国自然资源报数据
import asyncio
import random
import uuid
from datetime import datetime
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
# The paper's digital archive starts on 2018-05-18, so this is the earliest
# date worth requesting.
start_date = datetime.strptime('2018-05-18', '%Y-%m-%d')
"""中国自然资源报2018年5月18日开始有数据"""
# Upper bound of the crawl window: today (naive local datetime).
end_date = datetime.today()
"""截止到今天"""
# Default request headers. heartbeat() replaces this dict with an
# authenticated set (adds "myidentity", "Site", "Host") after ip-login.
# NOTE(review): the User-Agent *value* itself begins with "User-Agent: " —
# the header name is duplicated inside the value; confirm the server
# actually expects this before changing it.
headers = {
    'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# Connect to the local MongoDB instance; documents go into
# buweijiguanbao.zhongguoziranziyuanbao.
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao']
collection = db['zhongguoziranziyuanbao']
async def main():
    """Entry point: choose the crawl window and launch the crawler.

    If the target collection does not exist yet, crawl the full history
    starting from the module-level ``start_date`` (2018-05-18); otherwise
    resume from the newest ``release_time`` already stored.
    """
    collection_names = await db.list_collection_names()
    # Does the target collection already exist?
    if "zhongguoziranziyuanbao" not in collection_names:
        # First run: crawl everything from the paper's first issue.
        print("中国自然资源报数据表不存在,开始采集!")
        await getData(start_date, end_date)
        return
    # Resume: find the most recent stored release date.
    last_record = await collection.find_one({}, sort=[('release_time', -1)])
    if last_record is None:
        # Collection exists but holds no documents — the original code
        # would crash subscripting None; fall back to a full crawl instead.
        await getData(start_date, end_date)
        return
    last_date = last_record['release_time']
    print("数据库截止时间:", last_date)
    await getData(last_date, end_date)
async def heartbeat():
    """Refresh the crawler's session identity.

    Generates a fresh UUID, performs the site's ip-login handshake with it,
    and on HTTP 200 rebuilds the module-level ``headers`` dict so that
    subsequent requests carry the authenticated identity headers
    ("myidentity", "Site", "Host").
    """
    global headers
    # Fresh random identity for this session.
    uid = str(uuid.uuid4())
    base_headers = {
        'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42',
        "myidentity": uid,
    }
    async with AsyncClient(headers=base_headers) as session:
        response = await session.get(url="http://szb.iziran.net//user/ipLogin")
    # Only adopt the new identity if the login handshake succeeded.
    if response.status_code == 200:
        headers = dict(base_headers)
        headers["Site"] = 'iziran'
        headers["Host"] = "szb.iziran.net"
async def getData(start_date: datetime, end_date: datetime):
    """Crawl every issue of 中国自然资源报 from start_date onward.

    Fetches the list of publication dates, then for each date walks
    page -> article -> article body (three request levels), inserting one
    MongoDB document per article. Re-authenticates via heartbeat() every
    ~64 requests and sleeps 5-15 s between requests to stay polite.

    :param start_date: first issue date to crawl (inclusive)
    :param end_date: end of the crawl window (currently unused in the body;
        the date list is only filtered by start_date)
    :return: None
    """
    crawl_num = 0  # number of articles inserted during this run
    heart_num = 0  # requests issued since the last heartbeat refresh
    date_url = "http://szb.iziran.net//xcms-pc/static/cache/bz1.json"
    async with AsyncClient(headers=headers, timeout=60) as client:
        # Level 0: the cached JSON list of all publication dates.
        response = await client.get(date_url)
        response.encoding = response.charset_encoding
        if response.status_code == 200:
            dateList = response.json()
            # Parse the date strings and keep only those >= start_date.
            dateList = list(
                filter(lambda x: x >= start_date, list(map(lambda x: datetime.strptime(x, '%Y-%m-%d'), dateList))))
            # Reverse so crawling proceeds oldest -> newest.
            dateList = dateList[::-1]
            # Acquire an authenticated identity before the main loop.
            await heartbeat()
            client.headers = headers
            for date in dateList:
                try:
                    # Refresh the session identity roughly every 64 requests.
                    if heart_num > 63:
                        await heartbeat()
                        client.headers = headers
                        heart_num = 0
                    # Level 1: list of pages (版面) for this issue date.
                    url = "http://szb.iziran.net//bz/queryPageByDate"
                    params = {
                        "date": date.strftime("%Y-%m-%d"),
                        "columnId": 1
                    }
                    response = await client.post(url, params=params)
                    heart_num += 1
                    response.encoding = response.charset_encoding
                    print(f"一级连接状态:{response.status_code}")
                    if response.status_code == 200:
                        data = response.json().get("data", {"pages": []}).get("pages", [])
                        for item in data:
                            if heart_num > 63:
                                await heartbeat()
                                client.headers = headers
                                heart_num = 0
                            banmianming = item["name"]    # page (版面) name
                            banmianhao = item["number"]   # page (版面) number
                            # Level 2: list of articles on this page.
                            url1 = f"http://szb.iziran.net//bz/queryArticleByPage"
                            params = {"pageId": item["id"]}
                            response2 = await client.post(url1, params=params)
                            heart_num += 1
                            response2.encoding = response2.charset_encoding
                            print(f"二级连接状态:{response2.status_code}")
                            if response2.status_code == 200:
                                data2 = response2.json().get("data", {"articles": []}).get("articles", [])
                                for item2 in data2:
                                    # Canonical reader URL, used as the dedup key.
                                    url2 = f"http://szb.iziran.net/bz/html/content.html?date={date.strftime('%Y-%m-%d')}&pageIndex={item['index']}&cid=1&articleId={item2['id']}&articleIndex={item2['index']}&pageId={item2['pageId']}"
                                    # Skip articles already stored.
                                    if await collection.find_one({"detail_url": url2}, {"_id": False}):
                                        continue
                                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
                                    # Level 3: fetch the article body itself.
                                    url3 = "http://szb.iziran.net//bz/getArticleById"
                                    params = {"articleId": item2["id"]}
                                    response3 = await client.post(url3, params=params)
                                    heart_num += 1
                                    response3.encoding = response3.charset_encoding
                                    print(f"三级连接状态:{response3.status_code}")
                                    if response3.status_code == 200:
                                        data3 = response3.json().get("data", {})
                                        await collection.insert_one({
                                            "title": data3.get("title", ""),
                                            "subtitle": data3.get("subtitle", ""),
                                            "preTitle": data3.get("introTitle", ""),
                                            "author": data3.get("author", ""),
                                            "banmianming": banmianming,
                                            "banmianhao": banmianhao,
                                            'keywordlist': "empty",
                                            'detail_url': url2,
                                            'release_time': date,
                                            'insert_timestamp': datetime.today(),
                                            'content': data3.get("text", "")
                                        })
                                        crawl_num += 1
                                        print(
                                            f"中国自然资源报---{date.strftime('%Y-%m-%d')}---{banmianming}---{banmianhao}---{data3.get('title', '')}---采集完成!")
                                    # Polite delay between article requests.
                                    await asyncio.sleep(random.randint(5, 15))
                            # Page (版面) finished.
                            # NOTE(review): original indentation was lost; this
                            # print/sleep is reconstructed at per-page level — confirm.
                            print(
                                f"中国自然资源报---{date.strftime('%Y-%m-%d')}---{banmianming}---{banmianhao}-----采集完成!")
                            await asyncio.sleep(random.randint(5, 15))
                    # Issue date finished.
                    print(
                        f"中国自然资源报---{date.strftime('%Y-%m-%d')}-----采集完成!")
                    await asyncio.sleep(random.randint(5, 15))
                except Exception as e:
                    # Best-effort error record: log the failure and insert a
                    # placeholder document so the date/URL is not silently lost.
                    # NOTE(review): `url` here is whatever endpoint was assigned
                    # last before the exception, not necessarily the failing one.
                    print(e)
                    await collection.insert_one(
                        {'banmianhao': 'empty',
                         'banmianming': 'empty',
                         'preTitle': 'empty',
                         'title': 'empty',
                         'subtitle': 'empty',
                         'author': 'empty',
                         'keywordlist': 'empty',
                         'detail_url': url,
                         'release_time': date,
                         'insert_timestamp': datetime.today(),
                         'content': 'empty'}
                    )
    print(f"中国自然资源报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main())