# _*_ coding : UTF-8 _*_ # @Time : 2024/11/23 03:39 # @UpdateTime : 2024/11/23 03:39 # @Author : haochen zhong # @File : CrawlZhongguoziranziyuanbao.py # @Software : PyCharm # @Comment : 本程序中国自然资源报数据 import asyncio import random import uuid from datetime import datetime from httpx import AsyncClient from motor.motor_asyncio import AsyncIOMotorClient start_date = datetime.strptime('2018-05-18', '%Y-%m-%d') """中国自然资源报2018年5月18日开始有数据""" end_date = datetime.today() """截止到今天""" headers = { 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} # 链接数据库 client = AsyncIOMotorClient('mongodb://localhost:27017') db = client['buweijiguanbao'] collection = db['zhongguoziranziyuanbao'] async def main(): collection_names = await db.list_collection_names() # 判断数据表是否存在 if "zhongguoziranziyuanbao" not in collection_names: # 如果不存在,则从2017年9月开始爬取 print("中国自然资源报数据表不存在,开始采集!") await getData(start_date, end_date) else: # 如果存在,则从数据库中获取最后一条记录的日期 last_record = await collection.find_one({}, sort=[('release_time', -1)]) last_date_str = last_record['release_time'] print("数据库截止时间:", last_date_str) await getData(last_date_str, end_date) async def heartbeat(): """ 心跳检测 :return: """ uid = uuid.uuid4().__str__() """随机UUID""" async with AsyncClient(headers={ 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42', "myidentity": uid}) as client: response = await client.get(url="http://szb.iziran.net//user/ipLogin") if response.status_code == 200: global headers headers = { 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42', "myidentity": uid, "Site": 'iziran', "Host": "szb.iziran.net" } async def getData(start_date: datetime, end_date: datetime): """ :param start_date: 开始日期 :param end_date: 结束日期 :return: None """ crawl_num = 0 heart_num = 0 date_url = "http://szb.iziran.net//xcms-pc/static/cache/bz1.json" async with AsyncClient(headers=headers, timeout=60) as client: response = await client.get(date_url) response.encoding = response.charset_encoding if response.status_code == 200: dateList = response.json() dateList = list( filter(lambda x: x >= start_date, list(map(lambda x: datetime.strptime(x, '%Y-%m-%d'), dateList)))) dateList = dateList[::-1] await heartbeat() client.headers = headers for date in dateList: try: if heart_num > 63: await heartbeat() client.headers = headers heart_num = 0 url = "http://szb.iziran.net//bz/queryPageByDate" params = { "date": date.strftime("%Y-%m-%d"), "columnId": 1 } response = await client.post(url, params=params) heart_num += 1 response.encoding = response.charset_encoding print(f"一级连接状态:{response.status_code}") if response.status_code == 200: data = response.json().get("data", {"pages": []}).get("pages", []) for item in data: if heart_num > 63: await heartbeat() client.headers = headers heart_num = 0 banmianming = item["name"] banmianhao = item["number"] url1 = f"http://szb.iziran.net//bz/queryArticleByPage" params = {"pageId": item["id"]} response2 = await client.post(url1, params=params) heart_num += 1 response2.encoding = response2.charset_encoding print(f"二级连接状态:{response2.status_code}") if response2.status_code == 200: data2 = response2.json().get("data", {"articles": []}).get("articles", []) for item2 in data2: url2 = f"http://szb.iziran.net/bz/html/content.html?date={date.strftime('%Y-%m-%d')}&pageIndex={item['index']}&cid=1&articleId={item2['id']}&articleIndex={item2['index']}&pageId={item2['pageId']}" if await collection.find_one({"detail_url": url2}, {"_id": False}): continue print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2) url3 = "http://szb.iziran.net//bz/getArticleById" params = {"articleId": item2["id"]} response3 = await client.post(url3, params=params) heart_num += 1 response3.encoding = response3.charset_encoding print(f"三级连接状态:{response3.status_code}") if response3.status_code == 200: data3 = response3.json().get("data", {}) await collection.insert_one({ "title": data3.get("title", ""), "subtitle": data3.get("subtitle", ""), "preTitle": data3.get("introTitle", ""), "author": data3.get("author", ""), "banmianming": banmianming, "banmianhao": banmianhao, 'keywordlist': "empty", 'detail_url': url2, 'release_time': date, 'insert_timestamp': datetime.today(), 'content': data3.get("text", "") }) crawl_num += 1 print( f"中国自然资源报---{date.strftime('%Y-%m-%d')}---{banmianming}---{banmianhao}---{data3.get('title', '')}---采集完成!") await asyncio.sleep(random.randint(5, 15)) print( f"中国自然资源报---{date.strftime('%Y-%m-%d')}---{banmianming}---{banmianhao}-----采集完成!") await asyncio.sleep(random.randint(5, 15)) print( f"中国自然资源报---{date.strftime('%Y-%m-%d')}-----采集完成!") await asyncio.sleep(random.randint(5, 15)) except Exception as e: print(e) await collection.insert_one( {'banmianhao': 'empty', 'banmianming': 'empty', 'preTitle': 'empty', 'title': 'empty', 'subtitle': 'empty', 'author': 'empty', 'keywordlist': 'empty', 'detail_url': url, 'release_time': date, 'insert_timestamp': datetime.today(), 'content': 'empty'} ) print(f"中国自然资源报采集完毕,共采集{crawl_num}条数据!") asyncio.run(main())