fix: 修复人民邮电报偶尔出现日期匹配失败问题

This commit is contained in:
皓月归尘 2024-11-11 20:28:12 +08:00
parent 8d62f1d2db
commit e134004f2d

View File

@ -9,7 +9,7 @@
import asyncio import asyncio
import random import random
from datetime import datetime, timedelta from datetime import datetime
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from httpx import AsyncClient from httpx import AsyncClient
@ -67,6 +67,7 @@ async def getData(start_date: datetime, end_date: datetime):
""" """
crawl_num = 0 crawl_num = 0
start_date = int(start_date.strftime("%Y%m%d")) start_date = int(start_date.strftime("%Y%m%d"))
try:
async with AsyncClient(headers=headers, timeout=60) as client: async with AsyncClient(headers=headers, timeout=60) as client:
response = await client.get("https://rmydb.cnii.com.cn/period/yearMonthDay.json") response = await client.get("https://rmydb.cnii.com.cn/period/yearMonthDay.json")
response.encoding = response.charset_encoding response.encoding = response.charset_encoding
@ -82,7 +83,7 @@ async def getData(start_date: datetime, end_date: datetime):
for day in dayList: for day in dayList:
try: try:
url = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/data.json" url = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/data.json"
print(url) print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
response = await client.get(url) response = await client.get(url)
response.encoding = response.charset_encoding response.encoding = response.charset_encoding
print(f"二级连接状态:{response.status_code}") print(f"二级连接状态:{response.status_code}")
@ -98,7 +99,7 @@ async def getData(start_date: datetime, end_date: datetime):
author = article["articleAuthor"] author = article["articleAuthor"]
if await collection.find_one({"detail_url": url2}, {"_id": False}): if await collection.find_one({"detail_url": url2}, {"_id": False}):
continue continue
print(url2) print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
response2 = await client.get(url2) response2 = await client.get(url2)
response2.encoding = response2.charset_encoding response2.encoding = response2.charset_encoding
print(f"三级连接状态:{response2.status_code}") print(f"三级连接状态:{response2.status_code}")
@ -122,7 +123,8 @@ async def getData(start_date: datetime, end_date: datetime):
'content': content 'content': content
}) })
crawl_num += 1 crawl_num += 1
print(f"人民邮电报---{day}---{banmianming}---{banmianhao}---{title}---采集完成!") print(
f"人民邮电报---{day}---{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15)) await asyncio.sleep(random.randint(5, 15))
print(f"人民邮电报---{day}---{banmianming}---{banmianhao}-----采集完成!") print(f"人民邮电报---{day}---{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15)) await asyncio.sleep(random.randint(5, 15))
@ -143,6 +145,9 @@ async def getData(start_date: datetime, end_date: datetime):
'insert_timestamp': datetime.today(), 'insert_timestamp': datetime.today(),
'content': 'empty'} 'content': 'empty'}
) )
except Exception as e:
print(e)
print(f"人民邮电报采集完毕,共采集{crawl_num}条数据!") print(f"人民邮电报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main()) asyncio.run(main())