fix: resolve occasional date-matching failures in the 人民邮电报 (People's Posts and Telecommunications News) crawler

皓月归尘 2024-11-11 20:28:12 +08:00
parent 8d62f1d2db
commit e134004f2d


@@ -9,7 +9,7 @@
 import asyncio
 import random
-from datetime import datetime, timedelta
+from datetime import datetime
 from bs4 import BeautifulSoup
 from httpx import AsyncClient
@@ -20,8 +20,8 @@ start_date = datetime.strptime('2017-09', '%Y-%m')
 end_date = datetime.today()
 """截止到今天"""
 headers = {
-    "connection":'keep-alive',
-    "host":"rmydb.cnii.com.cn",
+    "connection": 'keep-alive',
+    "host": "rmydb.cnii.com.cn",
     'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
 # 链接数据库
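One pre-existing quirk this hunk leaves untouched: the `User-Agent` value itself begins with the literal text `User-Agent: `, so the header actually sent on the wire duplicates the field name. Purely as a suggestion, not part of this commit, a cleaned-up dict would look like:

headers = {
    "connection": 'keep-alive',
    "host": "rmydb.cnii.com.cn",
    # Value only; the header name must not be repeated inside it.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}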
@@ -67,82 +67,87 @@ async def getData(start_date: datetime, end_date: datetime):
     """
     crawl_num = 0
     start_date = int(start_date.strftime("%Y%m%d"))
-    async with AsyncClient(headers=headers, timeout=60) as client:
-        response = await client.get("https://rmydb.cnii.com.cn/period/yearMonthDay.json")
-        response.encoding = response.charset_encoding
-        print(f"一级连接状态:{response.status_code}")
-        if response.status_code == 200:
-            data = response.json()
-            dayList = []
-            for value in data.values():
-                for item in value.values():
-                    dayList += item
-            dayList.sort()
-            dayList = list(filter(lambda x: x >= start_date, list(map(int, dayList))))
-            for day in dayList:
-                try:
-                    url = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/data.json"
-                    print(url)
-                    response = await client.get(url)
-                    response.encoding = response.charset_encoding
-                    print(f"二级连接状态:{response.status_code}")
-                    if response.status_code == 200:
-                        data = response.json()
-                        for item in data:
-                            banmianming = item["pageName"]
-                            banmianhao = f"{item['pageNo']}"
-                            for article in item["onePageArticleList"]:
-                                title = article["mainTitle"]
-                                url2 = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/{day}_{item['pageNo']}/{article['articleHref']}"
-                                """https://rmydb.cnii.com.cn/html/2024/20240906/20240906_001/20240906_001_01_523.html"""
-                                author = article["articleAuthor"]
-                                if await collection.find_one({"detail_url": url2}, {"_id": False}):
-                                    continue
-                                print(url2)
-                                response2 = await client.get(url2)
-                                response2.encoding = response2.charset_encoding
-                                print(f"三级连接状态:{response2.status_code}")
-                                if response2.status_code == 200:
-                                    soup = BeautifulSoup(response2.text, "lxml")
-                                    preTitle = soup.select_one("#PreTitle").text
-                                    title = soup.select_one("#Title").text
-                                    subTitle = soup.select_one("#SubTitle").text
-                                    content = await getContent(soup)
-                                    await collection.insert_one({
-                                        "title": title,
-                                        "subtitle": subTitle,
-                                        "preTitle": preTitle,
-                                        "author": author,
-                                        "banmianming": banmianming,
-                                        "banmianhao": banmianhao,
-                                        'keywordlist': "empty",
-                                        'detail_url': url2,
-                                        'release_time': datetime.strptime(str(day), "%Y%m%d"),
-                                        'insert_timestamp': datetime.today(),
-                                        'content': content
-                                    })
-                                    crawl_num += 1
-                                    print(f"人民邮电报---{day}---{banmianming}---{banmianhao}---{title}---采集完成!")
-                                    await asyncio.sleep(random.randint(5, 15))
-                            print(f"人民邮电报---{day}---{banmianming}---{banmianhao}-----采集完成!")
-                            await asyncio.sleep(random.randint(5, 15))
-                    print(f"人民邮电报---{day}-----采集完成!")
-                    await asyncio.sleep(random.randint(5, 15))
-                except Exception as e:
-                    print(e)
-                    await collection.insert_one(
-                        {'banmianhao': 'empty',
-                         'banmianming': 'empty',
-                         'preTitle': 'empty',
-                         'title': 'empty',
-                         'subtitle': 'empty',
-                         'author': 'empty',
-                         'keywordlist': 'empty',
-                         'detail_url': url,
-                         'release_time': datetime.strptime(str(day), "%Y%m%d"),
-                         'insert_timestamp': datetime.today(),
-                         'content': 'empty'}
-                    )
-    print(f"人民邮电报采集完毕,共采集{crawl_num}条数据!")
+    try:
+        async with AsyncClient(headers=headers, timeout=60) as client:
+            response = await client.get("https://rmydb.cnii.com.cn/period/yearMonthDay.json")
+            response.encoding = response.charset_encoding
+            print(f"一级连接状态:{response.status_code}")
+            if response.status_code == 200:
+                data = response.json()
+                dayList = []
+                for value in data.values():
+                    for item in value.values():
+                        dayList += item
+                dayList.sort()
+                dayList = list(filter(lambda x: x >= start_date, list(map(int, dayList))))
+                for day in dayList:
+                    try:
+                        url = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/data.json"
+                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
+                        response = await client.get(url)
+                        response.encoding = response.charset_encoding
+                        print(f"二级连接状态:{response.status_code}")
+                        if response.status_code == 200:
+                            data = response.json()
+                            for item in data:
+                                banmianming = item["pageName"]
+                                banmianhao = f"{item['pageNo']}"
+                                for article in item["onePageArticleList"]:
+                                    title = article["mainTitle"]
+                                    url2 = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/{day}_{item['pageNo']}/{article['articleHref']}"
+                                    """https://rmydb.cnii.com.cn/html/2024/20240906/20240906_001/20240906_001_01_523.html"""
+                                    author = article["articleAuthor"]
+                                    if await collection.find_one({"detail_url": url2}, {"_id": False}):
+                                        continue
+                                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
+                                    response2 = await client.get(url2)
+                                    response2.encoding = response2.charset_encoding
+                                    print(f"三级连接状态:{response2.status_code}")
+                                    if response2.status_code == 200:
+                                        soup = BeautifulSoup(response2.text, "lxml")
+                                        preTitle = soup.select_one("#PreTitle").text
+                                        title = soup.select_one("#Title").text
+                                        subTitle = soup.select_one("#SubTitle").text
+                                        content = await getContent(soup)
+                                        await collection.insert_one({
+                                            "title": title,
+                                            "subtitle": subTitle,
+                                            "preTitle": preTitle,
+                                            "author": author,
+                                            "banmianming": banmianming,
+                                            "banmianhao": banmianhao,
+                                            'keywordlist': "empty",
+                                            'detail_url': url2,
+                                            'release_time': datetime.strptime(str(day), "%Y%m%d"),
+                                            'insert_timestamp': datetime.today(),
+                                            'content': content
+                                        })
+                                        crawl_num += 1
+                                        print(
+                                            f"人民邮电报---{day}---{banmianming}---{banmianhao}---{title}---采集完成!")
+                                        await asyncio.sleep(random.randint(5, 15))
+                                print(f"人民邮电报---{day}---{banmianming}---{banmianhao}-----采集完成!")
+                                await asyncio.sleep(random.randint(5, 15))
+                        print(f"人民邮电报---{day}-----采集完成!")
+                        await asyncio.sleep(random.randint(5, 15))
+                    except Exception as e:
+                        print(e)
+                        await collection.insert_one(
+                            {'banmianhao': 'empty',
+                             'banmianming': 'empty',
+                             'preTitle': 'empty',
+                             'title': 'empty',
+                             'subtitle': 'empty',
+                             'author': 'empty',
+                             'keywordlist': 'empty',
+                             'detail_url': url,
+                             'release_time': datetime.strptime(str(day), "%Y%m%d"),
+                             'insert_timestamp': datetime.today(),
+                             'content': 'empty'}
+                        )
+    except Exception as e:
+        print(e)
+    print(f"人民邮电报采集完毕,共采集{crawl_num}条数据!")
 
 asyncio.run(main())
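Taken in isolation, the shape of the fix: the per-day try/except already kept a bad day from aborting the loop, but an exception raised while fetching or decoding yearMonthDay.json itself, presumably the source of the occasional date-matching failure the commit message describes, escaped getData() and crashed the whole run. A minimal, self-contained sketch of the new structure, with hypothetical names (crawl stands in for getData, and the per-day body is elided):

import asyncio

from httpx import AsyncClient


async def crawl() -> None:
    crawl_num = 0
    try:
        # Exceptions from opening the session, fetching the date index,
        # or decoding its JSON no longer propagate out of the coroutine.
        async with AsyncClient(timeout=60) as client:
            response = await client.get(
                "https://rmydb.cnii.com.cn/period/yearMonthDay.json")
            data = response.json()  # raises if the body is not valid JSON
            for value in data.values():
                for days in value.values():
                    for day in days:
                        try:
                            ...  # per-day crawl: an error only skips this day
                            crawl_num += 1
                        except Exception as e:
                            print(e)
    except Exception as e:
        print(e)
    # The summary line is reached even when the index fetch itself failed.
    print(f"crawl finished, {crawl_num} items")


asyncio.run(crawl())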