From e134004f2dd5f57c8c606837adde35c42eca98ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9A=93=E6=9C=88=E5=BD=92=E5=B0=98?= Date: Mon, 11 Nov 2024 20:28:12 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=BA=BA=E6=B0=91?= =?UTF-8?q?=E9=82=AE=E7=94=B5=E6=8A=A5=E5=81=B6=E5=B0=94=E5=87=BA=E7=8E=B0?= =?UTF-8?q?=E6=97=A5=E6=9C=9F=E5=8C=B9=E9=85=8D=E5=A4=B1=E8=B4=A5=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 国内党媒/CrawlRenminyoudianbao.py | 161 +++++++++++++++--------------- 1 file changed, 83 insertions(+), 78 deletions(-) diff --git a/国内党媒/CrawlRenminyoudianbao.py b/国内党媒/CrawlRenminyoudianbao.py index ebe4a18..e99ad26 100644 --- a/国内党媒/CrawlRenminyoudianbao.py +++ b/国内党媒/CrawlRenminyoudianbao.py @@ -9,7 +9,7 @@ import asyncio import random -from datetime import datetime, timedelta +from datetime import datetime from bs4 import BeautifulSoup from httpx import AsyncClient @@ -20,8 +20,8 @@ start_date = datetime.strptime('2017-09', '%Y-%m') end_date = datetime.today() """截止到今天""" headers = { - "connection":'keep-alive', - "host":"rmydb.cnii.com.cn", + "connection": 'keep-alive', + "host": "rmydb.cnii.com.cn", 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} # 链接数据库 @@ -67,82 +67,87 @@ async def getData(start_date: datetime, end_date: datetime): """ crawl_num = 0 start_date = int(start_date.strftime("%Y%m%d")) - async with AsyncClient(headers=headers, timeout=60) as client: - response = await client.get("https://rmydb.cnii.com.cn/period/yearMonthDay.json") - response.encoding = response.charset_encoding - print(f"一级连接状态:{response.status_code}") - if response.status_code == 200: - data = response.json() - dayList = [] - for value in data.values(): - for item in value.values(): - dayList += item - dayList.sort() - dayList = list(filter(lambda x: x >= start_date, list(map(int, dayList)))) - for day in dayList: - try: - url = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/data.json" - print(url) - response = await client.get(url) - response.encoding = response.charset_encoding - print(f"二级连接状态:{response.status_code}") - if response.status_code == 200: - data = response.json() - for item in data: - banmianming = item["pageName"] - banmianhao = f"第{item['pageNo']}版" - for article in item["onePageArticleList"]: - title = article["mainTitle"] - url2 = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/{day}_{item['pageNo']}/{article['articleHref']}" - """https://rmydb.cnii.com.cn/html/2024/20240906/20240906_001/20240906_001_01_523.html""" - author = article["articleAuthor"] - if await collection.find_one({"detail_url": url2}, {"_id": False}): - continue - print(url2) - response2 = await client.get(url2) - response2.encoding = response2.charset_encoding - print(f"三级连接状态:{response2.status_code}") - if response2.status_code == 200: - soup = BeautifulSoup(response2.text, "lxml") - preTitle = soup.select_one("#PreTitle").text - title = soup.select_one("#Title").text - subTitle = soup.select_one("#SubTitle").text - content = await getContent(soup) - await collection.insert_one({ - "title": title, - "subtitle": subTitle, - "preTitle": preTitle, - "author": author, - "banmianming": banmianming, - "banmianhao": banmianhao, - 'keywordlist': "empty", - 'detail_url': url2, - 'release_time': datetime.strptime(str(day), "%Y%m%d"), - 'insert_timestamp': datetime.today(), - 'content': content - }) 
- crawl_num += 1 - print(f"人民邮电报---{day}---{banmianming}---{banmianhao}---{title}---采集完成!") - await asyncio.sleep(random.randint(5, 15)) - print(f"人民邮电报---{day}---{banmianming}---{banmianhao}-----采集完成!") + try: + async with AsyncClient(headers=headers, timeout=60) as client: + response = await client.get("https://rmydb.cnii.com.cn/period/yearMonthDay.json") + response.encoding = response.charset_encoding + print(f"一级连接状态:{response.status_code}") + if response.status_code == 200: + data = response.json() + dayList = [] + for value in data.values(): + for item in value.values(): + dayList += item + dayList.sort() + dayList = list(filter(lambda x: x >= start_date, list(map(int, dayList)))) + for day in dayList: + try: + url = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/data.json" + print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url) + response = await client.get(url) + response.encoding = response.charset_encoding + print(f"二级连接状态:{response.status_code}") + if response.status_code == 200: + data = response.json() + for item in data: + banmianming = item["pageName"] + banmianhao = f"第{item['pageNo']}版" + for article in item["onePageArticleList"]: + title = article["mainTitle"] + url2 = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/{day}_{item['pageNo']}/{article['articleHref']}" + """https://rmydb.cnii.com.cn/html/2024/20240906/20240906_001/20240906_001_01_523.html""" + author = article["articleAuthor"] + if await collection.find_one({"detail_url": url2}, {"_id": False}): + continue + print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2) + response2 = await client.get(url2) + response2.encoding = response2.charset_encoding + print(f"三级连接状态:{response2.status_code}") + if response2.status_code == 200: + soup = BeautifulSoup(response2.text, "lxml") + preTitle = soup.select_one("#PreTitle").text + title = soup.select_one("#Title").text + subTitle = soup.select_one("#SubTitle").text + content = await getContent(soup) + await collection.insert_one({ + "title": title, + "subtitle": subTitle, + "preTitle": preTitle, + "author": author, + "banmianming": banmianming, + "banmianhao": banmianhao, + 'keywordlist': "empty", + 'detail_url': url2, + 'release_time': datetime.strptime(str(day), "%Y%m%d"), + 'insert_timestamp': datetime.today(), + 'content': content + }) + crawl_num += 1 + print( + f"人民邮电报---{day}---{banmianming}---{banmianhao}---{title}---采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print(f"人民邮电报---{day}---{banmianming}---{banmianhao}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print(f"人民邮电报---{day}-----采集完成!") await asyncio.sleep(random.randint(5, 15)) - print(f"人民邮电报---{day}-----采集完成!") - await asyncio.sleep(random.randint(5, 15)) - except Exception as e: - print(e) - await collection.insert_one( - {'banmianhao': 'empty', - 'banmianming': 'empty', - 'preTitle': 'empty', - 'title': 'empty', - 'subtitle': 'empty', - 'author': 'empty', - 'keywordlist': 'empty', - 'detail_url': url, - 'release_time': datetime.strptime(str(day), "%Y%m%d"), - 'insert_timestamp': datetime.today(), - 'content': 'empty'} - ) + except Exception as e: + print(e) + await collection.insert_one( + {'banmianhao': 'empty', + 'banmianming': 'empty', + 'preTitle': 'empty', + 'title': 'empty', + 'subtitle': 'empty', + 'author': 'empty', + 'keywordlist': 'empty', + 'detail_url': url, + 'release_time': datetime.strptime(str(day), "%Y%m%d"), + 'insert_timestamp': datetime.today(), + 'content': 'empty'} + ) + except Exception as e: + print(e) 
print(f"人民邮电报采集完毕,共采集{crawl_num}条数据!") + asyncio.run(main())