fix: 修复安徽日报偶尔出现连接中断问题

This commit is contained in:
皓月归尘 2024-11-11 21:29:22 +08:00
parent b7a9a32601
commit 9bc73843fa
9 changed files with 76 additions and 45 deletions

2
.gitignore vendored
View File

@ -17,6 +17,8 @@ env/
.idea/ .idea/
.vscode/ .vscode/
test/
# Compiled source # Compiled source
*.com *.com
*.class *.class

View File

@ -16,11 +16,19 @@ from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2017-09-29', '%Y-%m-%d') start_date = datetime.strptime('2017-09-29', '%Y-%m-%d')
"""安徽日报报2018年09月29日开始有数据""" """安徽日报报2017年09月29日开始有数据"""
end_date = datetime.today() end_date = datetime.today()
"""截止到今天""" """截止到今天"""
headers = { headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,zh-TW;q=0.5,de-DE;q=0.4,de;q=0.3',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
'sec-ch-ua': '"Chromium";v="130", "Microsoft Edge";v="130", "Not?A_Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-gpc': '1',
}
# 连接数据库 # 连接数据库
client = AsyncIOMotorClient('mongodb://localhost:27017') client = AsyncIOMotorClient('mongodb://localhost:27017')
@ -66,9 +74,13 @@ async def getData(start_date: datetime, end_date: datetime):
url = base_url + 'node_01.html' url = base_url + 'node_01.html'
"""https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html""" """https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html"""
try: try:
async with AsyncClient(headers=headers, timeout=60) as client: async with AsyncClient(headers=headers, timeout=60, http2=False) as client:
print(url) print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
try:
response = await client.get(url) response = await client.get(url)
except Exception as e:
await asyncio.sleep(random.randint(8, 20))
print(f"尝试第{t + 1}次重连!")
response.encoding = response.charset_encoding response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}") print(f"一级连接状态:{response.status_code}")
if response.status_code == 200: if response.status_code == 200:
@ -77,8 +89,13 @@ async def getData(start_date: datetime, end_date: datetime):
banmianming = item.text.split("")[-1].strip() banmianming = item.text.split("")[-1].strip()
banmianhao = item.text.split("")[0].replace(" ", "").replace(" ", "").strip() banmianhao = item.text.split("")[0].replace(" ", "").replace(" ", "").strip()
url1 = base_url + item.get("href") url1 = base_url + item.get("href")
print(url1) print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
for t in range(5):
try:
response2 = await client.get(url1) response2 = await client.get(url1)
except Exception as e:
await asyncio.sleep(random.randint(8, 20))
print(f"尝试第{t + 1}次重连!")
response2.encoding = response2.charset_encoding response2.encoding = response2.charset_encoding
print(f"二级连接状态:{response2.status_code}") print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200: if response2.status_code == 200:
@ -89,8 +106,17 @@ async def getData(start_date: datetime, end_date: datetime):
if await collection.find_one({"detail_url": url2}, {"_id": False}): if await collection.find_one({"detail_url": url2}, {"_id": False}):
continue continue
title = item2.text.strip() title = item2.text.strip()
print(url2) print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
# 启用超时重连
for t in range(5):
try:
response3 = await client.get(url2) response3 = await client.get(url2)
if response3.status_code == 200:
break
except:
# 随机等待重连
await asyncio.sleep(random.randint(8, 20))
print(f"尝试第{t + 1}次重连!")
response3.encoding = response3.charset_encoding response3.encoding = response3.charset_encoding
print(f"三级连接状态:{response3.status_code}") print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200: if response3.status_code == 200:
@ -118,12 +144,13 @@ async def getData(start_date: datetime, end_date: datetime):
'content': content 'content': content
}) })
crawl_num += 1 crawl_num += 1
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!") print(
await asyncio.sleep(random.randint(5, 15)) f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(8, 20))
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!") print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15)) await asyncio.sleep(random.randint(8, 20))
print(f"安徽日报---{date_now_s}-----采集完成!") print(f"安徽日报---{date_now_s}-----采集完成!")
await asyncio.sleep(random.randint(5, 15)) await asyncio.sleep(random.randint(8, 20))
except Exception as e: except Exception as e:
print(e) print(e)
await collection.insert_one( await collection.insert_one(
@ -141,5 +168,5 @@ async def getData(start_date: datetime, end_date: datetime):
) )
print(f"安徽日报采集完毕,共采集{crawl_num}条数据!") print(f"安徽日报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main())
asyncio.run(main())

View File

@ -4,13 +4,14 @@
# @File : CrawlHainan.py # @File : CrawlHainan.py
# @Software : PyCharm # @Software : PyCharm
# @Comment : # @Comment :
import re
from bs4 import BeautifulSoup
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random import random
import re
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup
# 数据库起止时间 # 数据库起止时间
start_date = datetime.strptime('2008-02-29', '%Y-%m-%d') start_date = datetime.strptime('2008-02-29', '%Y-%m-%d')

View File

@ -5,11 +5,12 @@
# @Software : PyCharm # @Software : PyCharm
# @Comment : 本程序采集宁夏日报版面数据 # @Comment : 本程序采集宁夏日报版面数据
import requests
import time
import pymongo
import random import random
import time
from datetime import timedelta, datetime from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# 数据库起止时间 # 数据库起止时间

View File

@ -6,13 +6,13 @@
# @Software : PyCharm # @Software : PyCharm
# @Comment : 本程序采集四川日报数字报版面数据 # @Comment : 本程序采集四川日报数字报版面数据
import re
from bs4 import BeautifulSoup
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random import random
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup
# 数据库起止时间 # 数据库起止时间
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d') start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')

View File

@ -3,12 +3,13 @@
# @Author : Haochen Zhong # @Author : Haochen Zhong
# @File : 本程序用于抓取上海新民晚报数据 # @File : 本程序用于抓取上海新民晚报数据
# @Project : Pytharm # @Project : Pytharm
from bs4 import BeautifulSoup, Comment
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random import random
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup, Comment
start_date = datetime.strptime('2018-12-31', '%Y-%m-%d') # 抓取上海新民晚报从2019-01-01至今的数据 start_date = datetime.strptime('2018-12-31', '%Y-%m-%d') # 抓取上海新民晚报从2019-01-01至今的数据
end_date = datetime.today() end_date = datetime.today()

View File

@ -1,4 +1,3 @@
import datetime
import random import random
import time import time

View File

@ -174,7 +174,8 @@ def saveFile():
continue continue
csvData = pd.DataFrame(dataList) csvData = pd.DataFrame(dataList)
csvData.columns = ["数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题", "文章副标题", "主题分类", csvData.columns = ["数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题", "文章副标题", "主题分类",
"索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者", "年份", "索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者",
"年份",
"附件链接", "附件链接",
"文章链接", "来源", "正文内容"] "文章链接", "来源", "正文内容"]
csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号") csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")

View File

@ -1,10 +1,7 @@
import asyncio import asyncio
import datetime
import random import random
import time
import pymongo import pymongo
import requests
from httpx import AsyncClient from httpx import AsyncClient
# 模拟用户访问 # 模拟用户访问
@ -59,4 +56,6 @@ async def getData():
shenzhenzcwj.insert_one(newData) shenzhenzcwj.insert_one(newData)
print(newData["title"], "采集完成") print(newData["title"], "采集完成")
await asyncio.sleep(random.randint(2, 3)) await asyncio.sleep(random.randint(2, 3))
asyncio.run(getData()) asyncio.run(getData())