fix: resolve intermittent connection drops in the Anhui Daily (安徽日报) crawler
parent b7a9a32601
commit 9bc73843fa
.gitignore (vendored, 2 lines changed)
@@ -17,6 +17,8 @@ env/
 .idea/
 .vscode/
+
+test/
 
 # Compiled source
 *.com
 *.class
@@ -16,11 +16,19 @@ from httpx import AsyncClient
 from motor.motor_asyncio import AsyncIOMotorClient
 
 start_date = datetime.strptime('2017-09-29', '%Y-%m-%d')
-"""Anhui Daily has data from 2018-09-29 onward"""
+"""Anhui Daily has data from 2017-09-29 onward"""
 end_date = datetime.today()
 """Up to today"""
 headers = {
-    'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,zh-TW;q=0.5,de-DE;q=0.4,de;q=0.3',
+    'Connection': 'keep-alive',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
+    'sec-ch-ua': '"Chromium";v="130", "Microsoft Edge";v="130", "Not?A_Brand";v="99"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+    'sec-gpc': '1',
+}
 
 # Connect to the database
 client = AsyncIOMotorClient('mongodb://localhost:27017')
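
The replacement header block mimics a current Edge 130 browser, client hints included, and the keep-alive setting avoids re-handshaking between requests. Separately from the retry loops added below, httpx can also retry the connect step at the transport level; a minimal sketch, assuming only the headers dict defined above (transport retries cover connection failures, not read timeouts, so they complement rather than replace the loops):

    import httpx

    # Retry the TCP/TLS connect step up to 3 times before raising.
    # Read/write timeouts are not covered and still need app-level handling.
    transport = httpx.AsyncHTTPTransport(retries=3)

    def make_client(headers: dict) -> httpx.AsyncClient:
        # http2=False pins HTTP/1.1, matching this commit's workaround
        return httpx.AsyncClient(headers=headers, timeout=60, http2=False,
                                 transport=transport)
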
@@ -66,9 +74,13 @@ async def getData(start_date: datetime, end_date: datetime):
             url = base_url + 'node_01.html'
             """https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html"""
             try:
-                async with AsyncClient(headers=headers, timeout=60) as client:
-                    print(url)
-                    response = await client.get(url)
+                async with AsyncClient(headers=headers, timeout=60, http2=False) as client:
+                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
+                    try:
+                        response = await client.get(url)
+                    except Exception as e:
+                        await asyncio.sleep(random.randint(8, 20))
+                        print(f"尝试第{t + 1}次重连!")
                     response.encoding = response.charset_encoding
                     print(f"一级连接状态:{response.status_code}")
                     if response.status_code == 200:
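
A note on this hunk: the new except branch prints a retry count using t, but at this first level there is no enclosing "for t in range(5):" loop (the second- and third-level hunks below do add one), so a failed first request leaves both t and response undefined. Factoring the repeated pattern into one helper avoids that drift between copies; a minimal sketch in the commit's own style (the name fetch_with_retry is illustrative, not part of this commit):

    import asyncio
    import random

    from httpx import AsyncClient, Response


    async def fetch_with_retry(client: AsyncClient, url: str,
                               attempts: int = 5) -> Response | None:
        """Retry a GET with random back-off; returns None if every attempt fails."""
        for t in range(attempts):
            try:
                response = await client.get(url)
                if response.status_code == 200:
                    return response
            except Exception:
                pass  # connection dropped; fall through to the back-off below
            await asyncio.sleep(random.randint(8, 20))
            print(f"尝试第{t + 1}次重连!")
        return None

Each level then reduces to "response = await fetch_with_retry(client, url)" followed by a None check.
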
@@ -77,8 +89,13 @@ async def getData(start_date: datetime, end_date: datetime):
                     banmianming = item.text.split(":")[-1].strip()
                     banmianhao = item.text.split(":")[0].replace(" ", "").replace("　", "").strip()
                     url1 = base_url + item.get("href")
-                    print(url1)
-                    response2= await client.get(url1)
+                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
+                    for t in range(5):
+                        try:
+                            response2 = await client.get(url1)
+                        except Exception as e:
+                            await asyncio.sleep(random.randint(8, 20))
+                            print(f"尝试第{t + 1}次重连!")
                     response2.encoding = response2.charset_encoding
                     print(f"二级连接状态:{response2.status_code}")
                     if response2.status_code == 200:
@@ -89,8 +106,17 @@ async def getData(start_date: datetime, end_date: datetime):
                             if await collection.find_one({"detail_url": url2}, {"_id": False}):
                                 continue
                             title = item2.text.strip()
-                            print(url2)
-                            response3 = await client.get(url2)
+                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
+                            # Retry on timeout
+                            for t in range(5):
+                                try:
+                                    response3 = await client.get(url2)
+                                    if response3.status_code == 200:
+                                        break
+                                except:
+                                    # Wait a random interval before reconnecting
+                                    await asyncio.sleep(random.randint(8, 20))
+                                    print(f"尝试第{t + 1}次重连!")
                             response3.encoding = response3.charset_encoding
                             print(f"三级连接状态:{response3.status_code}")
                             if response3.status_code == 200:
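
The duplicate check above (find_one on detail_url before each insert) costs one database round-trip per article. A unique index lets MongoDB enforce the constraint itself; a sketch with motor, which this file already uses (the database and collection names below are placeholders, not taken from the commit):

    from motor.motor_asyncio import AsyncIOMotorClient
    from pymongo.errors import DuplicateKeyError

    client = AsyncIOMotorClient('mongodb://localhost:27017')
    collection = client['newspaper']['anhuiribao']  # placeholder db/collection names


    async def setup() -> None:
        # A unique index makes MongoDB reject repeat detail_url values outright.
        await collection.create_index("detail_url", unique=True)


    async def insert_article(doc: dict) -> None:
        try:
            await collection.insert_one(doc)
        except DuplicateKeyError:
            pass  # already crawled; equivalent to the find_one guard
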
@@ -101,7 +127,7 @@ async def getData(start_date: datetime, end_date: datetime):
                                 except:
                                     title = title
                                 try:
-                                    subTitle= soup3.select(".newsdetatext p")[0].text.strip()
+                                    subTitle = soup3.select(".newsdetatext p")[0].text.strip()
                                 except:
                                     subTitle = ""
                                 await collection.insert_one({
@@ -118,12 +144,13 @@ async def getData(start_date: datetime, end_date: datetime):
                                     'content': content
                                 })
                                 crawl_num += 1
-                                print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
-                                await asyncio.sleep(random.randint(5, 15))
-                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
-                            await asyncio.sleep(random.randint(5, 15))
+                                print(
+                                    f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
+                                await asyncio.sleep(random.randint(8, 20))
+                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
+                            await asyncio.sleep(random.randint(8, 20))
                     print(f"安徽日报---{date_now_s}-----采集完成!")
-                    await asyncio.sleep(random.randint(5, 15))
+                    await asyncio.sleep(random.randint(8, 20))
         except Exception as e:
             print(e)
             await collection.insert_one(
@@ -141,5 +168,5 @@ async def getData(start_date: datetime, end_date: datetime):
     )
     print(f"安徽日报采集完毕,共采集{crawl_num}条数据!")
 
-asyncio.run(main())
-
+
+asyncio.run(main())
@@ -4,13 +4,14 @@
 # @File : CrawlHainan.py
 # @Software : PyCharm
 # @Comment :
-import re
-from bs4 import BeautifulSoup
-import requests
-from datetime import timedelta, datetime
-import time
-import pymongo
-import random
+import random
+import re
+import time
+from datetime import timedelta, datetime
+
+import pymongo
+import requests
+from bs4 import BeautifulSoup
 
 # Start and end dates for the database
 start_date = datetime.strptime('2008-02-29', '%Y-%m-%d')
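
This hunk and the three that follow (the Ningxia, Sichuan, and Xinmin crawlers) only reorder imports into the standard PEP 8 grouping that tools such as isort produce, with behavior unchanged: standard-library modules first, one blank line, then third-party packages, each group alphabetized:

    # Standard library, alphabetized
    import random
    import re
    import time
    from datetime import timedelta, datetime

    # Third-party packages, after one blank line
    import pymongo
    import requests
    from bs4 import BeautifulSoup
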
@@ -5,11 +5,12 @@
 # @Software : PyCharm
 # @Comment : This program crawls Ningxia Daily (宁夏日报) page-layout data
 
-import requests
-import time
-import pymongo
-import random
-from datetime import timedelta, datetime
-from bs4 import BeautifulSoup
+import random
+import time
+from datetime import timedelta, datetime
+
+import pymongo
+import requests
+from bs4 import BeautifulSoup
 
 # Start and end dates for the database
@@ -6,13 +6,13 @@
 # @Software : PyCharm
 # @Comment : This program crawls Sichuan Daily (四川日报) digital-edition layout data
 
-import re
-from bs4 import BeautifulSoup
-import requests
-from datetime import timedelta, datetime
-import time
-import pymongo
-import random
+import random
+import time
+from datetime import timedelta, datetime
+
+import pymongo
+import requests
+from bs4 import BeautifulSoup
 
 # Start and end dates for the database
 start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
@@ -3,12 +3,13 @@
 # @Author : Haochen Zhong
 # @File : This program scrapes Shanghai Xinmin Evening News (新民晚报) data
 # @Project : PyCharm
-from bs4 import BeautifulSoup, Comment
-import requests
-from datetime import timedelta, datetime
-import time
-import pymongo
-import random
+import random
+import time
+from datetime import timedelta, datetime
+
+import pymongo
+import requests
+from bs4 import BeautifulSoup, Comment
 
 start_date = datetime.strptime('2018-12-31', '%Y-%m-%d')  # scrape Xinmin Evening News data from 2019-01-01 to today
 end_date = datetime.today()
@@ -1,4 +1,3 @@
-import datetime
 import random
 import time
 
@@ -174,10 +174,11 @@ def saveFile():
                 continue
             csvData = pd.DataFrame(dataList)
             csvData.columns = ["数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题", "文章副标题", "主题分类",
-                               "索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者", "年份",
+                               "索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者",
+                               "年份",
                                "附件链接",
                                "文章链接", "来源", "正文内容"]
-            csvData.to_csv(totalPath, encoding="utf-8-sig",index_label="序号")
+            csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
     print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")
 
 
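
Besides the spacing fix, two to_csv arguments here do real work: encoding="utf-8-sig" prepends a UTF-8 byte-order mark so Excel detects the encoding and renders the Chinese column headers correctly, and index_label="序号" titles the index column. A self-contained sketch (the DataFrame contents are placeholders):

    import pandas as pd

    df = pd.DataFrame([{"文章标题": "示例", "来源": "测试"}])  # placeholder row
    # utf-8-sig writes a BOM so Excel opens the Chinese headers cleanly;
    # index_label names the leading index column in the CSV.
    df.to_csv("示例.csv", encoding="utf-8-sig", index_label="序号")
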
@@ -1,10 +1,7 @@
 import asyncio
-import datetime
 import random
-import time
 
 import pymongo
-import requests
 from httpx import AsyncClient
 
 # Simulate a real user visit
@@ -47,7 +44,7 @@ async def getData():
             print(response2.status_code)
             if response2.status_code == 200:
                 for item2 in response2.json()["articles"][1:]:
-                    if shenzhenzcwj.find_one({"id":item2["id"]}):
+                    if shenzhenzcwj.find_one({"id": item2["id"]}):
                         continue
                     url3 = f"http://www.sz.gov.cn/postmeta/p/{item2['id'] // 1000000}/{item2['id'] // 1000}/{item2['id']}.json"
                     response3 = await client.get(url=url3)
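
The postmeta URL shards an article id into directories by integer division, so related JSON files cluster by id range. A quick illustration with a made-up id:

    item_id = 1234567890  # hypothetical article id
    url3 = (f"http://www.sz.gov.cn/postmeta/p/{item_id // 1000000}/"
            f"{item_id // 1000}/{item_id}.json")
    print(url3)  # http://www.sz.gov.cn/postmeta/p/1234/1234567/1234567890.json
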
@@ -57,6 +54,8 @@ async def getData():
                     data = response3.json()
                     newData = update_json_data(item2, data)
                     shenzhenzcwj.insert_one(newData)
-                    print(newData["title"],"采集完成")
-                    await asyncio.sleep(random.randint(2,3))
+                    print(newData["title"], "采集完成")
+                    await asyncio.sleep(random.randint(2, 3))
+
+
 asyncio.run(getData())