fix: resolve occasional connection drops in the Anhui Daily (安徽日报) crawler
This commit is contained in:
parent b7a9a32601
commit 9bc73843fa

.gitignore (vendored): 2 lines changed
@@ -17,6 +17,8 @@ env/
 .idea/
 .vscode/
 
+test/
+
 # Compiled source
 *.com
 *.class
@@ -16,11 +16,19 @@ from httpx import AsyncClient
 from motor.motor_asyncio import AsyncIOMotorClient
 
 start_date = datetime.strptime('2017-09-29', '%Y-%m-%d')
-"""安徽日报报2018年09月29日开始有数据"""
+"""安徽日报报2017年09月29日开始有数据"""
 end_date = datetime.today()
 """截止到今天"""
 headers = {
-    'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,zh-TW;q=0.5,de-DE;q=0.4,de;q=0.3',
+    'Connection': 'keep-alive',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
+    'sec-ch-ua': '"Chromium";v="130", "Microsoft Edge";v="130", "Not?A_Brand";v="99"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+    'sec-gpc': '1',
+}
 
 # 链接数据库
 client = AsyncIOMotorClient('mongodb://localhost:27017')
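The removed User-Agent was malformed (the header name was repeated inside the value) and pinned to Chrome 107; the replacement set mimics a current Edge/Chromium browser. A minimal sketch of how these headers pair with the HTTP/1.1-only client introduced in the next hunk; the URL comes from the docstring quoted later in this diff, and http2=False is in fact httpx's default, so spelling it out mainly documents intent:

    import asyncio
    from httpx import AsyncClient

    headers = {  # abbreviated; the full set is in the hunk above
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
    }

    async def fetch(url: str) -> str:
        # http2=False pins httpx to HTTP/1.1 (also its default), matching the commit
        async with AsyncClient(headers=headers, timeout=60, http2=False) as client:
            response = await client.get(url)
            response.raise_for_status()  # surface non-2xx responses as errors
            return response.text

    # front-page URL pattern quoted in the crawler's docstring
    html = asyncio.run(fetch('https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html'))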
@@ -66,9 +74,13 @@ async def getData(start_date: datetime, end_date: datetime):
         url = base_url + 'node_01.html'
         """https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html"""
         try:
-            async with AsyncClient(headers=headers, timeout=60) as client:
-                print(url)
-                response = await client.get(url)
+            async with AsyncClient(headers=headers, timeout=60, http2=False) as client:
+                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
+                try:
+                    response = await client.get(url)
+                except Exception as e:
+                    await asyncio.sleep(random.randint(8, 20))
+                    print(f"尝试第{t + 1}次重连!")
                 response.encoding = response.charset_encoding
                 print(f"一级连接状态:{response.status_code}")
                 if response.status_code == 200:
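A note for readers of this hunk: the new except branch prints 尝试第{t + 1}次重连!, but unlike the second- and third-level fetches in the hunks below, no for t in range(5): loop is added around this first-level request, so the retry counter t is never bound at this level and the branch sleeps once rather than actually retrying.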
@@ -77,8 +89,13 @@ async def getData(start_date: datetime, end_date: datetime):
                         banmianming = item.text.split(":")[-1].strip()
                         banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
                         url1 = base_url + item.get("href")
-                        print(url1)
-                        response2= await client.get(url1)
+                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
+                        for t in range(5):
+                            try:
+                                response2 = await client.get(url1)
+                            except Exception as e:
+                                await asyncio.sleep(random.randint(8, 20))
+                                print(f"尝试第{t + 1}次重连!")
                         response2.encoding = response2.charset_encoding
                         print(f"二级连接状态:{response2.status_code}")
                         if response2.status_code == 200:
@@ -89,8 +106,17 @@ async def getData(start_date: datetime, end_date: datetime):
                                 if await collection.find_one({"detail_url": url2}, {"_id": False}):
                                     continue
                                 title = item2.text.strip()
-                                print(url2)
-                                response3 = await client.get(url2)
+                                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
+                                # 启用超时重连
+                                for t in range(5):
+                                    try:
+                                        response3 = await client.get(url2)
+                                        if response3.status_code == 200:
+                                            break
+                                    except:
+                                        # 随机等待重连
+                                        await asyncio.sleep(random.randint(8, 20))
+                                        print(f"尝试第{t + 1}次重连!")
                                 response3.encoding = response3.charset_encoding
                                 print(f"三级连接状态:{response3.status_code}")
                                 if response3.status_code == 200:
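All three fetch levels now share the same retry shape; only this third-level version breaks out on success. Below is a hedged sketch of that pattern factored into one coroutine: the helper name, the max_tries parameter, and the final RuntimeError are mine, not the commit's, and unlike the committed code it also backs off when the server answers with a non-200 status instead of looping immediately.

    import asyncio
    import random
    from httpx import AsyncClient

    async def fetch_with_retry(client: AsyncClient, url: str, max_tries: int = 5):
        """GET url, retrying up to max_tries times with an 8-20 s pause between tries."""
        for t in range(max_tries):
            try:
                response = await client.get(url)
                if response.status_code == 200:
                    return response
            except Exception:
                pass  # connection dropped; fall through to the backoff below
            await asyncio.sleep(random.randint(8, 20))
            print(f"尝试第{t + 1}次重连!")
        raise RuntimeError(f"{url} still failing after {max_tries} attempts")

    # usage inside the crawler would then read: response3 = await fetch_with_retry(client, url2)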
@@ -101,7 +127,7 @@ async def getData(start_date: datetime, end_date: datetime):
                                     except:
                                         title = title
                                     try:
-                                        subTitle= soup3.select(".newsdetatext p")[0].text.strip()
+                                        subTitle = soup3.select(".newsdetatext p")[0].text.strip()
                                     except:
                                         subTitle = ""
                                     await collection.insert_one({
@@ -118,12 +144,13 @@ async def getData(start_date: datetime, end_date: datetime):
                                         'content': content
                                     })
                                     crawl_num += 1
-                                    print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
-                                    await asyncio.sleep(random.randint(5, 15))
-                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
-                            await asyncio.sleep(random.randint(5, 15))
+                                    print(
+                                        f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
+                                    await asyncio.sleep(random.randint(8, 20))
+                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
+                            await asyncio.sleep(random.randint(8, 20))
                 print(f"安徽日报---{date_now_s}-----采集完成!")
-                await asyncio.sleep(random.randint(5, 15))
+                await asyncio.sleep(random.randint(8, 20))
         except Exception as e:
             print(e)
             await collection.insert_one(
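Every politeness delay widens here from 5-15 s to 8-20 s. random.randint yields whole seconds with both endpoints inclusive; random.uniform would be the drop-in choice if sub-second jitter were ever wanted (an aside, not something this commit does):

    import random

    delay = random.randint(8, 20)      # commit's choice: integer seconds, 8..20 inclusive
    delay = random.uniform(8.0, 20.0)  # float alternative over the same range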
@@ -141,5 +168,5 @@ async def getData(start_date: datetime, end_date: datetime):
            )
    print(f"安徽日报采集完毕,共采集{crawl_num}条数据!")
 
-asyncio.run(main())
 
+asyncio.run(main())
@@ -4,13 +4,14 @@
 # @File : CrawlHainan.py
 # @Software : PyCharm
 # @Comment :
-import re
-from bs4 import BeautifulSoup
-import requests
-from datetime import timedelta, datetime
-import time
-import pymongo
 import random
+import re
+import time
+from datetime import timedelta, datetime
+
+import pymongo
+import requests
+from bs4 import BeautifulSoup
 
 # 数据库起止时间
 start_date = datetime.strptime('2008-02-29', '%Y-%m-%d')
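This and the next three hunks reorder each crawler's imports into the standard PEP 8 layout: standard-library modules first, a blank line, then third-party packages, each group roughly alphabetized (the Sichuan hunk below also drops an import re that is not re-added, presumably unused). A tool like isort produces essentially this layout, though the commit may just as well have used PyCharm's Optimize Imports. The target shape, for reference:

    # standard library
    import random
    import re
    import time
    from datetime import timedelta, datetime

    # third-party
    import pymongo
    import requests
    from bs4 import BeautifulSoup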
@@ -5,11 +5,12 @@
 # @Software : PyCharm
 # @Comment : 本程序采集宁夏日报版面数据
 
-import requests
-import time
-import pymongo
 import random
+import time
 from datetime import timedelta, datetime
+
+import pymongo
+import requests
 from bs4 import BeautifulSoup
 
 # 数据库起止时间
@@ -6,13 +6,13 @@
 # @Software : PyCharm
 # @Comment : 本程序采集四川日报数字报板面数据
 
-import re
-from bs4 import BeautifulSoup
-import requests
-from datetime import timedelta, datetime
-import time
-import pymongo
 import random
+import time
+from datetime import timedelta, datetime
+
+import pymongo
+import requests
+from bs4 import BeautifulSoup
 
 # 数据库起止时间
 start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
@@ -3,12 +3,13 @@
 # @Author : Haochen Zhong
 # @File : 本程序用于抓取上海新民晚报数据
 # @Project : Pytharm
-from bs4 import BeautifulSoup, Comment
-import requests
-from datetime import timedelta, datetime
-import time
-import pymongo
 import random
+import time
+from datetime import timedelta, datetime
+
+import pymongo
+import requests
+from bs4 import BeautifulSoup, Comment
 
 start_date = datetime.strptime('2018-12-31', '%Y-%m-%d')  # 抓取上海新民晚报从2019-01-01到至今的数据
 end_date = datetime.today()
@@ -1,4 +1,3 @@
-import datetime
 import random
 import time
 
@@ -174,10 +174,11 @@ def saveFile():
             continue
         csvData = pd.DataFrame(dataList)
         csvData.columns = ["数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题", "文章副标题", "主题分类",
-                           "索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者", "年份",
+                           "索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者",
+                           "年份",
                            "附件链接",
                            "文章链接", "来源", "正文内容"]
-        csvData.to_csv(totalPath, encoding="utf-8-sig",index_label="序号")
+        csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
         print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")
 
 
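Besides the re-wrapped column list, the to_csv call gains its missing space after the comma. encoding="utf-8-sig" writes a UTF-8 byte-order mark so Excel recognizes the Chinese headers, and index_label="序号" titles the index column. A minimal sketch with a made-up one-row frame, not the crawler's actual data:

    import pandas as pd

    csvData = pd.DataFrame([{"文章标题": "示例", "年份": 2024}])  # hypothetical row
    csvData.to_csv("数据统计表.csv", encoding="utf-8-sig", index_label="序号")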
@@ -1,10 +1,7 @@
 import asyncio
-import datetime
 import random
-import time
 
 import pymongo
-import requests
 from httpx import AsyncClient
 
 # 模拟用户访问
@@ -47,7 +44,7 @@ async def getData():
             print(response2.status_code)
             if response2.status_code == 200:
                 for item2 in response2.json()["articles"][1:]:
-                    if shenzhenzcwj.find_one({"id":item2["id"]}):
+                    if shenzhenzcwj.find_one({"id": item2["id"]}):
                         continue
                     url3 = f"http://www.sz.gov.cn/postmeta/p/{item2['id'] // 1000000}/{item2['id'] // 1000}/{item2['id']}.json"
                     response3 = await client.get(url=url3)
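The postmeta URL shards each article id into million- and thousand-buckets by integer division. Worked through with a made-up id (real ids come from the article-list JSON):

    item_id = 401234567  # hypothetical
    url3 = f"http://www.sz.gov.cn/postmeta/p/{item_id // 1000000}/{item_id // 1000}/{item_id}.json"
    # -> http://www.sz.gov.cn/postmeta/p/401/401234/401234567.json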
@@ -57,6 +54,8 @@ async def getData():
                         data = response3.json()
                         newData = update_json_data(item2, data)
                         shenzhenzcwj.insert_one(newData)
-                        print(newData["title"],"采集完成")
-                        await asyncio.sleep(random.randint(2,3))
+                        print(newData["title"], "采集完成")
+                        await asyncio.sleep(random.randint(2, 3))
+
+
 asyncio.run(getData())