fix: 修复中国社会报首页正则匹配规则

This commit is contained in:
皓月归尘 2024-11-21 11:07:27 +08:00
parent cce3752564
commit 68c8c18d85

View File

@ -23,7 +23,7 @@ headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 正则表达式提取年份、月份和数组内容 # 正则表达式提取年份、月份和数组内容
pattern = r"_htep_(\d{4})_(\d{1,2})=new Array\((.*?)\);" pattern = r"_htep_(\d{4})_(\d{1,2})=new Array\((.*?)\);"
pattern_url = r'<meta content="[^"]*URL=([^"]+)"[^>]*>' pattern_url = r'url=([^">]+)'
# 链接数据库 # 链接数据库
client = AsyncIOMotorClient('mongodb://localhost:27017') client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao'] db = client['buweijiguanbao']
@ -93,7 +93,7 @@ async def loading():
end_time = time(23, 0, 0) # 23:00:00 end_time = time(23, 0, 0) # 23:00:00
# 判断当前时间是否在范围内 # 判断当前时间是否在范围内
if start_time <= now <= end_time: if start_time <= now <= end_time:
# print("当前时间在07:00:00--23:00:00范围内,中国可正常采集!") # print("当前时间在07:00:00--23:00:00范围内,中国社会报可正常采集!")
return True return True
else: else:
print("当前时间不在07:00:00--23:00:00范围内中国社会报无法采集") print("当前时间不在07:00:00--23:00:00范围内中国社会报无法采集")
@ -142,6 +142,7 @@ async def getData(start_date: datetime, end_date: datetime):
match = re.search(pattern_url, response.text, re.IGNORECASE) match = re.search(pattern_url, response.text, re.IGNORECASE)
if match: if match:
url = "https://epaper.shehuiwang.cn" + match.group(1) url = "https://epaper.shehuiwang.cn" + match.group(1)
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
response = await client.get(url) response = await client.get(url)
response.encoding = response.charset_encoding response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}") print(f"一级连接状态:{response.status_code}")