fix: 修复中国社会报首页正则匹配规则
This commit is contained in:
parent
cce3752564
commit
68c8c18d85
@ -23,7 +23,7 @@ headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
# 正则表达式提取年份、月份和数组内容
|
||||
pattern = r"_htep_(\d{4})_(\d{1,2})=new Array\((.*?)\);"
|
||||
pattern_url = r'<meta content="[^"]*URL=([^"]+)"[^>]*>'
|
||||
pattern_url = r'url=([^">]+)'
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['buweijiguanbao']
|
||||
@ -93,7 +93,7 @@ async def loading():
|
||||
end_time = time(23, 0, 0) # 23:00:00
|
||||
# 判断当前时间是否在范围内
|
||||
if start_time <= now <= end_time:
|
||||
# print("当前时间在07:00:00--23:00:00范围内,中国可正常采集!")
|
||||
# print("当前时间在07:00:00--23:00:00范围内,中国社会报可正常采集!")
|
||||
return True
|
||||
else:
|
||||
print("当前时间不在07:00:00--23:00:00范围内,中国社会报无法采集")
|
||||
@ -142,6 +142,7 @@ async def getData(start_date: datetime, end_date: datetime):
|
||||
match = re.search(pattern_url, response.text, re.IGNORECASE)
|
||||
if match:
|
||||
url = "https://epaper.shehuiwang.cn" + match.group(1)
|
||||
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user