fix: 修复中国社会报首页正则匹配规则
This commit is contained in:
parent
cce3752564
commit
68c8c18d85
@@ -23,7 +23,7 @@ headers = {
|
|||||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||||
# 正则表达式提取年份、月份和数组内容
|
# 正则表达式提取年份、月份和数组内容
|
||||||
pattern = r"_htep_(\d{4})_(\d{1,2})=new Array\((.*?)\);"
|
pattern = r"_htep_(\d{4})_(\d{1,2})=new Array\((.*?)\);"
|
||||||
pattern_url = r'<meta content="[^"]*URL=([^"]+)"[^>]*>'
|
pattern_url = r'url=([^">]+)'
|
||||||
# 链接数据库
|
# 链接数据库
|
||||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||||
db = client['buweijiguanbao']
|
db = client['buweijiguanbao']
|
||||||
@@ -93,7 +93,7 @@ async def loading():
|
|||||||
end_time = time(23, 0, 0) # 23:00:00
|
end_time = time(23, 0, 0) # 23:00:00
|
||||||
# 判断当前时间是否在范围内
|
# 判断当前时间是否在范围内
|
||||||
if start_time <= now <= end_time:
|
if start_time <= now <= end_time:
|
||||||
# print("当前时间在07:00:00--23:00:00范围内,中国可正常采集!")
|
# print("当前时间在07:00:00--23:00:00范围内,中国社会报可正常采集!")
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
print("当前时间不在07:00:00--23:00:00范围内,中国社会报无法采集")
|
print("当前时间不在07:00:00--23:00:00范围内,中国社会报无法采集")
|
||||||
@@ -142,6 +142,7 @@ async def getData(start_date: datetime, end_date: datetime):
|
|||||||
match = re.search(pattern_url, response.text, re.IGNORECASE)
|
match = re.search(pattern_url, response.text, re.IGNORECASE)
|
||||||
if match:
|
if match:
|
||||||
url = "https://epaper.shehuiwang.cn" + match.group(1)
|
url = "https://epaper.shehuiwang.cn" + match.group(1)
|
||||||
|
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
|
||||||
response = await client.get(url)
|
response = await client.get(url)
|
||||||
response.encoding = response.charset_encoding
|
response.encoding = response.charset_encoding
|
||||||
print(f"一级连接状态:{response.status_code}")
|
print(f"一级连接状态:{response.status_code}")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user