From 68c8c18d8592e965ac0a26ecccd072e576fa6c46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9A=93=E6=9C=88=E5=BD=92=E5=B0=98?= Date: Thu, 21 Nov 2024 11:07:27 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E4=B8=AD=E5=9B=BD?= =?UTF-8?q?=E7=A4=BE=E4=BC=9A=E6=8A=A5=E9=A6=96=E9=A1=B5=E6=AD=A3=E5=88=99?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E8=A7=84=E5=88=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 国内党媒/CrawlZhongguoshehuibao.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/国内党媒/CrawlZhongguoshehuibao.py b/国内党媒/CrawlZhongguoshehuibao.py index b70632f..384f220 100644 --- a/国内党媒/CrawlZhongguoshehuibao.py +++ b/国内党媒/CrawlZhongguoshehuibao.py @@ -23,7 +23,7 @@ headers = { 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} # 正则表达式提取年份、月份和数组内容 pattern = r"_htep_(\d{4})_(\d{1,2})=new Array\((.*?)\);" -pattern_url = r']*>' +pattern_url = r'url=([^">]+)' # 链接数据库 client = AsyncIOMotorClient('mongodb://localhost:27017') db = client['buweijiguanbao'] @@ -93,7 +93,7 @@ async def loading(): end_time = time(23, 0, 0) # 23:00:00 # 判断当前时间是否在范围内 if start_time <= now <= end_time: - # print("当前时间在07:00:00--23:00:00范围内,中国可正常采集!") + # print("当前时间在07:00:00--23:00:00范围内,中国社会报可正常采集!") return True else: print("当前时间不在07:00:00--23:00:00范围内,中国社会报无法采集") @@ -142,6 +142,7 @@ async def getData(start_date: datetime, end_date: datetime): match = re.search(pattern_url, response.text, re.IGNORECASE) if match: url = "https://epaper.shehuiwang.cn" + match.group(1) + print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url) response = await client.get(url) response.encoding = response.charset_encoding print(f"一级连接状态:{response.status_code}")