diff --git a/国内党媒/CrawlZhongguoshehuibao.py b/国内党媒/CrawlZhongguoshehuibao.py index b70632f..384f220 100644 --- a/国内党媒/CrawlZhongguoshehuibao.py +++ b/国内党媒/CrawlZhongguoshehuibao.py @@ -23,7 +23,7 @@ headers = { 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} # 正则表达式提取年份、月份和数组内容 pattern = r"_htep_(\d{4})_(\d{1,2})=new Array\((.*?)\);" -pattern_url = r']*>' +pattern_url = r'url=([^">]+)' # 链接数据库 client = AsyncIOMotorClient('mongodb://localhost:27017') db = client['buweijiguanbao'] @@ -93,7 +93,7 @@ async def loading(): end_time = time(23, 0, 0) # 23:00:00 # 判断当前时间是否在范围内 if start_time <= now <= end_time: - # print("当前时间在07:00:00--23:00:00范围内,中国可正常采集!") + # print("当前时间在07:00:00--23:00:00范围内,中国社会报可正常采集!") return True else: print("当前时间不在07:00:00--23:00:00范围内,中国社会报无法采集") @@ -142,6 +142,7 @@ async def getData(start_date: datetime, end_date: datetime): match = re.search(pattern_url, response.text, re.IGNORECASE) if match: url = "https://epaper.shehuiwang.cn" + match.group(1) + print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url) response = await client.get(url) response.encoding = response.charset_encoding print(f"一级连接状态:{response.status_code}")