From 860f128fe68409bd5ad04ba00e3e88145a46dbf9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9A=93=E6=9C=88=E5=BD=92=E5=B0=98?=
Date: Tue, 12 Nov 2024 14:03:59 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E8=B0=83=E6=95=B4=E5=AE=89=E5=BE=BD?=
 =?UTF-8?q?=E6=97=A5=E6=8A=A5=E9=87=8D=E8=BF=9E=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 地方政策/报刊/CrawlAnhui.py | 141 +++++++++++++++++++-----------------
 1 file changed, 74 insertions(+), 67 deletions(-)

diff --git a/地方政策/报刊/CrawlAnhui.py b/地方政策/报刊/CrawlAnhui.py
index 3a08470..cbdcda2 100644
--- a/地方政策/报刊/CrawlAnhui.py
+++ b/地方政策/报刊/CrawlAnhui.py
@@ -79,77 +79,84 @@ async def getData(start_date: datetime, end_date: datetime):
             for t in range(5):
                 try:
                     response = await client.get(url)
-                except Exception as e:
-                    await asyncio.sleep(random.randint(8, 20))
-                    print(f"尝试第{t + 1}次重连!")
-            response.encoding = response.charset_encoding
-            print(f"一级连接状态:{response.status_code}")
-            if response.status_code == 200:
-                soup = BeautifulSoup(response.text, 'lxml')
-                for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
-                    banmianming = item.text.split(":")[-1].strip()
-                    banmianhao = item.text.split(":")[0].replace(" ", "").replace("　", "").strip()
-                    url1 = base_url + item.get("href")
-                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
-                    for t in range(5):
-                        try:
-                            response2 = await client.get(url1)
-                        except Exception as e:
-                            await asyncio.sleep(random.randint(8, 20))
-                            print(f"尝试第{t + 1}次重连!")
-                    response2.encoding = response2.charset_encoding
-                    print(f"二级连接状态:{response2.status_code}")
-                    if response2.status_code == 200:
-                        soup2 = BeautifulSoup(response2.text, 'lxml')
-                        for item2 in soup2.select(".newslist a"):
-                            url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
-                            """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
-                            if await collection.find_one({"detail_url": url2}, {"_id": False}):
-                                continue
-                            title = item2.text.strip()
-                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
-                            # 启用超时重连
-                            for t in range(5):
+                    response.encoding = response.charset_encoding
+                    print(f"一级连接状态:{response.status_code}")
+                    if response.status_code == 200:
+                        soup = BeautifulSoup(response.text, 'lxml')
+                        for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
+                            banmianming = item.text.split(":")[-1].strip()
+                            banmianhao = item.text.split(":")[0].replace(" ", "").replace("　", "").strip()
+                            url1 = base_url + item.get("href")
+                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
+                            for y in range(5):
                                 try:
-                                    response3 = await client.get(url2)
-                                    if response3.status_code == 200:
+                                    response2 = await client.get(url1)
+                                    response2.encoding = response2.charset_encoding
+                                    print(f"二级连接状态:{response2.status_code}")
+                                    if response2.status_code == 200:
+                                        soup2 = BeautifulSoup(response2.text, 'lxml')
+                                        for item2 in soup2.select(".newslist a"):
+                                            url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
+                                            """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
+                                            if await collection.find_one({"detail_url": url2}, {"_id": False}):
+                                                continue
+                                            title = item2.text.strip()
+                                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
+                                            # 启用超时重连
+                                            for z in range(5):
+                                                try:
+                                                    response3 = await client.get(url2)
+                                                    response3.encoding = response3.charset_encoding
+                                                    print(f"三级连接状态:{response3.status_code}")
+                                                    if response3.status_code == 200:
+                                                        soup3 = BeautifulSoup(response3.text, 'lxml')
+                                                        content = await getContent(soup3)
+                                                        try:
+                                                            title = soup3.select(".newsdetatit h3")[0].text.strip()
+                                                        except:
+                                                            title = title
+                                                        try:
+                                                            subTitle = soup3.select(".newsdetatext p")[
+                                                                0].text.strip()
+                                                        except:
+                                                            subTitle = ""
+                                                        await collection.insert_one({
+                                                            "title": title,
+                                                            "subtitle": subTitle,
+                                                            "preTitle": "",
+                                                            "author": "",
+                                                            "banmianming": banmianming,
+                                                            "banmianhao": banmianhao,
+                                                            'keywordlist': 'empty',
+                                                            'detail_url': url2,
+                                                            'release_time': date_now,
+                                                            'insert_timestamp': datetime.today(),
+                                                            'content': content
+                                                        })
+                                                        crawl_num += 1
+                                                        print(
+                                                            f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
+                                                        await asyncio.sleep(random.randint(8, 20))
+                                                    break
+                                                except Exception as e:
+                                                    print(e)
+                                                    # 随机等待重连
+                                                    await asyncio.sleep(random.randint(8, 20))
+                                                    print(f"尝试第{z + 1}次重连!")
                                         break
-                                except:
-                                    # 随机等待重连
+                                except Exception as e:
+                                    print(e)
                                     await asyncio.sleep(random.randint(8, 20))
                                     print(f"尝试第{t + 1}次重连!")
-                            response3.encoding = response3.charset_encoding
-                            print(f"三级连接状态:{response3.status_code}")
-                            if response3.status_code == 200:
-                                soup3 = BeautifulSoup(response3.text, 'lxml')
-                                content = await getContent(soup3)
-                                try:
-                                    title = soup3.select(".newsdetatit h3")[0].text.strip()
-                                except:
-                                    title = title
-                                try:
-                                    subTitle = soup3.select(".newsdetatext p")[0].text.strip()
-                                except:
-                                    subTitle = ""
-                                await collection.insert_one({
-                                    "title": title,
-                                    "subtitle": subTitle,
-                                    "preTitle": "",
-                                    "author": "",
-                                    "banmianming": banmianming,
-                                    "banmianhao": banmianhao,
-                                    'keywordlist': 'empty',
-                                    'detail_url': url2,
-                                    'release_time': date_now,
-                                    'insert_timestamp': datetime.today(),
-                                    'content': content
-                                })
-                                crawl_num += 1
-                                print(
-                                    f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
-                                await asyncio.sleep(random.randint(8, 20))
-                    print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
-                    await asyncio.sleep(random.randint(8, 20))
+                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
+                            await asyncio.sleep(random.randint(8, 20))
+                    break
+                except Exception as e:
+                    if t >= 4:
+                        print(f"尝试第{t + 1}次重连失败,请检查网络环境!")
+                        break
+                    await asyncio.sleep(random.randint(8, 20))
+                    print(f"尝试第{t + 1}次重连!")
             print(f"安徽日报---{date_now_s}-----采集完成!")
             await asyncio.sleep(random.randint(8, 20))
         except Exception as e:
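
Review note, not part of the patch: the change applies one retry shape at each request level. The request, its processing, and a final break now all sit inside the try, so a successful attempt stops the retry loop, a failed attempt sleeps and retries, and the fifth failure gives up. A minimal sketch of that pattern follows, assuming client is an httpx.AsyncClient (as the use of charset_encoding in CrawlAnhui.py suggests); fetch_with_retry and its parameters are illustrative names only, not code from the repository.

import asyncio
import random

import httpx


async def fetch_with_retry(client: httpx.AsyncClient, url: str, attempts: int = 5):
    """Sketch of the patched retry shape: break on success, back off and retry on failure."""
    for i in range(attempts):
        try:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            # ...parse and store the page here, inside the try...
            return response  # success: stop retrying (the patch breaks out of the loop here)
        except Exception as e:
            print(e)
            if i >= attempts - 1:
                # Last attempt failed: give up, as the patch does after the fifth try.
                print(f"Reconnect attempt {i + 1} failed, check the network.")
                return None
            await asyncio.sleep(random.randint(8, 20))  # random back-off before the next attempt
            print(f"Reconnect attempt {i + 1}.")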