From 860f128fe68409bd5ad04ba00e3e88145a46dbf9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9A=93=E6=9C=88=E5=BD=92=E5=B0=98?=
Date: Tue, 12 Nov 2024 14:03:59 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E8=B0=83=E6=95=B4=E5=AE=89=E5=BE=BD?=
 =?UTF-8?q?=E6=97=A5=E6=8A=A5=E9=87=8D=E8=BF=9E=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 地方政策/报刊/CrawlAnhui.py | 141 +++++++++++++++++++-----------------
 1 file changed, 74 insertions(+), 67 deletions(-)

diff --git a/地方政策/报刊/CrawlAnhui.py b/地方政策/报刊/CrawlAnhui.py
index 3a08470..cbdcda2 100644
--- a/地方政策/报刊/CrawlAnhui.py
+++ b/地方政策/报刊/CrawlAnhui.py
@@ -79,77 +79,84 @@ async def getData(start_date: datetime, end_date: datetime):
             for t in range(5):
                 try:
                     response = await client.get(url)
-                except Exception as e:
-                    await asyncio.sleep(random.randint(8, 20))
-                    print(f"尝试第{t + 1}次重连!")
-            response.encoding = response.charset_encoding
-            print(f"一级连接状态:{response.status_code}")
-            if response.status_code == 200:
-                soup = BeautifulSoup(response.text, 'lxml')
-                for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
-                    banmianming = item.text.split(":")[-1].strip()
-                    banmianhao = item.text.split(":")[0].replace(" ", "").replace("　", "").strip()
-                    url1 = base_url + item.get("href")
-                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
-                    for t in range(5):
-                        try:
-                            response2 = await client.get(url1)
-                        except Exception as e:
-                            await asyncio.sleep(random.randint(8, 20))
-                            print(f"尝试第{t + 1}次重连!")
-                    response2.encoding = response2.charset_encoding
-                    print(f"二级连接状态:{response2.status_code}")
-                    if response2.status_code == 200:
-                        soup2 = BeautifulSoup(response2.text, 'lxml')
-                        for item2 in soup2.select(".newslist a"):
-                            url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
-                            """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
-                            if await collection.find_one({"detail_url": url2}, {"_id": False}):
-                                continue
-                            title = item2.text.strip()
-                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
-                            # 启用超时重连
-                            for t in range(5):
+                    response.encoding = response.charset_encoding
+                    print(f"一级连接状态:{response.status_code}")
+                    if response.status_code == 200:
+                        soup = BeautifulSoup(response.text, 'lxml')
+                        for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
+                            banmianming = item.text.split(":")[-1].strip()
+                            banmianhao = item.text.split(":")[0].replace(" ", "").replace("　", "").strip()
+                            url1 = base_url + item.get("href")
+                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
+                            for y in range(5):
                                 try:
-                                    response3 = await client.get(url2)
-                                    if response3.status_code == 200:
+                                    response2 = await client.get(url1)
+                                    response2.encoding = response2.charset_encoding
+                                    print(f"二级连接状态:{response2.status_code}")
+                                    if response2.status_code == 200:
+                                        soup2 = BeautifulSoup(response2.text, 'lxml')
+                                        for item2 in soup2.select(".newslist a"):
+                                            url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
+                                            """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
+                                            if await collection.find_one({"detail_url": url2}, {"_id": False}):
+                                                continue
+                                            title = item2.text.strip()
+                                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
+                                            # 启用超时重连
+                                            for z in range(5):
+                                                try:
+                                                    response3 = await client.get(url2)
+                                                    response3.encoding = response3.charset_encoding
+                                                    print(f"三级连接状态:{response3.status_code}")
+                                                    if response3.status_code == 200:
+                                                        soup3 = BeautifulSoup(response3.text, 'lxml')
+                                                        content = await getContent(soup3)
+                                                        try:
+                                                            title = soup3.select(".newsdetatit h3")[0].text.strip()
+                                                        except:
+                                                            title = title
+                                                        try:
+                                                            subTitle = soup3.select(".newsdetatext p")[
+                                                                0].text.strip()
+                                                        except:
+                                                            subTitle = ""
+                                                        await collection.insert_one({
+                                                            "title": title,
+                                                            "subtitle": subTitle,
+                                                            "preTitle": "",
+                                                            "author": "",
+                                                            "banmianming": banmianming,
+                                                            "banmianhao": banmianhao,
+                                                            'keywordlist': 'empty',
+                                                            'detail_url': url2,
+                                                            'release_time': date_now,
+                                                            'insert_timestamp': datetime.today(),
+                                                            'content': content
+                                                        })
+                                                        crawl_num += 1
+                                                        print(
+                                                            f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
+                                                        await asyncio.sleep(random.randint(8, 20))
+                                                    break
+                                                except Exception as e:
+                                                    print(e)
+                                                    # 随机等待重连
+                                                    await asyncio.sleep(random.randint(8, 20))
+                                                    print(f"尝试第{z + 1}次重连!")
                                         break
-                                except:
-                                    # 随机等待重连
+                                except Exception as e:
+                                    print(e)
                                     await asyncio.sleep(random.randint(8, 20))
                                     print(f"尝试第{t + 1}次重连!")
-                            response3.encoding = response3.charset_encoding
-                            print(f"三级连接状态:{response3.status_code}")
-                            if response3.status_code == 200:
-                                soup3 = BeautifulSoup(response3.text, 'lxml')
-                                content = await getContent(soup3)
-                                try:
-                                    title = soup3.select(".newsdetatit h3")[0].text.strip()
-                                except:
-                                    title = title
-                                try:
-                                    subTitle = soup3.select(".newsdetatext p")[0].text.strip()
-                                except:
-                                    subTitle = ""
-                                await collection.insert_one({
-                                    "title": title,
-                                    "subtitle": subTitle,
-                                    "preTitle": "",
-                                    "author": "",
-                                    "banmianming": banmianming,
-                                    "banmianhao": banmianhao,
-                                    'keywordlist': 'empty',
-                                    'detail_url': url2,
-                                    'release_time': date_now,
-                                    'insert_timestamp': datetime.today(),
-                                    'content': content
-                                })
-                                crawl_num += 1
-                                print(
-                                    f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
-                                await asyncio.sleep(random.randint(8, 20))
-                    print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
-                    await asyncio.sleep(random.randint(8, 20))
+                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
+                            await asyncio.sleep(random.randint(8, 20))
+                    break
+                except Exception as e:
+                    if t >= 4:
+                        print(f"尝试第{t + 1}次重连失败,请检查网络环境!")
+                        break
+                    await asyncio.sleep(random.randint(8, 20))
+                    print(f"尝试第{t + 1}次重连!")
             print(f"安徽日报---{date_now_s}-----采集完成!")
             await asyncio.sleep(random.randint(8, 20))
         except Exception as e:
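
Review note, not part of the patch: the change applies one retry shape at each request level. The request, its processing, and a final break now all sit inside the try, so a successful attempt stops the retry loop, a failed attempt sleeps and retries, and the fifth failure gives up. A minimal sketch of that pattern follows, assuming client is an httpx.AsyncClient (as the use of charset_encoding in CrawlAnhui.py suggests); fetch_with_retry and its parameters are illustrative names only, not code from the repository.

import asyncio
import random

import httpx


async def fetch_with_retry(client: httpx.AsyncClient, url: str, attempts: int = 5):
    """Sketch of the patched retry shape: break on success, back off and retry on failure."""
    for i in range(attempts):
        try:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            # ...parse and store the page here, inside the try...
            return response  # success: stop retrying (the patch breaks out of the loop here)
        except Exception as e:
            print(e)
            if i >= attempts - 1:
                # Last attempt failed: give up, as the patch does after the fifth try.
                print(f"Reconnect attempt {i + 1} failed, check the network.")
                return None
            await asyncio.sleep(random.randint(8, 20))  # random back-off before the next attempt
            print(f"Reconnect attempt {i + 1}.")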