fix: adjust Anhui Daily (安徽日报) reconnection logic

Author: 皓月归尘
Date:   2024-11-12 14:03:59 +08:00
parent 5767dfb591
commit 860f128fe6


@@ -79,77 +79,84 @@ async def getData(start_date: datetime, end_date: datetime):
         for t in range(5):
             try:
                 response = await client.get(url)
-            except Exception as e:
-                await asyncio.sleep(random.randint(8, 20))
-                print(f"尝试第{t + 1}次重连!")
-        response.encoding = response.charset_encoding
-        print(f"一级连接状态:{response.status_code}")
-        if response.status_code == 200:
-            soup = BeautifulSoup(response.text, 'lxml')
-            for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
-                banmianming = item.text.split("：")[-1].strip()
-                banmianhao = item.text.split("：")[0].replace(" ", "").replace("　", "").strip()
-                url1 = base_url + item.get("href")
-                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
-                for t in range(5):
-                    try:
-                        response2 = await client.get(url1)
-                    except Exception as e:
-                        await asyncio.sleep(random.randint(8, 20))
-                        print(f"尝试第{t + 1}次重连!")
-                response2.encoding = response2.charset_encoding
-                print(f"二级连接状态:{response2.status_code}")
-                if response2.status_code == 200:
-                    soup2 = BeautifulSoup(response2.text, 'lxml')
-                    for item2 in soup2.select(".newslist a"):
-                        url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
-                        """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
-                        if await collection.find_one({"detail_url": url2}, {"_id": False}):
-                            continue
-                        title = item2.text.strip()
-                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
-                        # 启用超时重连
-                        for t in range(5):
-                            try:
-                                response3 = await client.get(url2)
-                                if response3.status_code == 200:
-                                    break
-                            except:
-                                # 随机等待重连
-                                await asyncio.sleep(random.randint(8, 20))
-                                print(f"尝试第{t + 1}次重连!")
-                        response3.encoding = response3.charset_encoding
-                        print(f"三级连接状态:{response3.status_code}")
-                        if response3.status_code == 200:
-                            soup3 = BeautifulSoup(response3.text, 'lxml')
-                            content = await getContent(soup3)
-                            try:
-                                title = soup3.select(".newsdetatit h3")[0].text.strip()
-                            except:
-                                title = title
-                            try:
-                                subTitle = soup3.select(".newsdetatext p")[0].text.strip()
-                            except:
-                                subTitle = ""
-                            await collection.insert_one({
-                                "title": title,
-                                "subtitle": subTitle,
-                                "preTitle": "",
-                                "author": "",
-                                "banmianming": banmianming,
-                                "banmianhao": banmianhao,
-                                'keywordlist': 'empty',
-                                'detail_url': url2,
-                                'release_time': date_now,
-                                'insert_timestamp': datetime.today(),
-                                'content': content
-                            })
-                            crawl_num += 1
-                            print(
-                                f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
-                            await asyncio.sleep(random.randint(8, 20))
-                print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
-                await asyncio.sleep(random.randint(8, 20))
+                response.encoding = response.charset_encoding
+                print(f"一级连接状态:{response.status_code}")
+                if response.status_code == 200:
+                    soup = BeautifulSoup(response.text, 'lxml')
+                    for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
+                        banmianming = item.text.split("：")[-1].strip()
+                        banmianhao = item.text.split("：")[0].replace(" ", "").replace("　", "").strip()
+                        url1 = base_url + item.get("href")
+                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
+                        for y in range(5):
+                            try:
+                                response2 = await client.get(url1)
+                                response2.encoding = response2.charset_encoding
+                                print(f"二级连接状态:{response2.status_code}")
+                                if response2.status_code == 200:
+                                    soup2 = BeautifulSoup(response2.text, 'lxml')
+                                    for item2 in soup2.select(".newslist a"):
+                                        url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
+                                        """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
+                                        if await collection.find_one({"detail_url": url2}, {"_id": False}):
+                                            continue
+                                        title = item2.text.strip()
+                                        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
+                                        # 启用超时重连
+                                        for z in range(5):
+                                            try:
+                                                response3 = await client.get(url2)
+                                                response3.encoding = response3.charset_encoding
+                                                print(f"三级连接状态:{response3.status_code}")
+                                                if response3.status_code == 200:
+                                                    soup3 = BeautifulSoup(response3.text, 'lxml')
+                                                    content = await getContent(soup3)
+                                                    try:
+                                                        title = soup3.select(".newsdetatit h3")[0].text.strip()
+                                                    except:
+                                                        title = title
+                                                    try:
+                                                        subTitle = soup3.select(".newsdetatext p")[
+                                                            0].text.strip()
+                                                    except:
+                                                        subTitle = ""
+                                                    await collection.insert_one({
+                                                        "title": title,
+                                                        "subtitle": subTitle,
+                                                        "preTitle": "",
+                                                        "author": "",
+                                                        "banmianming": banmianming,
+                                                        "banmianhao": banmianhao,
+                                                        'keywordlist': 'empty',
+                                                        'detail_url': url2,
+                                                        'release_time': date_now,
+                                                        'insert_timestamp': datetime.today(),
+                                                        'content': content
+                                                    })
+                                                    crawl_num += 1
+                                                    print(
+                                                        f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
+                                                    await asyncio.sleep(random.randint(8, 20))
+                                                    break
+                                            except Exception as e:
+                                                print(e)
+                                                # 随机等待重连
+                                                await asyncio.sleep(random.randint(8, 20))
+                                                print(f"尝试第{z + 1}次重连!")
+                                    break
+                            except Exception as e:
+                                print(e)
+                                await asyncio.sleep(random.randint(8, 20))
+                                print(f"尝试第{t + 1}次重连!")
+                        print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
+                        await asyncio.sleep(random.randint(8, 20))
+                    break
+        except Exception as e:
+            if t >= 4:
+                print(f"尝试第{t + 1}次重连失败,请检查网络环境!")
+                break
+            await asyncio.sleep(random.randint(8, 20))
+            print(f"尝试第{t + 1}次重连!")
         print(f"安徽日报---{date_now_s}-----采集完成!")
         await asyncio.sleep(random.randint(8, 20))
     except Exception as e:
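
The commit now repeats the same fetch-retry shape at all three levels (layout index, layout page, article page). Below is a minimal sketch of that pattern factored into one helper, assuming the client is an httpx.AsyncClient (suggested by the charset_encoding attribute used above); fetch_with_retry and its max_retries parameter are illustrative names, not part of this repository.

import asyncio
import random
from typing import Optional

import httpx


async def fetch_with_retry(client: httpx.AsyncClient, url: str,
                           max_retries: int = 5) -> Optional[httpx.Response]:
    # Sketch only: retry a GET up to max_retries times, sleeping a random
    # 8-20 s between attempts, mirroring the logic this commit inlines
    # at each level of the Anhui Daily crawl.
    for attempt in range(max_retries):
        try:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            if response.status_code == 200:
                return response
        except Exception as e:
            print(e)
        if attempt >= max_retries - 1:
            # Give up after the final attempt, as the new level-1 handler does.
            print(f"尝试第{attempt + 1}次重连失败,请检查网络环境!")
            return None
        await asyncio.sleep(random.randint(8, 20))
        print(f"尝试第{attempt + 1}次重连!")
    return None

With such a helper, each level would reduce to response = await fetch_with_retry(client, url1) plus a None check, instead of restating the five-attempt loop inline.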