fix: adjust the 安徽日报 (Anhui Daily) reconnection logic

皓月归尘 2024-11-12 14:03:59 +08:00
parent 5767dfb591
commit 860f128fe6


@@ -79,77 +79,84 @@ async def getData(start_date: datetime, end_date: datetime):
# Retry with random backoff. The whole fetch-and-parse step now lives inside
# the try block, so a failed attempt is retried as a unit and the loop exits
# with break on success.
for t in range(5):
    try:
        response = await client.get(url)
        response.encoding = response.charset_encoding
        print(f"一级连接状态:{response.status_code}")
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'lxml')
            # First level: one link per page (版面) of the day's issue.
            for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
                # NOTE: the delimiter characters were garbled in this diff view;
                # a full-width colon and full-width space (U+3000) are assumed here.
                banmianming = item.text.split(":")[-1].strip()
                banmianhao = item.text.split(":")[0].replace(" ", "").replace("\u3000", "").strip()
                url1 = base_url + item.get("href")
                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
                for y in range(5):
                    try:
                        response2 = await client.get(url1)
                        response2.encoding = response2.charset_encoding
                        print(f"二级连接状态:{response2.status_code}")
                        if response2.status_code == 200:
                            soup2 = BeautifulSoup(response2.text, 'lxml')
                            # Second level: one link per article on the page.
                            for item2 in soup2.select(".newslist a"):
                                # e.g. https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html
                                url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
                                # Skip articles that were already collected.
                                if await collection.find_one({"detail_url": url2}, {"_id": False}):
                                    continue
                                title = item2.text.strip()
                                print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
                                # Third level: fetch the article itself, with the same retry.
                                for z in range(5):
                                    try:
                                        response3 = await client.get(url2)
                                        response3.encoding = response3.charset_encoding
                                        print(f"三级连接状态:{response3.status_code}")
                                        if response3.status_code == 200:
                                            soup3 = BeautifulSoup(response3.text, 'lxml')
                                            content = await getContent(soup3)
                                            try:
                                                title = soup3.select(".newsdetatit h3")[0].text.strip()
                                            except IndexError:
                                                pass  # keep the title taken from the list page
                                            try:
                                                subTitle = soup3.select(".newsdetatext p")[0].text.strip()
                                            except IndexError:
                                                subTitle = ""
                                            await collection.insert_one({
                                                "title": title,
                                                "subtitle": subTitle,
                                                "preTitle": "",
                                                "author": "",
                                                "banmianming": banmianming,
                                                "banmianhao": banmianhao,
                                                'keywordlist': 'empty',
                                                'detail_url': url2,
                                                'release_time': date_now,
                                                'insert_timestamp': datetime.today(),
                                                'content': content
                                            })
                                            crawl_num += 1
                                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
                                            await asyncio.sleep(random.randint(8, 20))
                                        break
                                    except Exception as e:
                                        print(e)
                                        # Wait a random interval before reconnecting.
                                        await asyncio.sleep(random.randint(8, 20))
                                        print(f"尝试第{z + 1}次重连!")
                            print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
                            await asyncio.sleep(random.randint(8, 20))
                        break
                    except Exception as e:
                        print(e)
                        await asyncio.sleep(random.randint(8, 20))
                        print(f"尝试第{y + 1}次重连!")
            print(f"安徽日报---{date_now_s}-----采集完成!")
            await asyncio.sleep(random.randint(8, 20))
        break
    except Exception as e:
        # After the fifth failure, give up instead of retrying forever.
        if t >= 4:
            print(f"尝试第{t + 1}次重连失败,请检查网络环境!")
            break
        await asyncio.sleep(random.randint(8, 20))
        print(f"尝试第{t + 1}次重连!")
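
The heart of this change is the retry pattern: up to five attempts per URL, a random 8-20 second sleep between attempts, and a hard stop after the fifth failure. The same pattern now appears at all three nesting levels, so it could be factored into one helper. Below is a minimal sketch, assuming an httpx.AsyncClient (consistent with the `charset_encoding` attribute used above, though the client construction is not shown in this hunk); `fetch_with_retry` is a hypothetical name, not part of the commit.

import asyncio
import random
from typing import Optional

import httpx


async def fetch_with_retry(client: httpx.AsyncClient, url: str,
                           attempts: int = 5) -> Optional[httpx.Response]:
    """Fetch `url`, retrying on errors with a random 8-20 s backoff."""
    for attempt in range(1, attempts + 1):
        try:
            response = await client.get(url)
            if response.status_code == 200:
                return response
        except Exception as exc:
            print(exc)
        if attempt == attempts:
            # Mirror the commit's behaviour: give up after the last attempt.
            print(f"尝试第{attempt}次重连失败,请检查网络环境!")
            return None
        await asyncio.sleep(random.randint(8, 20))
        print(f"尝试第{attempt}次重连!")
    return None

Each level would then reduce to something like `response = await fetch_with_retry(client, url1)` plus a `None` check, keeping the backoff policy in one place. Note one deliberate difference: this sketch also retries non-200 responses, whereas the committed code retries only on exceptions and breaks out of the loop on any completed request.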