fix: adjust the Anhui Daily (安徽日报) reconnection logic
parent 5767dfb591
commit 860f128fe6
@@ -79,77 +79,84 @@ async def getData(start_date: datetime, end_date: datetime):
             for t in range(5):
                 try:
                     response = await client.get(url)
-                except Exception as e:
-                    await asyncio.sleep(random.randint(8, 20))
-                    print(f"尝试第{t + 1}次重连!")
-            response.encoding = response.charset_encoding
-            print(f"一级连接状态:{response.status_code}")
-            if response.status_code == 200:
-                soup = BeautifulSoup(response.text, 'lxml')
-                for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
-                    banmianming = item.text.split(":")[-1].strip()
-                    banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
-                    url1 = base_url + item.get("href")
-                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
-                    for t in range(5):
-                        try:
-                            response2 = await client.get(url1)
-                        except Exception as e:
-                            await asyncio.sleep(random.randint(8, 20))
-                            print(f"尝试第{t + 1}次重连!")
-                    response2.encoding = response2.charset_encoding
-                    print(f"二级连接状态:{response2.status_code}")
-                    if response2.status_code == 200:
-                        soup2 = BeautifulSoup(response2.text, 'lxml')
-                        for item2 in soup2.select(".newslist a"):
-                            url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
-                            """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
-                            if await collection.find_one({"detail_url": url2}, {"_id": False}):
-                                continue
-                            title = item2.text.strip()
-                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
-                            # enable timeout reconnection
-                            for t in range(5):
-                                try:
-                                    response3 = await client.get(url2)
-                                    if response3.status_code == 200:
-                                        break
-                                except:
-                                    # wait a random interval before reconnecting
-                                    await asyncio.sleep(random.randint(8, 20))
-                                    print(f"尝试第{t + 1}次重连!")
-                            response3.encoding = response3.charset_encoding
-                            print(f"三级连接状态:{response3.status_code}")
-                            if response3.status_code == 200:
-                                soup3 = BeautifulSoup(response3.text, 'lxml')
-                                content = await getContent(soup3)
-                                try:
-                                    title = soup3.select(".newsdetatit h3")[0].text.strip()
-                                except:
-                                    title = title
-                                try:
-                                    subTitle = soup3.select(".newsdetatext p")[0].text.strip()
-                                except:
-                                    subTitle = ""
-                                await collection.insert_one({
-                                    "title": title,
-                                    "subtitle": subTitle,
-                                    "preTitle": "",
-                                    "author": "",
-                                    "banmianming": banmianming,
-                                    "banmianhao": banmianhao,
-                                    'keywordlist': 'empty',
-                                    'detail_url': url2,
-                                    'release_time': date_now,
-                                    'insert_timestamp': datetime.today(),
-                                    'content': content
-                                })
-                                crawl_num += 1
-                                print(
-                                    f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
-                                await asyncio.sleep(random.randint(8, 20))
-                        print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
-                        await asyncio.sleep(random.randint(8, 20))
+                    response.encoding = response.charset_encoding
+                    print(f"一级连接状态:{response.status_code}")
+                    if response.status_code == 200:
+                        soup = BeautifulSoup(response.text, 'lxml')
+                        for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
+                            banmianming = item.text.split(":")[-1].strip()
+                            banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
+                            url1 = base_url + item.get("href")
+                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
+                            for y in range(5):
+                                try:
+                                    response2 = await client.get(url1)
+                                    response2.encoding = response2.charset_encoding
+                                    print(f"二级连接状态:{response2.status_code}")
+                                    if response2.status_code == 200:
+                                        soup2 = BeautifulSoup(response2.text, 'lxml')
+                                        for item2 in soup2.select(".newslist a"):
+                                            url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
+                                            """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
+                                            if await collection.find_one({"detail_url": url2}, {"_id": False}):
+                                                continue
+                                            title = item2.text.strip()
+                                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
+                                            # enable timeout reconnection
+                                            for z in range(5):
+                                                try:
+                                                    response3 = await client.get(url2)
+                                                    response3.encoding = response3.charset_encoding
+                                                    print(f"三级连接状态:{response3.status_code}")
+                                                    if response3.status_code == 200:
+                                                        soup3 = BeautifulSoup(response3.text, 'lxml')
+                                                        content = await getContent(soup3)
+                                                        try:
+                                                            title = soup3.select(".newsdetatit h3")[0].text.strip()
+                                                        except:
+                                                            title = title
+                                                        try:
+                                                            subTitle = soup3.select(".newsdetatext p")[0].text.strip()
+                                                        except:
+                                                            subTitle = ""
+                                                        await collection.insert_one({
+                                                            "title": title,
+                                                            "subtitle": subTitle,
+                                                            "preTitle": "",
+                                                            "author": "",
+                                                            "banmianming": banmianming,
+                                                            "banmianhao": banmianhao,
+                                                            'keywordlist': 'empty',
+                                                            'detail_url': url2,
+                                                            'release_time': date_now,
+                                                            'insert_timestamp': datetime.today(),
+                                                            'content': content
+                                                        })
+                                                        crawl_num += 1
+                                                        print(
+                                                            f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
+                                                        await asyncio.sleep(random.randint(8, 20))
+                                                    break
+                                                except Exception as e:
+                                                    print(e)
+                                                    # wait a random interval before reconnecting
+                                                    await asyncio.sleep(random.randint(8, 20))
+                                                    print(f"尝试第{z + 1}次重连!")
+                                        print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
+                                        await asyncio.sleep(random.randint(8, 20))
+                                except Exception as e:
+                                    print(e)
+                                    await asyncio.sleep(random.randint(8, 20))
+                                    print(f"尝试第{t + 1}次重连!")
+                    break
+                except Exception as e:
+                    if t >= 4:
+                        print(f"尝试第{t + 1}次重连失败,请检查网络环境!")
+                        break
+                    await asyncio.sleep(random.randint(8, 20))
+                    print(f"尝试第{t + 1}次重连!")
             print(f"安徽日报---{date_now_s}-----采集完成!")
             await asyncio.sleep(random.randint(8, 20))
         except Exception as e:
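
The restructured hunk boils down to one pattern: wrap each client.get call and the processing of its page in a try block, retry up to five times with a random 8-20 second pause after a failure, and break out of the retry loop once an attempt succeeds. Below is a minimal standalone sketch of that pattern, assuming an httpx.AsyncClient like the one this spider uses; the helper name fetch_with_retry and its parameters are illustrative, not code from this repository.

import asyncio
import random

import httpx


async def fetch_with_retry(client: httpx.AsyncClient, url: str, attempts: int = 5) -> httpx.Response:
    # Illustrative sketch (not repository code): retry a GET up to `attempts`
    # times, sleeping a random 8-20 s between failures, then give up.
    for i in range(attempts):
        try:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            return response
        except Exception as e:
            print(e)
            if i >= attempts - 1:
                # last attempt failed: re-raise, mirroring the `if t >= 4` branch above
                raise
            await asyncio.sleep(random.randint(8, 20))
            print(f"retry attempt {i + 1}")

Keeping the page parsing inside the same try block, as the adjusted logic does, also turns a transient parse or connection error into another retry instead of an unhandled exception.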
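The find_one check before insert_one is what makes re-crawling a date idempotent: an article is only written if its detail_url has not been stored yet. A sketch of that guard in isolation, assuming a Motor async MongoDB collection like the collection used above; the connection URI, database name, and field set are placeholders.

from datetime import datetime

from motor.motor_asyncio import AsyncIOMotorClient

# Placeholder connection details, not the repository's configuration.
collection = AsyncIOMotorClient("mongodb://localhost:27017")["news"]["ahrb"]


async def save_if_new(detail_url: str, doc: dict) -> bool:
    # Skip documents whose detail_url is already stored (the `continue` above);
    # otherwise insert them with an insertion timestamp.
    if await collection.find_one({"detail_url": detail_url}, {"_id": False}):
        return False
    doc.update({"detail_url": detail_url, "insert_timestamp": datetime.today()})
    await collection.insert_one(doc)
    return True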