fix: 调整安徽日报重连逻辑
This commit is contained in:
parent
5767dfb591
commit
860f128fe6
@ -79,77 +79,84 @@ async def getData(start_date: datetime, end_date: datetime):
|
|||||||
for t in range(5):
|
for t in range(5):
|
||||||
try:
|
try:
|
||||||
response = await client.get(url)
|
response = await client.get(url)
|
||||||
except Exception as e:
|
response.encoding = response.charset_encoding
|
||||||
await asyncio.sleep(random.randint(8, 20))
|
print(f"一级连接状态:{response.status_code}")
|
||||||
print(f"尝试第{t + 1}次重连!")
|
if response.status_code == 200:
|
||||||
response.encoding = response.charset_encoding
|
soup = BeautifulSoup(response.text, 'lxml')
|
||||||
print(f"一级连接状态:{response.status_code}")
|
for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
|
||||||
if response.status_code == 200:
|
banmianming = item.text.split(":")[-1].strip()
|
||||||
soup = BeautifulSoup(response.text, 'lxml')
|
banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
|
||||||
for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
|
url1 = base_url + item.get("href")
|
||||||
banmianming = item.text.split(":")[-1].strip()
|
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
|
||||||
banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
|
for y in range(5):
|
||||||
url1 = base_url + item.get("href")
|
|
||||||
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
|
|
||||||
for t in range(5):
|
|
||||||
try:
|
|
||||||
response2 = await client.get(url1)
|
|
||||||
except Exception as e:
|
|
||||||
await asyncio.sleep(random.randint(8, 20))
|
|
||||||
print(f"尝试第{t + 1}次重连!")
|
|
||||||
response2.encoding = response2.charset_encoding
|
|
||||||
print(f"二级连接状态:{response2.status_code}")
|
|
||||||
if response2.status_code == 200:
|
|
||||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
|
||||||
for item2 in soup2.select(".newslist a"):
|
|
||||||
url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
|
|
||||||
"""https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
|
|
||||||
if await collection.find_one({"detail_url": url2}, {"_id": False}):
|
|
||||||
continue
|
|
||||||
title = item2.text.strip()
|
|
||||||
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
|
|
||||||
# 启用超时重连
|
|
||||||
for t in range(5):
|
|
||||||
try:
|
try:
|
||||||
response3 = await client.get(url2)
|
response2 = await client.get(url1)
|
||||||
if response3.status_code == 200:
|
response2.encoding = response2.charset_encoding
|
||||||
|
print(f"二级连接状态:{response2.status_code}")
|
||||||
|
if response2.status_code == 200:
|
||||||
|
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||||
|
for item2 in soup2.select(".newslist a"):
|
||||||
|
url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
|
||||||
|
"""https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
|
||||||
|
if await collection.find_one({"detail_url": url2}, {"_id": False}):
|
||||||
|
continue
|
||||||
|
title = item2.text.strip()
|
||||||
|
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
|
||||||
|
# 启用超时重连
|
||||||
|
for z in range(5):
|
||||||
|
try:
|
||||||
|
response3 = await client.get(url2)
|
||||||
|
response3.encoding = response3.charset_encoding
|
||||||
|
print(f"三级连接状态:{response3.status_code}")
|
||||||
|
if response3.status_code == 200:
|
||||||
|
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||||
|
content = await getContent(soup3)
|
||||||
|
try:
|
||||||
|
title = soup3.select(".newsdetatit h3")[0].text.strip()
|
||||||
|
except:
|
||||||
|
title = title
|
||||||
|
try:
|
||||||
|
subTitle = soup3.select(".newsdetatext p")[
|
||||||
|
0].text.strip()
|
||||||
|
except:
|
||||||
|
subTitle = ""
|
||||||
|
await collection.insert_one({
|
||||||
|
"title": title,
|
||||||
|
"subtitle": subTitle,
|
||||||
|
"preTitle": "",
|
||||||
|
"author": "",
|
||||||
|
"banmianming": banmianming,
|
||||||
|
"banmianhao": banmianhao,
|
||||||
|
'keywordlist': 'empty',
|
||||||
|
'detail_url': url2,
|
||||||
|
'release_time': date_now,
|
||||||
|
'insert_timestamp': datetime.today(),
|
||||||
|
'content': content
|
||||||
|
})
|
||||||
|
crawl_num += 1
|
||||||
|
print(
|
||||||
|
f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||||
|
await asyncio.sleep(random.randint(8, 20))
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
# 随机等待重连
|
||||||
|
await asyncio.sleep(random.randint(8, 20))
|
||||||
|
print(f"尝试第{z + 1}次重连!")
|
||||||
break
|
break
|
||||||
except:
|
except Exception as e:
|
||||||
# 随机等待重连
|
print(e)
|
||||||
await asyncio.sleep(random.randint(8, 20))
|
await asyncio.sleep(random.randint(8, 20))
|
||||||
print(f"尝试第{t + 1}次重连!")
|
print(f"尝试第{t + 1}次重连!")
|
||||||
response3.encoding = response3.charset_encoding
|
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
|
||||||
print(f"三级连接状态:{response3.status_code}")
|
await asyncio.sleep(random.randint(8, 20))
|
||||||
if response3.status_code == 200:
|
break
|
||||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
except Exception as e:
|
||||||
content = await getContent(soup3)
|
if t >= 4:
|
||||||
try:
|
print(f"尝试第{t + 1}次重连失败,请检查网络环境!")
|
||||||
title = soup3.select(".newsdetatit h3")[0].text.strip()
|
break
|
||||||
except:
|
await asyncio.sleep(random.randint(8, 20))
|
||||||
title = title
|
print(f"尝试第{t + 1}次重连!")
|
||||||
try:
|
|
||||||
subTitle = soup3.select(".newsdetatext p")[0].text.strip()
|
|
||||||
except:
|
|
||||||
subTitle = ""
|
|
||||||
await collection.insert_one({
|
|
||||||
"title": title,
|
|
||||||
"subtitle": subTitle,
|
|
||||||
"preTitle": "",
|
|
||||||
"author": "",
|
|
||||||
"banmianming": banmianming,
|
|
||||||
"banmianhao": banmianhao,
|
|
||||||
'keywordlist': 'empty',
|
|
||||||
'detail_url': url2,
|
|
||||||
'release_time': date_now,
|
|
||||||
'insert_timestamp': datetime.today(),
|
|
||||||
'content': content
|
|
||||||
})
|
|
||||||
crawl_num += 1
|
|
||||||
print(
|
|
||||||
f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
|
||||||
await asyncio.sleep(random.randint(8, 20))
|
|
||||||
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
|
|
||||||
await asyncio.sleep(random.randint(8, 20))
|
|
||||||
print(f"安徽日报---{date_now_s}-----采集完成!")
|
print(f"安徽日报---{date_now_s}-----采集完成!")
|
||||||
await asyncio.sleep(random.randint(8, 20))
|
await asyncio.sleep(random.randint(8, 20))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user