144 lines
6.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# _*_ coding : UTF-8 _*_
# @Time : 2024/11/13 13:42
# @UpdateTime : 2024/11/13 13:42
# @Author : haochen zhong
# @File : CrawlAnhui-sync.py
# @Software : PyCharm
# @Comment : 本程序采集
import random
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup
start_date = datetime.strptime('2017-09-29', '%Y-%m-%d')
"""安徽日报报2018年09月29日开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 链接数据库
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['dfdm_sjribao']
collection = db['anhuiribao']
def main():
collection_names = db.list_collection_names()
# 判断数据表是否存在
if "anhuiribao" not in collection_names:
# 如果不存在则从2017年9月开始爬取
print("安徽日报报数据表不存在,开始采集!")
getData(start_date, end_date)
else:
# 如果存在,则从数据库中获取最后一条记录的日期
last_record = collection.find_one(sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:", last_date_str)
getData(last_date_str, end_date)
def getContent(soup: BeautifulSoup) -> str:
"""
:param soup: BeautifulSoup对象
:return: 文章内容
"""
content = ""
for p in soup.select(".content p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
def getData(start_date: datetime, end_date: datetime):
crawl_num = 0
for i in range((end_date - start_date).days):
date_now = start_date + timedelta(days=i)
date_now_s = date_now.strftime('%Y%m/%d')
base_url = "https://szb.ahnews.com.cn/ahrb/layout/" + date_now_s + '/'
url = base_url + 'node_01.html'
"""https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html"""
try:
print(url)
response = requests.get(url=url, headers=headers, timeout=60)
response.encoding = response.apparent_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'lxml')
for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
banmianming = item.text.split("")[-1].strip()
banmianhao = item.text.split("")[0].replace(" ", "").replace(" ", "").strip()
url1 = base_url + item.get("href")
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
response2 = requests.get(url1, headers=headers, timeout=60)
response2.encoding = response2.apparent_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, 'lxml')
for item2 in soup2.select(".newslist a"):
url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
"""https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
if collection.find_one({"detail_url": url2}, {"_id": False}):
continue
title = item2.text.strip()
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
response3 = requests.get(url=url2, headers=headers, timeout=60)
response3.encoding = response3.apparent_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, 'lxml')
content = getContent(soup3)
try:
title = soup3.select(".newsdetatit h3")[0].text.strip()
except:
title = title
try:
subTitle = soup3.select(".newsdetatext p")[0].text.strip()
except:
subTitle = ""
collection.insert_one({
"title": title,
"subtitle": subTitle,
"preTitle": "",
"author": "",
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': 'empty',
'detail_url': url2,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
time.sleep(random.randint(5, 15))
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
time.sleep(random.randint(5, 15))
print(f"安徽日报---{date_now_s}-----采集完成!")
time.sleep(random.randint(5, 15))
except Exception as e:
print(e)
collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
print(f"安徽日报采集完毕,共采集{crawl_num}条数据!")
main()