# _*_ coding : UTF-8 _*_
# @Time : 2025/02/07 22:33
# @UpdateTime : 2025/02/07 22:33
# @Author : sonder
# @File : CrawlZhongguoqingnianbao.py
# @Software : PyCharm
# @Comment : Crawler for the China Youth Daily (中国青年报) digital edition; articles are stored in MongoDB.
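"""
Crawls the China Youth Daily (中国青年报) digital edition at zqb.cyol.com.

Two page layouts are handled, matching the two URL schemes used in getData():
  * issues before 2024-12: https://zqb.cyol.com/html/YYYY-MM/period.xml
  * issues from 2024-12 on: https://zqb.cyol.com/pc/layout/YYYYMM/period.xml
Each issue's pages and articles are walked and the results are written to the
MongoDB collection ``zydm.zgqnb``.
"""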
import asyncio
import random
from datetime import datetime
from bs4 import BeautifulSoup, Comment
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient

start_date = datetime.strptime('2011-01', '%Y-%m')
"""China Youth Daily has usable data starting from 2011-01"""
end_date = datetime.today()
"""Crawl up to today"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
pattern = r'<a href=([^>]+)><div[^>]*>([^<]+)</div></a>'
# Connect to the database
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['zydm']
collection = db['zgqnb']
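# Note: getData() deduplicates by querying detail_url for every article link.
# If those lookups become slow as the collection grows, a one-off index on that
# field (e.g. ``await collection.create_index('detail_url')`` inside main())
# would help; this is an optional optimisation, not something the crawler requires.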


async def main():
    collection_names = await db.list_collection_names()
    # Check whether the collection already exists
    if "zgqnb" not in collection_names:
        print("中国青年报数据表不存在,开始采集!")
        await getData(start_date, end_date)
    else:
        # If it exists, resume from the release_time (a datetime) of the most recent record
        last_record = await collection.find_one({}, sort=[('release_time', -1)])
        last_date = last_record['release_time']
        print("数据库截止时间:", last_date)
        await getData(last_date, end_date)


async def getContent(soup: BeautifulSoup) -> str:
    """
    Extract the article body text.

    :param soup: BeautifulSoup object of an article page
    :return: article content
    """
    content = ""
    # The body sits in #ozoom on the older layout and in #content on the newer one;
    # check both so the same helper works for either page type.
    for p in soup.select("#ozoom p"):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    for p in soup.select("#content p"):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


async def getData(start_date: datetime, end_date: datetime):
    """
    Crawl every issue between the two dates.

    :param start_date: start date
    :param end_date: end date
    :return: None
    """
    crawl_num = 0
    # Build the list of months to crawl
    months = []
    # Add every month from the start date up to the end date
    current_date = start_date
    current_date = current_date.replace(day=1)
    while current_date <= end_date:
        months.append(current_date)
        # Advance by one month
        if current_date.month == 12:
            current_date = current_date.replace(year=current_date.year + 1, month=1)
        else:
            current_date = current_date.replace(month=current_date.month + 1)
    # Walk the month list
    for month in months:
        if month < datetime(2024, 12, 1):
            # Old layout (issues before 2024-12): build the period.xml URL for the month
            url = f'https://zqb.cyol.com/html/{month.strftime("%Y-%m")}/period.xml'
            """https://zqb.cyol.com/html/2024-01/period.xml"""
            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
            async with AsyncClient(headers=headers, timeout=60) as client:
                response = await client.get(url)
                response.encoding = response.charset_encoding
                print(f"一级连接状态:{response.status_code}")
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'xml')
                    for period in soup.select("period"):
                        period_name = datetime.strptime(period.find("period_name").text.strip(), "%Y-%m-%d")
                        front_page = period.find("front_page").text.strip()
                        try:
                            url1 = f"https://zqb.cyol.com/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
                            """https://zqb.cyol.com/html/2024-01/01/nbs.D110000zgqnb_01.htm"""
                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
                            response2 = await client.get(url1)
                            response2.encoding = response2.charset_encoding
                            print(f"二级连接状态:{response2.status_code}")
                            if response2.status_code == 200:
                                soup2 = BeautifulSoup(response2.text, 'lxml')
                                for item in soup2.select("#pageLink"):
                                    # Page links read like "第01版：要闻"; split on the fullwidth colon
                                    banmianming = item.text.split("：")[-1]
                                    banmianhao = item.text.split("：")[0]
                                    url2 = (f"https://zqb.cyol.com/html/{period_name.strftime('%Y-%m/%d')}/"
                                            + item.get("href").replace("./", "").strip())
                                    """https://zqb.cyol.com/html/2024-01/01/nbs.D110000zgqnb_01.htm"""
                                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
                                    response3 = await client.get(url2)
                                    response3.encoding = response3.charset_encoding
                                    print(f"三级连接状态:{response3.status_code}")
                                    if response3.status_code == 200:
                                        soup3 = BeautifulSoup(response3.text, 'lxml')
                                        for item2 in soup3.select("#titleList li a"):
                                            url3 = (f"https://zqb.cyol.com/html/{period_name.strftime('%Y-%m/%d')}/"
                                                    + item2.get("href").replace("./", "").strip())
                                            """https://zqb.cyol.com/html/2024-01/01/nw.D110000zgqnb_20240101_1-01.htm"""
                                            if await collection.find_one({"detail_url": url3}, {"_id": False}):
                                                continue
                                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url3)
                                            response4 = await client.get(url3)
                                            response4.encoding = response4.charset_encoding
                                            print(f"四级连接状态:{response4.status_code}")
                                            if response4.status_code == 200:
                                                soup4 = BeautifulSoup(response4.text, 'html.parser')
                                                # Defaults in case the metadata comment is missing or malformed
                                                title = ""
                                                perTitle = ""
                                                subTitle = ""
                                                author = ""
                                                # The article metadata is embedded in an HTML comment (enpproperty block)
                                                try:
                                                    comments = soup4.find_all(
                                                        string=lambda text: isinstance(text, Comment))
                                                    for comment in comments:
                                                        if 'enpproperty' in comment:
                                                            enpproperty_content = comment.strip()
                                                            inner_soup = BeautifulSoup(enpproperty_content,
                                                                                       'html.parser')
                                                            # Extract the metadata fields
                                                            title = inner_soup.find('title').text.strip()
                                                            perTitle = inner_soup.find(
                                                                'founder-introtitle').text.strip()
                                                            subTitle = inner_soup.find("founder-subtitle").text.strip()
                                                            author = inner_soup.find('author').text.strip()
                                                except Exception:
                                                    perTitle = ""
                                                    subTitle = ""
                                                    author = ""
                                                content = await getContent(soup4)
                                                await collection.insert_one({
                                                    "title": title,
                                                    "subtitle": subTitle,
                                                    "preTitle": perTitle,
                                                    "author": author,
                                                    "banmianming": banmianming,
                                                    "banmianhao": banmianhao,
                                                    'detail_url': url3,
                                                    'release_time': period_name,
                                                    'insert_timestamp': datetime.today(),
                                                    'content': content
                                                })
                                                crawl_num += 1
                                                print(
                                                    f"中国青年报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
                                                await asyncio.sleep(random.randint(5, 15))
                                        print(
                                            f"中国青年报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
                                        await asyncio.sleep(random.randint(5, 15))
                            print(
                                f"中国青年报---{period_name.strftime('%Y-%m-%d')}------采集完成!")
                            await asyncio.sleep(random.randint(5, 15))
                        except Exception as e:
                            print(e)
                            # Record a placeholder so the failed issue is still visible in the database
                            await collection.insert_one(
                                {'banmianhao': 'empty',
                                 'banmianming': 'empty',
                                 'preTitle': 'empty',
                                 'title': 'empty',
                                 'subtitle': 'empty',
                                 'author': 'empty',
                                 'detail_url': url,
                                 'release_time': period_name,
                                 'insert_timestamp': datetime.today(),
                                 'content': 'empty'}
                            )
        else:
            # New layout (issues from 2024-12 on): build the period.xml URL for the month
            url = f'https://zqb.cyol.com/pc/layout/{month.strftime("%Y%m")}/period.xml'
            """https://zqb.cyol.com/pc/layout/202501/period.xml"""
            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url)
            async with AsyncClient(headers=headers, timeout=60) as client:
                response = await client.get(url)
                response.encoding = response.charset_encoding
                print(f"一级连接状态:{response.status_code}")
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'xml')
                    for period in soup.select("period"):
                        period_name = datetime.strptime(period.find("period_name").text.strip(), "%Y-%m-%d")
                        front_page = period.find("front_page").text.strip()
                        try:
                            url1 = f"https://zqb.cyol.com/pc/layout/{period_name.strftime('%Y%m/%d')}/{front_page}"
                            """https://zqb.cyol.com/pc/layout/202502/07/node_01.html"""
                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url1)
                            response2 = await client.get(url1)
                            response2.encoding = response2.charset_encoding
                            print(f"二级连接状态:{response2.status_code}")
                            if response2.status_code == 200:
                                soup2 = BeautifulSoup(response2.text, 'lxml')
                                for item in soup2.select("#pageLink"):
                                    # Page links read like "第01版：要闻"; split on the fullwidth colon
                                    banmianming = item.text.split("：")[-1]
                                    banmianhao = item.text.split("：")[0]
                                    url2 = (f"https://zqb.cyol.com/pc/layout/{period_name.strftime('%Y%m/%d')}/"
                                            + item.get("href").replace("./", "").strip())
                                    """https://zqb.cyol.com/pc/layout/202502/07/node_01.html"""
                                    print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url2)
                                    response3 = await client.get(url2)
                                    response3.encoding = response3.charset_encoding
                                    print(f"三级连接状态:{response3.status_code}")
                                    if response3.status_code == 200:
                                        soup3 = BeautifulSoup(response3.text, 'lxml')
                                        for item2 in soup3.select(".news-list a"):
                                            url3 = ("https://zqb.cyol.com/pc/"
                                                    + item2.get("href").replace("../", "").strip())
                                            """https://zqb.cyol.com/pc/content/202502/07/content_406551.html"""
                                            if await collection.find_one({"detail_url": url3}, {"_id": False}):
                                                continue
                                            # Fall back to the link text if the metadata comment lacks a title
                                            title = item2.text.strip()
                                            print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), url3)
                                            response4 = await client.get(url3)
                                            response4.encoding = response4.charset_encoding
                                            print(f"四级连接状态:{response4.status_code}")
                                            if response4.status_code == 200:
                                                soup4 = BeautifulSoup(response4.text, 'html.parser')
                                                # Defaults in case the metadata comment is missing or malformed
                                                perTitle = ""
                                                subTitle = ""
                                                author = ""
                                                try:
                                                    comments = soup4.find_all(
                                                        string=lambda text: isinstance(text, Comment))
                                                    for comment in comments:
                                                        if 'enpproperty' in comment:
                                                            # Parse the metadata comment
                                                            enpproperty_content = comment.strip()
                                                            inner_soup = BeautifulSoup(enpproperty_content,
                                                                                       'html.parser')
                                                            # Extract the metadata fields
                                                            title = inner_soup.find('title').text.strip()
                                                            perTitle = inner_soup.find('introtitle').text.strip()
                                                            subTitle = inner_soup.find("subtitle").text.strip()
                                                            author = inner_soup.find('author').text.strip()
                                                except Exception:
                                                    perTitle = ""
                                                    subTitle = ""
                                                    author = ""
                                                content = await getContent(soup4)
                                                await collection.insert_one({
                                                    "title": title,
                                                    "subtitle": subTitle,
                                                    "preTitle": perTitle,
                                                    "author": author,
                                                    "banmianming": banmianming,
                                                    "banmianhao": banmianhao,
                                                    'detail_url': url3,
                                                    'release_time': period_name,
                                                    'insert_timestamp': datetime.today(),
                                                    'content': content
                                                })
                                                crawl_num += 1
                                                print(
                                                    f"中国青年报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
                                                await asyncio.sleep(random.randint(5, 15))
                                        print(
                                            f"中国青年报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
                                        await asyncio.sleep(random.randint(5, 15))
                            print(
                                f"中国青年报---{period_name.strftime('%Y-%m-%d')}------采集完成!")
                            await asyncio.sleep(random.randint(5, 15))
                        except Exception as e:
                            print(e)
                            # Record a placeholder so the failed issue is still visible in the database
                            await collection.insert_one(
                                {'banmianhao': 'empty',
                                 'banmianming': 'empty',
                                 'preTitle': 'empty',
                                 'title': 'empty',
                                 'subtitle': 'empty',
                                 'author': 'empty',
                                 'keywordlist': 'empty',
                                 'detail_url': url,
                                 'release_time': period_name,
                                 'insert_timestamp': datetime.today(),
                                 'content': 'empty'}
                            )
    print(f"中国青年报采集完毕,共采集{crawl_num}条数据!")


if __name__ == '__main__':
    asyncio.run(main())