feat: add 中国民族报, 科技日报 and 人民邮电报 crawlers

皓月归尘 2024-11-10 00:17:12 +08:00
parent 3bfb57b662
commit 4cb5d4a5d3
5 changed files with 496 additions and 6 deletions


@@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/10 21:23
# @UpdateTime : 2024/11/10 21:23
# @Author : haochen zhong
# @File : CrawlKejiribao.py
# @Software : PyCharm
# @Comment : This program collects 科技日报 (Science and Technology Daily) data
import asyncio
import random
from datetime import datetime
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2011-01', '%Y-%m')
"""科技日报报2011年1月份开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# Connect to the database
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao']
collection = db['kejiribao']
async def main():
collection_names = await db.list_collection_names()
    # Check whether the collection already exists
if "kejiribao" not in collection_names:
        # If it does not exist, crawl everything from January 2011 onwards
print("科技日报数据表不存在,开始采集!")
await getData(start_date, end_date)
else:
        # If it exists, fetch the date of the most recent record and resume from there
last_record = await collection.find_one({}, sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:", last_date_str)
await getData(last_date_str, end_date)
async def getContent(soup: BeautifulSoup) -> str:
"""
    Extract the article body text from the #ozoom container.
    :param soup: BeautifulSoup object of an article page
    :return: article text, one paragraph per line
"""
content = ""
for p in soup.select("#ozoom p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
async def getData(start_date: datetime, end_date: datetime):
"""
    Crawl every issue from start_date to end_date, month by month.
    :param start_date: first month to crawl
    :param end_date: last month to crawl
:return: None
"""
crawl_num = 0
    # Build a list of the months to crawl
months = []
    # Add every month from start_date to end_date to the list
    current_date = start_date.replace(day=1)  # pin to day 1 so stepping to the next month can never hit an invalid date
while current_date <= end_date:
months.append(current_date)
        # Move to the next month
if current_date.month == 12:
current_date = current_date.replace(year=current_date.year + 1, month=1)
else:
current_date = current_date.replace(month=current_date.month + 1)
    # Iterate over the month list
for month in months:
        # Build the period.xml URL for this month
url = f'https://digitalpaper.stdaily.com/http_www.kjrb.com/kjrb/html/{month.strftime("%Y-%m")}/period.xml'
"""https://digitalpaper.stdaily.com/http_www.kjrb.com/kjrb/html/2011-10/period.xml"""
print(url)
async with AsyncClient(headers=headers, timeout=60) as client:
response = await client.get(url)
response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'xml')
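                # period.xml lists every issue date in the month together with the filename of its front page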
for period in soup.select("period"):
period_name = datetime.strptime(period.select_one("period_name").text.strip(), "%Y-%m-%d")
front_page = period.select_one("front_page").text.strip()
try:
url1 = f"https://digitalpaper.stdaily.com/http_www.kjrb.com/kjrb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
"""https://digitalpaper.stdaily.com/http_www.kjrb.com/kjrb/html/2024-10/30/node_2.htm"""
print(url1)
response2 = await client.get(url1)
response2.encoding = response2.charset_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, 'lxml')
for item in soup2.select("#pageLink"):
                                # #pageLink text is assumed to look like "第01版:要闻": page number, full-width colon, page name
                                banmianming = item.text.split(":")[-1]
                                banmianhao = item.text.split(":")[0]
url2 = f"https://digitalpaper.stdaily.com/http_www.kjrb.com/kjrb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get(
"href").replace("./", "").strip()
print(url2)
response3 = await client.get(url2)
response3.encoding = response3.charset_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, 'lxml')
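                                    # Two list layouts are handled: when .ul02_l yields article links, the headline fields sit in table cells (long selectors below); otherwise links come from .title and the fields use the named classes .biaoti/.futi/.yinti/.autor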
pages = soup3.select(".ul02_l a")
if pages:
titleTag = "body > table:nth-child(2) > tbody > tr:nth-child(1) > td:nth-child(2) > table:nth-child(3) > tbody > tr:nth-child(1) > td > table > tbody > tr > td > strong"
subTitleTag = "body > table:nth-child(2) > tbody > tr:nth-child(1) > td:nth-child(2) > table:nth-child(3) > tbody > tr:nth-child(1) > td > table > tbody > tr > td > span:nth-child(5)"
preTitleTag = "body > table:nth-child(2) > tbody > tr:nth-child(1) > td:nth-child(2) > table:nth-child(3) > tbody > tr:nth-child(1) > td > table > tbody > tr > td > span:nth-child(1)"
authorTag = "body > table:nth-child(2) > tbody > tr:nth-child(1) > td:nth-child(2) > table:nth-child(3) > tbody > tr:nth-child(1) > td > table > tbody > tr > td > span:nth-child(7)"
else:
pages = soup3.select(".title a")
titleTag = ".biaoti"
subTitleTag = ".futi"
preTitleTag = ".yinti"
authorTag = ".autor"
for item2 in pages:
url3 = f"https://digitalpaper.stdaily.com/http_www.kjrb.com/kjrb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get(
"href")
"""https://digitalpaper.stdaily.com/http_www.kjrb.com/kjrb/html/2024-10/30/content_579753.htm?div=-1"""
if await collection.find_one({"detail_url": url3}, {"_id": False}):
continue
title = item2.text.strip()
print(url3)
response4 = await client.get(url3)
response4.encoding = response4.charset_encoding
print(f"四级连接状态:{response4.status_code}")
if response4.status_code == 200:
soup4 = BeautifulSoup(response4.text, 'lxml')
try:
title = soup4.select_one(titleTag).text.strip()
except:
                                                pass  # keep the headline taken from the page index
try:
subTitle = soup4.select(subTitleTag)[0].text.strip()
except:
subTitle = ""
try:
author = soup4.select_one(authorTag).text.strip()
except:
author = ""
try:
perTitle = soup4.select(preTitleTag)[-1].text.strip()
except:
perTitle = ""
try:
keywordlist = soup4.find("founder-keyword").text.strip()
except:
keywordlist = ""
content = await getContent(soup4)
await collection.insert_one({
"title": title,
"subtitle": subTitle,
"preTitle": perTitle,
"author": author,
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': keywordlist,
'detail_url': url3,
'release_time': period_name,
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(
f"科技日报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(
f"科技日报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"科技日报---{period_name.strftime('%Y-%m-%d')}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
print(e)
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': period_name,
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
print(f"科技日报采集完毕,共采集{crawl_num}条数据!")
if __name__ == "__main__":
    asyncio.run(main())


@@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/11 21:41
# @UpdateTime : 2024/11/11 21:41
# @Author : haochen zhong
# @File : CrawlRenminyoudianbao.py
# @Software : PyCharm
# @Comment : This program collects 人民邮电报 (People's Posts and Telecommunications News) data
import asyncio
import random
from datetime import datetime
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2017-09', '%Y-%m')
"""中国改革报2017年9月份开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
"connection":'keep-alive',
"host":"rmydb.cnii.com.cn",
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# Connect to the database
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao']
collection = db['renminyoudianbao']
async def main():
collection_names = await db.list_collection_names()
    # Check whether the collection already exists
if "renminyoudianbao" not in collection_names:
        # If it does not exist, crawl everything from September 2017 onwards
print("人民邮电报数据表不存在,开始采集!")
await getData(start_date, end_date)
else:
        # If it exists, fetch the date of the most recent record and resume from there
last_record = await collection.find_one({}, sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:", last_date_str)
await getData(last_date_str, end_date)
async def getContent(soup: BeautifulSoup) -> str:
"""
    Extract the article body text from the #ozoom container.
    :param soup: BeautifulSoup object of an article page
    :return: article text, one paragraph per line
"""
content = ""
for p in soup.select("#ozoom p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
async def getData(start_date: datetime, end_date: datetime):
"""
    Crawl every issue from start_date onwards.
    :param start_date: first issue date to crawl
    :param end_date: kept for interface symmetry; not used by this crawler
:return: None
"""
crawl_num = 0
start_date = int(start_date.strftime("%Y%m%d"))
async with AsyncClient(headers=headers, timeout=60) as client:
response = await client.get("https://rmydb.cnii.com.cn/period/yearMonthDay.json")
response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
data = response.json()
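            # yearMonthDay.json maps year -> month -> list of issue dates in YYYYMMDD form; flatten it into a single list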
dayList = []
for value in data.values():
for item in value.values():
dayList += item
dayList.sort()
dayList = list(filter(lambda x: x >= start_date, list(map(int, dayList))))
for day in dayList:
try:
url = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/data.json"
print(url)
response = await client.get(url)
response.encoding = response.charset_encoding
print(f"二级连接状态:{response.status_code}")
if response.status_code == 200:
data = response.json()
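                        # data.json describes one issue: each entry is a page (pageName/pageNo) with its article list (onePageArticleList)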
for item in data:
banmianming = item["pageName"]
banmianhao = f"{item['pageNo']}"
for article in item["onePageArticleList"]:
title = article["mainTitle"]
url2 = f"https://rmydb.cnii.com.cn/html/{day.__str__()[:4]}/{day}/{day}_{item['pageNo']}/{article['articleHref']}"
"""https://rmydb.cnii.com.cn/html/2024/20240906/20240906_001/20240906_001_01_523.html"""
author = article["articleAuthor"]
if await collection.find_one({"detail_url": url2}, {"_id": False}):
continue
print(url2)
response2 = await client.get(url2)
response2.encoding = response2.charset_encoding
print(f"三级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup = BeautifulSoup(response2.text, "lxml")
preTitle = soup.select_one("#PreTitle").text
title = soup.select_one("#Title").text
subTitle = soup.select_one("#SubTitle").text
content = await getContent(soup)
await collection.insert_one({
"title": title,
"subtitle": subTitle,
"preTitle": preTitle,
"author": author,
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': "empty",
'detail_url': url2,
'release_time': datetime.strptime(str(day), "%Y%m%d"),
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(f"人民邮电报---{day}---{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"人民邮电报---{day}---{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"人民邮电报---{day}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
print(e)
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': datetime.strptime(str(day), "%Y%m%d"),
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
print(f"人民邮电报采集完毕,共采集{crawl_num}条数据!")
if __name__ == "__main__":
    asyncio.run(main())


@@ -8,7 +8,7 @@
import asyncio
import random
-from datetime import datetime, timedelta
+from datetime import datetime
from bs4 import BeautifulSoup
from httpx import AsyncClient
@@ -279,4 +279,5 @@ async def getData(start_date: datetime, end_date: datetime):
)
print(f"中国教育报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main())


@@ -0,0 +1,142 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/12 22:45
# @UpdateTime : 2024/11/12 22:45
# @Author : haochen zhong
# @File : CrawlZhongguominzubao.py
# @Software : PyCharm
# @Comment : This program collects 中国民族报 (China Ethnic News) data
import asyncio
import random
from datetime import datetime
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2022-01', '%Y-%m')
"""中国民族报2022年1月份开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# Connect to the database
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao']
collection = db['zhongguominzubao']
async def main():
collection_names = await db.list_collection_names()
    # Check whether the collection already exists
    if "zhongguominzubao" not in collection_names:
        # If it does not exist, crawl everything from January 2022 onwards
        print("中国民族报数据表不存在,开始采集!")
await getData(start_date, end_date)
else:
        # If it exists, fetch the date of the most recent record and resume from there
last_record = await collection.find_one({}, sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:", last_date_str)
await getData(last_date_str, end_date)
async def getData(start_date: datetime, end_date: datetime):
"""
    Crawl every issue from start_date to end_date, month by month.
    :param start_date: first month to crawl
    :param end_date: last month to crawl
:return: None
"""
crawl_num = 0
    # Build a list of the months to crawl
months = []
    # Add every month from start_date to end_date to the list
    current_date = start_date.replace(day=1)  # pin to day 1 so stepping to the next month can never hit an invalid date
while current_date <= end_date:
months.append(current_date)
        # Move to the next month
if current_date.month == 12:
current_date = current_date.replace(year=current_date.year + 1, month=1)
else:
current_date = current_date.replace(month=current_date.month + 1)
    # Iterate over the month list
async with AsyncClient(headers=headers, timeout=60) as client:
for month in months:
            # Endpoint that lists the issue dates for a given month
url = "http://210.12.104.26:81/reader/layout/getSZBDatePub.do"
try:
response = await client.post(url, params={"sj": month.strftime("%Y.%m")})
response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
data = response.json()
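                    # getSZBDatePub.do returns the issue dates of the requested month as slash-separated date strings; slashes are stripped before reuse as docPubTime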
for item in data:
url2 = "http://210.12.104.26:81/reader/layout/findBmMenuPub.do"
response2 = await client.post(url2, params={"docPubTime": item.replace("/", "")})
response2.encoding = response2.charset_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
data2 = response2.json()
for item2 in data2:
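                                # BM is the page name (版面名), BC the page number (版次); IRCATELOG identifies the page for getBmDetailPub.do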
banmianming = item2["BM"]
banmianhao = item2["BC"]
url3 = f"http://210.12.104.26:81/reader/layout/getBmDetailPub.do?bc={item2['IRCATELOG']}&docpubtime={item.replace('/', '')}"
print(url3)
response3 = await client.get(url3)
response3.encoding = response3.charset_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
data3 = response3.json()
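                                    # Each article entry carries a ZB_GUID, which detailData.do needs to return the full text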
for item3 in data3:
url4 = "http://210.12.104.26:81/reader/layout/detailData.do"
response4 = await client.post(url4, params={"guid": item3['ZB_GUID']})
response4.encoding = response4.charset_encoding
print(f"四级连接状态:{response4.status_code}")
if response4.status_code == 200:
data4 = response4.json()
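                                            # detailData.do returns the title fields and body as HTML fragments; BeautifulSoup strips the markup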
title = BeautifulSoup(data4['docTitle'],"lxml").text
subTitle = BeautifulSoup(data4['fb'],"lxml").text
preTitle = BeautifulSoup(data4['yt'],"lxml").text
author = data4['docAuthor']
content = BeautifulSoup(data4["content"], "lxml").text
await collection.insert_one({
"title": title,
"subtitle": subTitle,
"preTitle": preTitle,
"author": author,
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': "empty",
'detail_url': f"http://210.12.104.26:81/epaper/?id={item3['ZB_GUID']}&time={item.replace('/', '')}",
'release_time': datetime.strptime(data4["docPubTime"],
"%Y/%m/%d %H:%M:%S"),
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(
f"中国民族报---{item}---{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"中国民族报---{item}---{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"中国民族报---{item}----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
print(e)
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': month,
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
    print(f"中国民族报采集完毕,共采集{crawl_num}条数据!")


if __name__ == "__main__":
    asyncio.run(main())