初始化仓库 (Initialize repository)

This commit is contained in:
皓月归尘 2024-11-09 17:00:30 +08:00
commit 3bfb57b662
17 changed files with 2201 additions and 0 deletions

52
.gitignore vendored Normal file
View File

@@ -0,0 +1,52 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.pyc
*.pyo
*.pyd
config.json
# Virtual environment
venv/
env/
.venv/
.venv3/
.Python
*.sqlite3
# IDE-specific files
.idea/
.vscode/
# Compiled source
*.com
*.class
*.dll
*.exe
*.o
*.so
# Logs and databases
*.log
*.sql
*.sqlite
# Output files
dist/
build/
*.egg-info/
*.egg
# OS-specific files
.DS_Store
Thumbs.db
# Miscellaneous
*.bak
*.swp
*.tmp
*.tmp.*
*.~*
# Jupyter Notebook
.ipynb_checkpoints/

BIN
requirements.txt Normal file

Binary file not shown.

View File

@@ -0,0 +1,186 @@
# _*_ coding : UTF-8 _*_
# @Time : 2024/11/06 21:35
# @UpdateTime : 2024/11/06 21:35
# @Author : haochen zhong
# @File : CrawlZhongguogaigebao.py
# @Software : PyCharm
# @Comment : 本程序采集中国改革报版面数据
import asyncio
import random
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2017-09', '%Y-%m')
"""中国改革报2017年9月份开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 链接数据库
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao']
collection = db['zhongguogaigebao']
async def main():
collection_names = await db.list_collection_names()
# 判断数据表是否存在
if "zhongguogaigebao" not in collection_names:
# 如果不存在则从2017年9月开始爬取
print("中国改革报数据表不存在,开始采集!")
await getData(start_date, end_date)
else:
# 如果存在,则从数据库中获取最后一条记录的日期
last_record = await collection.find_one({}, sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:",last_date_str)
await getData(last_date_str, end_date)
async def getContent(soup: BeautifulSoup) -> str:
"""
:param soup: BeautifulSoup对象
:return: 文章内容
"""
content = ""
for p in soup.select("#ozoom p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
async def getData(start_date: datetime, end_date: datetime):
"""
:param start_date: 开始日期
:param end_date: 结束日期
:return: None
"""
crawl_num = 0
# 创建一个列表保存月份
months = []
# 从开始日期到结束日期,每个月份都添加到列表中
current_date = start_date
while current_date <= end_date:
months.append(current_date)
# 增加一个月
if current_date.month == 12:
current_date = current_date.replace(year=current_date.year + 1, month=1)
else:
current_date = current_date.replace(month=current_date.month + 1)
# 遍历月份列表
for month in months:
# 构造URL
url = f'http://www.cfgw.net.cn/epaper/{month.strftime("%Y%m")}/period.xml'
"""http://www.cfgw.net.cn/epaper/201709/period.xml"""
print(url)
async with AsyncClient(headers=headers, timeout=60) as client:
# 发送GET请求
response = await client.get(url)
response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
# 解析XML
soup = BeautifulSoup(response.text, 'xml')
for period in soup.find_all("period"):
try:
period_id = period.get("id")
url1 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/node_01.htm"
"""http://www.cfgw.net.cn/epaper/201709/05/node_01.htm"""
print(url1)
response2 = await client.get(url1)
response2.encoding = response2.charset_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, 'lxml')
for item in soup2.select(".posRelative>a"):
url2 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/" + item.get(
"href")
"""http://www.cfgw.net.cn/epaper/201709/05/node_01/node_01.htm"""
banmianming = item.text.split(":")[-1]
banmianhao = item.text.split(":")[0]
print(url2)
response3 = await client.get(url2)
response3.encoding = response3.charset_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, 'lxml')
for item2 in soup3.select("#articlelist > .clearfix > a"):
url3 = f"http://www.cfgw.net.cn/epaper/" + item2.get("href")[6:]
if await collection.find_one({"detail_url": url3}, {"_id": False}):
continue
title = item2.text.strip()
print(url3)
response4 = await client.get(url3)
response4.encoding = response4.charset_encoding
print(f"四级连接状态:{response4.status_code}")
if response4.status_code == 200:
soup4 = BeautifulSoup(response4.text, 'lxml')
try:
title = soup4.select("#Title")[0].text.strip()
except:
title = title
try:
subtitle = soup4.select("#SubTitle")[0].text.strip()
except:
subtitle = ""
try:
preTitle = soup4.select("#PreTitle")[0].text.strip()
except:
preTitle = ""
try:
author = soup4.find("author").text.strip()
except:
author = ""
try:
keyword = soup4.find("keyword").text.strip()
except:
keyword = ""
content = await getContent(soup4)
await collection.insert_one({
"title": title,
"subtitle": subtitle,
"preTitle": preTitle,
"author": author,
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': keyword,
'detail_url': url3,
'release_time': month + timedelta(days=int(period_id)-1),
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': month + timedelta(days=int(period_id)),
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
print(e)
print(f"中国改革报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main())
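
Each crawler in this commit deduplicates by running find_one({"detail_url": ...}) before every insert. Below is a minimal sketch (assuming the same local MongoDB and the collection used above; not part of the original commit) that adds an index so those lookups stay cheap as the collection grows:

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

async def ensure_dedup_index():
    # Same connection string and collection as CrawlZhongguogaigebao.py above.
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    collection = client["buweijiguanbao"]["zhongguogaigebao"]
    # An index on detail_url turns the per-article find_one check into an index lookup
    # instead of a collection scan. unique=True would also reject duplicates outright,
    # but the error-placeholder documents reuse the listing URL as detail_url, so a
    # plain index is the safer default here.
    await collection.create_index("detail_url")

asyncio.run(ensure_dedup_index())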

View File

@@ -0,0 +1,282 @@
# _*_ coding : UTF-8 _*_
# @Time : 2024/11/08 21:42
# @UpdateTime : 2024/11/08 21:42
# @Author : haochen zhong
# @File : CrawlZhongguojiaoyubao.py
# @Software : PyCharm
# @Comment : 本程序采集中国教育报数据
import asyncio
import random
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2022-01', '%Y-%m')
"""中国教育报2022年1月份开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 链接数据库
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao']
collection = db['zhongguojiaoyubao']
async def main():
collection_names = await db.list_collection_names()
# 判断数据表是否存在
if "zhongguojiaoyubao" not in collection_names:
# 如果不存在则从2022年1月开始爬取
print("中国教育报数据表不存在,开始采集!")
await getData(start_date, end_date)
else:
# 如果存在,则从数据库中获取最后一条记录的日期
last_record = await collection.find_one({}, sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:", last_date_str)
await getData(last_date_str, end_date)
async def getContent(soup: BeautifulSoup) -> str:
"""
:param soup: BeautifulSoup对象
:return: 文章内容
"""
content = ""
for p in soup.select(".content_tt p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
async def getData(start_date: datetime, end_date: datetime):
"""
:param start_date: 开始日期
:param end_date: 结束日期
:return: None
"""
crawl_num = 0
# 创建一个列表保存月份
months = []
# 从开始日期到结束日期,每个月份都添加到列表中
current_date = start_date
while current_date <= end_date:
months.append(current_date)
# 增加一个月
if current_date.month == 12:
current_date = current_date.replace(year=current_date.year + 1, month=1)
else:
current_date = current_date.replace(month=current_date.month + 1)
# 遍历月份列表
for month in months:
# 构造URL
url = f'http://paper.jyb.cn/zgjyb/html/{month.strftime("%Y-%m")}/period.xml'
"""http://paper.jyb.cn/zgjyb/html/2023-01/period.xml"""
print(url)
async with AsyncClient(headers=headers, timeout=60) as client:
response = await client.get(url)
response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'xml')
for period in soup.select("period"):
period_name = datetime.strptime(period.find("period_name").text.strip(), "%Y-%m-%d")
front_page = period.find("front_page").text.strip()
try:
url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
print(url1)
response2 = await client.get(url1)
response2.encoding = response2.charset_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, 'lxml')
for item in soup2.select(".right_title-name a"):
banmianming = item.text.split(":")[-1]
banmianhao = item.text.split(":")[0]
url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get(
"href").replace("./","").strip()
print(url2)
response3 = await client.get(url2)
response3.encoding = response3.charset_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, 'lxml')
for item2 in soup3.select("#titleList1 a"):
url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get(
"href")
if await collection.find_one({"detail_url": url3}, {"_id": False}):
continue
title = item2.text.strip()
print(url3)
response4 = await client.get(url3)
response4.encoding = response4.charset_encoding
print(f"四级连接状态:{response4.status_code}")
if response4.status_code == 200:
soup4 = BeautifulSoup(response4.text, 'lxml')
try:
title = soup4.select_one(".title1").text.strip()
except:
title = title
try:
subTitle = soup4.select(".title2")[0].text.strip()
except:
subTitle = ""
try:
author = soup4.select_one(".title3").text.strip()
except:
author = ""
try:
perTitle = soup4.select(".title2")[-1].text.strip()
except:
perTitle = ""
try:
keywordlist = soup4.find("founder-keyword").text.strip()
except:
keywordlist = ""
content = await getContent(soup4)
await collection.insert_one({
"title": title,
"subtitle": subTitle,
"preTitle": perTitle,
"author": author,
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': keywordlist,
'detail_url': url3,
'release_time': period_name,
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
print(e)
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': period_name,
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
else:
url = f"http://paper.jyb.cn/zgjyb/html/{month.strftime('%Y-%m')}/navi.xml"
response = await client.get(url)
response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'xml')
for period in soup.select("calendar"):
period_name = datetime.strptime(period.find("date").text.strip(), "%Y-%m-%d")
front_page = period.find("url").text.strip()[6:]
try:
url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
print(url1)
response2 = await client.get(url1)
response2.encoding = response2.charset_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, 'lxml')
for item in soup2.select(".right_title-name a"):
banmianming = item.text.split(":")[-1]
banmianhao = item.text.split(":")[0]
url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get(
"href").replace("./","").strip()
print(url2)
response3 = await client.get(url2)
response3.encoding = response3.charset_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, 'lxml')
for item2 in soup3.select("#titleList1 a"):
url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get(
"href")
if await collection.find_one({"detail_url": url3}, {"_id": False}):
continue
title = item2.text.strip()
print(url3)
response4 = await client.get(url3)
response4.encoding = response4.charset_encoding
print(f"四级连接状态:{response4.status_code}")
if response4.status_code == 200:
soup4 = BeautifulSoup(response4.text, 'lxml')
try:
title = soup4.select(".article-title")[0].text.strip()
except:
title = title
try:
subTitle = soup4.select(".article-subtitle")[0].text.strip()
except:
subTitle = ""
try:
author = soup4.select(".article-author")[0].text.strip()
except:
author = ""
try:
perTitle = soup4.select(".article-pretitle")[0].text.strip()
except:
perTitle = ""
try:
keywordlist = soup4.find("founder-keyword").text.strip()
except:
keywordlist = ""
content = await getContent(soup4)
await collection.insert_one({
"title": title,
"subtitle": subTitle,
"preTitle": perTitle,
"author": author,
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': keywordlist,
'detail_url': url3,
'release_time': period_name,
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time':period_name ,
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
print(f"中国教育报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main())
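
The month-stepping while-loop above appears verbatim in the monthly crawlers; here it is as a stand-alone sketch (the helper name month_range is illustrative, not from the original), with one caveat worth noting:

from datetime import datetime
from typing import List

def month_range(start: datetime, end: datetime) -> List[datetime]:
    """Same stepping logic as the inline loop in getData(): one datetime per month
    from start to end, advanced by replacing the year/month fields."""
    months = []
    current = start
    while current <= end:
        months.append(current)
        if current.month == 12:
            current = current.replace(year=current.year + 1, month=1)
        else:
            current = current.replace(month=current.month + 1)
    return months

# month_range(datetime(2022, 1, 1), datetime(2022, 4, 1)) yields Jan-Apr 2022.
# Caveat: a start date taken from the database keeps its day-of-month, and
# datetime.replace() raises ValueError when that day does not exist in the next
# month (e.g. resuming from a Jan 31 release_time steps into February).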

View File

@@ -0,0 +1,185 @@
# _*_ coding : UTF-8 _*_
# @Time : 2024/11/08 00:07
# @UpdateTime : 2024/11/08 00:07
# @Author : haochen zhong
# @File : CrawlZhongguojingjidaobao.py
# @Software : PyCharm
# @Comment : 本程序采集中国经济导报数据
import asyncio
import random
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2012-09', '%Y-%m')
"""中国经济导报2012年9月份开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 链接数据库
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['buweijiguanbao']
collection = db['zhongguojingjidaobao']
async def main():
collection_names = await db.list_collection_names()
# 判断数据表是否存在
if "zhongguojingjidaobao" not in collection_names:
# 如果不存在则从2012年9月开始爬取
print("中国经济导报数据表不存在,开始采集!")
await getData(start_date, end_date)
else:
# 如果存在,则从数据库中获取最后一条记录的日期
last_record = await collection.find_one({}, sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:", last_date_str)
await getData(last_date_str, end_date)
async def getContent(soup: BeautifulSoup) -> str:
"""
:param soup: BeautifulSoup对象
:return: 文章内容
"""
content = ""
for p in soup.select("#pgcontent"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
async def getData(start_date: datetime, end_date: datetime):
"""
:param start_date: 开始日期
:param end_date: 结束日期
:return: None
"""
crawl_num = 0
# 创建一个列表保存月份
months = []
# 从开始日期到结束日期,每个月份都添加到列表中
current_date = start_date
while current_date <= end_date:
months.append(current_date)
# 增加一个月
if current_date.month == 12:
current_date = current_date.replace(year=current_date.year + 1, month=1)
else:
current_date = current_date.replace(month=current_date.month + 1)
# 遍历月份列表
for month in months:
# 构造URL
url = f'http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime("%Y/%m")}/date.txt'
"""http://www.ceh.com.cn/epaper/uniflows/html/2012/09/date.txt"""
print(url)
async with AsyncClient(headers=headers, timeout=60) as client:
# 发送GET请求
response = await client.get(url)
response.encoding = "gb2312"
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
# 解析日期列表文本(按 "|" 分隔)
soup = response.text.split("|")
for period in soup:
period_id, element = period.split(",")
if len(element) < 5:
continue
try:
url1 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/boardurl.htm"
"""http://www.ceh.com.cn/epaper/uniflows/html/2012/09/01/boardurl.htm"""
print(url1)
response2 = await client.get(url1)
response2.encoding = "gb2312"
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, 'lxml')
for item in soup2.select(".board_link td>a"):
url2 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + item.get(
"href")
"""http://www.ceh.com.cn/epaper/uniflows/html/2024/11/07/01/default.htm"""
banmianming = item.text.split(":")[-1].strip()
banmianhao = item.text.split(":")[0].replace("&nbsp;", "").replace(" ", "").strip()
print(url2)
response3 = await client.get(url2)
response3.encoding = "gb2312"
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, 'lxml')
for item2 in soup3.select("#mp_32"):
url3 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + \
item.get("href").split("/")[0] + "/" + item2.get("href")
if await collection.find_one({"detail_url": url3}, {"_id": False}):
continue
title = item2.text.strip()
print(url3)
response4 = await client.get(url3)
response4.encoding = "gb2312"
print(f"四级连接状态:{response4.status_code}")
if response4.status_code == 200:
soup4 = BeautifulSoup(response4.text, 'lxml')
try:
title = soup4.select(".content_title")[0].text.strip()
except:
title = title
try:
subtitle = soup4.select(".subtitle")[0].text.strip()
except:
subtitle = ""
try:
preTitle = soup4.select(".yinti_title")[0].text.strip()
except:
preTitle = ""
try:
author = soup4.select(".others")[0].text.strip()
except:
author = ""
content = await getContent(soup4)
await collection.insert_one({
"title": title,
"subtitle": subtitle,
"preTitle": preTitle,
"author": author,
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': 'empty',
'detail_url': url3,
'release_time': month + timedelta(days=int(period_id) - 1),
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': month + timedelta(days=int(period_id)),
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
print(e)
print(f"中国经济导报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main())
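
All four request levels above share one httpx AsyncClient; below is a small retry wrapper (a sketch only — get_with_retry and its parameters are illustrative, not part of the original script) that could shield those calls from transient network errors:

import asyncio
import random

from httpx import AsyncClient, HTTPError

async def get_with_retry(client: AsyncClient, url: str, retries: int = 3):
    """Fetch url with the shared AsyncClient, retrying on httpx errors with the same
    5-15 second random pause the crawlers already use between pages."""
    for attempt in range(retries):
        try:
            return await client.get(url)
        except HTTPError:
            if attempt == retries - 1:
                raise
            await asyncio.sleep(random.randint(5, 15))

# Usage inside getData(): response = await get_with_retry(client, url)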

View File

@@ -0,0 +1,145 @@
# _*_ coding : UTF-8 _*_
# @Time : 2024/11/08 20:29
# @UpdateTime : 2024/11/08 20:29
# @Author : haochen zhong
# @File : CrawlAnhui.py
# @Software : PyCharm
# @Comment : 本程序采集安徽日报数字报数据
import asyncio
import random
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
start_date = datetime.strptime('2017-09-29', '%Y-%m-%d')
"""安徽日报报2018年09月29日开始有数据"""
end_date = datetime.today()
"""截止到今天"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 链接数据库
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['dfdm_sjribao']
collection = db['anhuiribao']
async def main():
collection_names = await db.list_collection_names()
# 判断数据表是否存在
if "anhuiribao" not in collection_names:
# 如果不存在则从2017年9月开始爬取
print("安徽日报报数据表不存在,开始采集!")
await getData(start_date, end_date)
else:
# 如果存在,则从数据库中获取最后一条记录的日期
last_record = await collection.find_one({}, sort=[('release_time', -1)])
last_date_str = last_record['release_time']
print("数据库截止时间:", last_date_str)
await getData(last_date_str, end_date)
async def getContent(soup: BeautifulSoup) -> str:
"""
:param soup: BeautifulSoup对象
:return: 文章内容
"""
content = ""
for p in soup.select(".content p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
async def getData(start_date: datetime, end_date: datetime):
crawl_num = 0
for i in range((end_date - start_date).days):
date_now = start_date + timedelta(days=i + 1)
date_now_s = date_now.strftime('%Y%m/%d')
base_url = "https://szb.ahnews.com.cn/ahrb/layout/" + date_now_s + '/'
url = base_url + 'node_01.html'
"""https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html"""
try:
async with AsyncClient(headers=headers, timeout=60) as client:
print(url)
response = await client.get(url)
response.encoding = response.charset_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'lxml')
for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
banmianming = item.text.split(":")[-1].strip()
banmianhao = item.text.split(":")[0].replace("&nbsp;", "").replace(" ", "").strip()
url1 = base_url + item.get("href")
print(url1)
response2= await client.get(url1)
response2.encoding = response2.charset_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, 'lxml')
for item2 in soup2.select(".newslist a"):
url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
"""https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
if await collection.find_one({"detail_url": url2}, {"_id": False}):
continue
title = item2.text.strip()
print(url2)
response3 = await client.get(url2)
response3.encoding = response3.charset_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, 'lxml')
content = await getContent(soup3)
try:
title = soup3.select(".newsdetatit h3")[0].text.strip()
except:
title = title
try:
subTitle= soup3.select(".newsdetatext p")[0].text.strip()
except:
subTitle = ""
await collection.insert_one({
"title": title,
"subtitle": subTitle,
"preTitle": "",
"author": "",
"banmianming": banmianming,
"banmianhao": banmianhao,
'keywordlist': 'empty',
'detail_url': url2,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': content
})
crawl_num += 1
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
print(f"安徽日报---{date_now_s}-----采集完成!")
await asyncio.sleep(random.randint(5, 15))
except Exception as e:
print(e)
await collection.insert_one(
{'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': 'empty'}
)
print(f"安徽日报采集完毕,共采集{crawl_num}条数据!")
asyncio.run(main())

View File

@@ -0,0 +1,140 @@
# _*_ coding : UTF-8 _*_
# @Time : 2022/12/27 14:15
# @UpdateTime : 2023/11/08 16:30
# @Author : Haochen Zhong
# @File : CrawlGuizhou.py
# @Software : PyCharm
# @Comment : 本程序采集贵州日报数字报板面数据
import random
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup
# 数据库起止时间
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
"""贵州日报数字报2022-01-01开始有数据纪录"""
end_date = datetime.today()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
guizhouribao = mydb.guizhouribao
# 设置随机时间
sleeptime = random.randint(2, 15)
def main():
# 判断数据库是否存在
collist = mydb.list_collection_names()
if "guizhouribao" in collist: # 检测集合是否存在
print("贵州集合存在,更新数据库")
# 数据库最新一条内容的时间
db_time = guizhouribao.find_one(sort=[('release_time', -1)])[
'release_time'] # 或者find().sort('_id', -1).limit(1)
print('数据库截止时间%s' % db_time)
# 输入更新数据库时间
input_time = datetime.today()
if db_time < input_time:
getData(db_time, input_time)
else:
print('数据库无需更新')
else:
# 爬取网页并建立数据库
print("数据库不存在,建立数据库!")
getData(start_date, end_date)
def get_content(soup3):
content = ""
for p in soup3.select("#ozoom p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
def getData(start_date, end_date):
crawl_num = 0
for i in range((end_date - start_date).days):
date_now = start_date + timedelta(days=i + 1)
date_now_s = date_now.strftime('%Y%m/%d')
base_url = "http://szb.gzrbs.com.cn/pc/layout/" + date_now_s + "/"
url = base_url + "node_01.html"
# http://szb.gzrbs.com.cn/pc/layout/202201/01/node_01.html
try:
response = requests.get(url=url, headers=headers, timeout=(30, 45))
response.encoding = response.apparent_encoding
print(f"一级连接状态:{response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".btn-block"):
banmianming = item.text.split(":")[-1]
banmianhao = item.text.split(":")[0]
url1 = base_url + item.get("href")
response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
response2.encoding = response2.apparent_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, "lxml")
for item2 in soup2.select(".resultList a"):
title = item2.text.strip()
url2 = "http://szb.gzrbs.com.cn/pc/" + item2.get("href")[9:]
# http://szb.gzrbs.com.cn/pc/cont/202201/02/content_42202.html
response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
response3.encoding = response3.apparent_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, "lxml")
try:
title = soup3.select("#Title")[0].text.strip()
except:
title = title
try:
subtitle = soup3.select("#SubTitle")[0].text.strip()
except:
subtitle = ""
try:
preTitle = soup3.select("#PreTitle")[0].text.strip()
except:
preTitle = ""
content = get_content(soup3)
guizhouribao.insert_one({'banmianhao': banmianhao,
'banmianming': banmianming,
'preTitle': preTitle,
'title': title,
'subtitle': subtitle,
'author': '',
'keywordlist': 'empty',
'detail_url': url2,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': content})
crawl_num += 1
print(f"贵州日报-{date_now_s}-{banmianming}-{title}-已完成")
time.sleep(sleeptime)
print(f"贵州日报-{date_now_s}-{banmianming}-已完成")
time.sleep(sleeptime)
print(f"贵州日报-{date_now_s}-已完成")
except Exception as result:
guizhouribao.insert_one({'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': 'empty'})
print(result)
print(f"贵州日报采集完毕,共采集{crawl_num}条数据!")
if __name__ == '__main__':
main()
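
The synchronous crawlers from this file onward call requests.get directly, opening a fresh connection for every page. A sketch of reusing one requests.Session with the same headers and timeouts (an optional refactor, not part of the original commit):

import requests

# One Session keeps the TCP connection and default headers across the three request
# levels instead of re-sending them on every call.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42"
})

def fetch(url: str) -> requests.Response:
    """Drop-in for the requests.get(url=..., headers=headers, timeout=(30, 45)) calls."""
    response = session.get(url, timeout=(30, 45))
    response.encoding = response.apparent_encoding
    return response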

View File

@@ -0,0 +1,162 @@
# _*_ coding : UTF-8 _*_
# @Time : 2024-01-17 14:24:59
# @Author : haochen zhong
# @File : CrawlHainan.py
# @Software : PyCharm
# @Comment :
import re
from bs4 import BeautifulSoup
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random
# 数据库起止时间
start_date = datetime.strptime('2008-02-29', '%Y-%m-%d')
end_date = datetime.today()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
hainanribao = mydb.hainanribao
def main():
# 判断数据库是否存在
collist = mydb.list_collection_names()
if "hainanribao" in collist: # 检测集合是否存在
print("海南日报集合存在,更新数据库")
# 数据库最新一条内容的时间
db_time = hainanribao.find_one(sort=[('release_time', -1)])['release_time']
print(f'数据库截止时间{db_time}')
# 输入更新数据库时间
input_time = datetime.today()
if db_time < input_time:
getData(db_time, input_time)
else:
print('数据库无需更新')
else:
print("数据库不存在,建立数据库")
# 爬取网页并建立数据库
getData(start_date, end_date)
# 解析网页正文
def parse_html_text(soup):
"""
:param html: html字符串
:return: 正文 string
"""
content = ''
if soup.select('#ozoom'):
content = soup.select('#ozoom')[0].text.strip()
return content
def parse_subtitle(soup):
item = soup.select('.font02')
if re.findall(r'article-subtitle>-->(.*?)<!--', str(item)):
subtitle = re.findall(r'article-subtitle>-->(.*?)<!--', str(item))[0]
else:
subtitle = ''
return subtitle
def parse_h3title(soup):
item = soup.select('.font02')
if re.findall(r'article-pretitle>-->(.*?)<!--', str(item)):
h3title = re.findall(r'article-pretitle>-->(.*?)<!--', str(item))[0]
else:
h3title = ''
return h3title
def parse_author(soup):
item = soup.select('.font02')
if re.findall(r'article-author>-->(.*?)<!--', str(item)):  # 'article-author' assumed by analogy with the subtitle/pretitle patterns
author = re.findall(r'article-author>-->(.*?)<!--', str(item))[0]
else:
author = ''
return author
# 爬取网页并建立数据库
def getData(start_date, end_date):
crawl_num = 0
for i in range((end_date - start_date).days):
date_now = start_date + timedelta(days=i + 1)
date_now_s = date_now.strftime('%Y-%m/%d')
base_url = "http://news.hndaily.cn/html/" + date_now_s + '/'
url = base_url + 'node_1.htm'
# 进入首页
try:
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding
print(f'一级连接状态{response.status_code}')
if response.status_code == 200:
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select('#pageLink'):
banmianhao = item.text.split(":")[0].strip()
banmianming = item.text.split(":")[-1].strip()
if banmianming == "广告":
continue
url1 = base_url + item.get('href')
response2 = requests.get(url1, headers=headers)
response2.encoding = response2.apparent_encoding
print(f'二级连接状态{response2.status_code}')
if response2.status_code == 200:
soup1 = BeautifulSoup(response2.text, "lxml")
for item1 in soup1.select('#main-ed-articlenav-list tr td div a'):
detail_url = base_url + item1.get('href')
print(detail_url)
title = item1.text.strip()
response3 = requests.get(detail_url, headers=headers)
response3.encoding = response3.apparent_encoding
print(f'三级连接状态:{response3.status_code}')
if response3.status_code == 200:
soup2 = BeautifulSoup(response3.text, "lxml")
try:
title = soup2.select('.font01')[0].text.strip()
except IndexError:
pass
subtitle = parse_subtitle(soup2)
h3title = parse_h3title(soup2)
author = parse_author(soup2)
content = parse_html_text(soup2)
hainanribao.insert_one({'banmianhao': banmianhao,
'banmianming': banmianming,
'title': title,
'subtitle': subtitle,
'h3title': h3title,
'author': author,
'keywordlist': '',
'detail_url': detail_url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': content})
print(f"海南日报-{date_now_s}-{banmianhao}-{banmianming}-{title}已经完成")
crawl_num += 1
time.sleep(random.randint(3, 10))
print(f"海南日报-{date_now_s}-{banmianhao}-{banmianming}-已经完成")
time.sleep(random.randint(3, 10))
print(f"海南日报-{date_now_s}-已经完成")
except Exception as result:
hainanribao.insert_one({'banmianhao': 'empty',
'banmianming': 'empty',
'title': 'empty',
'subtitle': 'empty',
'h3title': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': 'empty'})
print(result)
print(f"海南日报采集完毕,本次共采集{crawl_num}条数据!")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,133 @@
# _*_ coding : UTF-8 _*_
# @Time : 2024-03-08 10:18:55
# @Author : haochen zhong
# @File : CrawlHenan.py
# @Software : PyCharm
# @Comment :采集河南日报数字报版面数据
import datetime
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
start_date = datetime.datetime.strptime('2007-10-13', '%Y-%m-%d')
"""采集开始时间"""
end_date = datetime.datetime.today()
"""采集结束时间"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
"""自定义请求头"""
# 创建数据库
dbclient = pymongo.MongoClient('localhost', 27017)
"""连接数据库"""
mydb = dbclient.dfdm_sjribao
henanribao = mydb.henanribao
def main():
# 判断数据库是否存在
collist = mydb.list_collection_names()
if "henanribao" in collist: # 检测集合是否存在
print("河南集合存在,更新数据库")
# 数据库最新一条内容的时间
db_time = henanribao.find_one(sort=[('release_time', -1)])['release_time']
print('数据库截止时间%s' % db_time)
# 输入更新数据库时间
input_time = datetime.datetime.today()
if db_time < input_time:
getData(db_time, input_time)
else:
print('数据库无需更新')
else:
# 爬取网页并建立数据库
print("数据库不存在,建立数据库!")
getData(start_date, end_date)
def getContent(soup: BeautifulSoup):
content = ''
for p in soup.select('#articleContent p'):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
def getData(start_date, end_date):
crawl_num = 0
for i in range((end_date - start_date).days): # gu:时间长度
date_now = start_date + datetime.timedelta(days=i + 1)
date_now_s = date_now.strftime('%Y-%m/%d')
base_url = "http://newpaper.dahe.cn/hnrb/html/" + date_now_s + '/'
url = base_url + 'node_1.htm'
# http://newpaper.dahe.cn/hnrb/html/2024-03/08/node_1.htm
print(url)
try:
response = requests.get(url, headers=headers, timeout=60)
response.encoding = response.apparent_encoding
print(f"一级链接状态:{response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".layout-catalogue-item>a:nth-child(1)"):
banmianhao = item.text.split(":")[0]
banmianming = item.text.split(":")[-1]
url1 = base_url + item.get("href")
response2 = requests.get(url1, headers=headers)
response2.encoding = response2.apparent_encoding
print(f"二级链接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, "lxml")
for item2 in soup2.select(".news-item a"):
title = item2.get("title", "").strip()
url2 = base_url + item2.get("href")
response3 = requests.get(url2, headers=headers)
response3.encoding = response3.apparent_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, "lxml")
content = getContent(soup3)
try:
preTitle = soup3.select(".headline")[0].text.strip()
except Exception as e:
preTitle = ""
try:
subtitle = soup3.select(".subtitle")[0].test.strip()
except Exception as e:
subtitle = ""
henanribao.insert_one({'banmianhao': banmianhao,
'banmianming': banmianming,
'title': title,
'subtitle': subtitle,
'preTitle': preTitle,
'author': '',
'keywordlist': '',
'detail_url': url2,
'release_time': date_now,
'insert_timestamp': datetime.datetime.today(),
'content': content})
crawl_num += 1
print(f"河南日报-{date_now_s}-{banmianhao}-{title}---采集成功!")
time.sleep(random.randint(5, 10))
print(f"河南日报-{date_now_s}-{banmianhao}---采集成功!")
print(f"河南日报-{date_now_s}---采集成功!")
except Exception as result:
henanribao.insert_one({'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.datetime.today(),
'content': 'empty'})
print(result)
print(f"河南日报采集完毕,共采集{crawl_num}条数据!")
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,140 @@
# _*_ coding : UTF-8 _*_
# @Time : 2022/12/29 13:48
# @Author : Haochen Zhong
# @File : CrawlNingxia.py
# @Software : PyCharm
# @Comment : 本程序采集宁夏日报版面数据
import requests
import time
import pymongo
import random
from datetime import timedelta, datetime
from bs4 import BeautifulSoup
# 数据库起止时间
start_date = datetime.strptime('2022-01-31', '%Y-%m-%d')
"""宁夏日报2022-02-01开始有数据"""
end_date = datetime.today()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
ningxiaribao = mydb.ningxiaribao
# 设置随机时间
sleeptime = random.randint(2, 10)
def main():
# 判断数据库是否存在
collist = mydb.list_collection_names()
if "ningxiaribao" in collist: # 检测集合是否存在
print("宁夏集合存在,更新数据库")
# 数据库最新一条内容的时间
db_time = ningxiaribao.find_one(sort=[('release_time', -1)])[
'release_time'] # 或者find().sort('_id', -1).limit(1)
print('数据库截止时间%s' % db_time)
# 输入更新数据库时间
input_time = datetime.today()
if db_time < input_time:
getData(db_time, input_time)
else:
print('数据库无需更新!')
else:
# 爬取网页并建立数据库
print("数据库不存在,建立数据库!")
getData(start_date, end_date)
def get_content(soup3):
content = ""
for p in soup3.select("#ozoom p"):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
def getData(start_date, end_date):
crawl_num = 0
for i in range((end_date - start_date).days): # gu:时间长度
date_now = start_date + timedelta(days=i + 1)
date_now_s = date_now.strftime('%Y%m/%d')
base_url = "https://szb.nxrb.cn/nxrb/pc/layout/" + date_now_s + "/"
url = base_url + "node_01.html"
# https://szb.nxrb.cn/nxrb/pc/layout/202202/01/node_01.html
try:
response = requests.get(url=url, headers=headers, timeout=(30, 45))
response.encoding = response.apparent_encoding
print(f"一级连接状态: {response.status_code}")
if response.status_code == 200:
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select(".nav-list .btn-block"):
banmianhao = item.text.split(":")[0].strip()
banmianming = item.text.split(":")[-1].strip()
url1 = base_url + item.get("href")
response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
response2.encoding = response2.apparent_encoding
print(f"二级连接状态:{response2.status_code}")
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, "lxml")
for item2 in soup2.select(".news-list .resultList a"):
url_title = item2.text.strip()
url2 = "https://szb.nxrb.cn/nxrb/pc/" + item2.get("href")[9:]
print(url2)
response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
response3.encoding = response3.apparent_encoding
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
soup3 = BeautifulSoup(response3.text, "lxml")
try:
pretitle = soup3.select("#PreTitle")[0].text.strip()
except:
pretitle = ""
try:
title = soup3.select("#Title")[0].text.strip()
except:
title = url_title
try:
subtitle = soup3.select("SubTitle")[0].text.strip()
except:
subtitle = ""
content = get_content(soup3)
ningxiaribao.insert_one({'banmianhao': banmianhao,
'banmianming': banmianming,
'title': title,
'subtitle': subtitle,
'h3title': pretitle,
'author': '',
'keywordlist': 'empty',
'detail_url': url2,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': content})
crawl_num += 1
print(f"宁夏日报-{date_now_s}-{banmianhao}-{title}-已完成")
time.sleep(sleeptime)
print(f"宁夏日报-{date_now_s}-{banmianhao}-已完成")
time.sleep(sleeptime)
print(f"宁夏日报-{date_now_s}-已完成")
time.sleep(sleeptime)
except Exception as result:
ningxiaribao.insert_one({'banmianhao': 'empty',
'banmianming': 'empty',
'title': 'empty',
'subtitle': 'empty',
'h3title': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': 'empty'})
print(result)
print(f"宁夏日报采集完成,成功采集{crawl_num}条数据!")
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,137 @@
# _*_ coding : UTF-8 _*_
# @Time : 2024/01/09 14:15
# @UpdateTime : 2024/01/09 16:30
# @Author : Haochen Zhong
# @File : CrawlSiChuan.py
# @Software : PyCharm
# @Comment : 本程序采集四川日报数字报板面数据
import re
from bs4 import BeautifulSoup
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random
# 数据库起止时间
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
end_date = datetime.today()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
sichuanribao = mydb.sichuanribao
def main():
# 判断数据库是否存在
collist = mydb.list_collection_names()
if "sichuanribao" in collist: # 检测集合是否存在
print("四川日报集合存在,更新数据库")
# 数据库最新一条内容的时间
db_time = sichuanribao.find_one(sort=[('release_time', -1)])[
'release_time'] # 或者find().sort('_id', -1).limit(1)
print('数据库截止时间%s' % db_time)
# 输入更新数据库时间
input_time = datetime.today()
if db_time < input_time:
getData(db_time, input_time)
else:
print('数据库无需更新')
else:
# 爬取网页并建立数据库
print("数据库不存在,建立数据库!")
getData(start_date, end_date)
def getContent(soup):
content = ''
for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(2) p'):
para = p.text.strip()
if para:
content += para
content += '\n'
return content
def getSubtitle(soup):
subtitle = ''
if soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
para = p.text.strip()
if para:
subtitle += para
subtitle += '\n'
return subtitle
def getData(start_date, end_date):
crawl_num = 0
for i in range((end_date - start_date).days):
date_now = start_date + timedelta(days=i + 1)
date_now_s = date_now.strftime('%Y%m%d')
base_url = "https://epaper.scdaily.cn/shtml/scrb/"
url = base_url + date_now_s + '/index.shtml'
try:
response = requests.get(url, headers=headers)
print(f"一级链接状态:{response.status_code}")
if response.status_code == 200:
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, "lxml")
for item in soup.select("#main > div.main_r > ul:nth-child(2) > li:nth-child(2) a"):
banmianhao = item.text.split(":")[0]
banmianming = item.text.split(":")[-1]
url1 = "https://epaper.scdaily.cn" + item.get("href")
response2 = requests.get(url1, headers=headers)
print(f"二级链接状态:{response2.status_code}")
if response2.status_code == 200:
response2.encoding = response2.apparent_encoding
soup2 = BeautifulSoup(response2.text, "lxml")
for item2 in soup2.select("#main > div.main_r > ul:nth-child(3) > li:nth-child(2) a"):
url2 = "https://epaper.scdaily.cn" + item2.get("href")
title = item2.get("title")
response3 = requests.get(url2, headers=headers)
print(f"三级连接状态:{response3.status_code}")
if response3.status_code == 200:
response3.encoding = response3.apparent_encoding
soup3 = BeautifulSoup(response3.text, "lxml")
content = getContent(soup3)
subtitle = getSubtitle(soup3)
sichuanribao.insert_one({'banmianhao': banmianhao,
'banmianming': banmianming,
'title': title,
'subtitle': subtitle,
'h3title': '',
'author': '',
'keywordlist': '',
'detail_url': url2,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': content})
print(f"四川日报--{date_now_s}-{banmianhao}-{title}----已完成")
crawl_num += 1
time.sleep(random.randint(3, 10))
print(f"四川日报--{date_now_s}-{banmianhao}----已完成")
time.sleep(random.randint(3, 10))
print(f"四川日报--{date_now_s}-----已完成")
time.sleep(random.randint(3, 10))
except Exception as result:
sichuanribao.insert_one({'banmianhao': 'empty',
'banmianming': 'empty',
'preTitle': 'empty',
'title': 'empty',
'subtitle': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': 'empty'})
print(result)
print(f"四川日报采集完毕,共采集{crawl_num}条数据!")
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,167 @@
# _*_ coding : UTF-8 _*_
# @Time : 2022/6/17 8:50
# @Author : Haochen Zhong
# @File : 本程序用于抓取上海新民晚报数据
# @Project : Pytharm
from bs4 import BeautifulSoup, Comment
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random
start_date = datetime.strptime('2018-12-31', '%Y-%m-%d') # 抓取上海新民晚报从2019-01-01到至今的数据
end_date = datetime.today()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_qitaguanbao
shxinminwanbao = mydb.shxinminwanbao
# 设置随机时间
def main():
# 判断数据库是否存在
collist = mydb.list_collection_names()
if "shxinminwanbao" in collist: # 检测集合是否存在
print("上海新民晚报集合存在,更新数据库")
# 数据库最新一条内容的时间
db_time = shxinminwanbao.find_one(sort=[('release_time', -1)])['release_time']
print('数据库截止时间%s' % db_time)
# 输入更新数据库时间
input_time = datetime.today()
if db_time < input_time:
getData(db_time, input_time)
else:
print('数据库无需更新')
else:
# 爬取网页并建立数据库
print('数据库不存在,建立数据库!')
getData(start_date, end_date)
def parse_html_text(soup2):
img_list = soup2.select('.dzb-enter-desc-box p img')
if img_list:
img = '图片链接:\n'
for i in img_list:
img_url = 'https:' + i.get('src')
img += img_url
img += '\n'
content = img + '正文内容:\n'
for p in soup2.select('.dzb-enter-desc-box p'):
para = p.text.split(' ')
for x in para:
if x != '' and x != '\\n\\n':
content += x.strip()
content += '\n'
else:
content = ''
for p in soup2.select('.dzb-enter-desc-box p'):
para = p.text.split(' ')
for x in para:
if x.strip() != '' and x != '\\n\\n':
content += x.strip()
content += '\n'
return content
def getData(start_date, end_date):
for i in range((end_date - start_date).days):
date_now = start_date + timedelta(days=i + 1)
date_now_s = date_now.strftime('%Y-%m-%d')
base_url = "https://paper.xinmin.cn/html/xmwb/" + date_now_s + '/'
url = base_url + '1.html'
art_base_url = 'https://paper.xinmin.cn'
# 进入首页
try:
try:
response = requests.get(url=url, headers=headers, timeout=30)
except:
time.sleep(10)
response = requests.get(url=url, headers=headers, timeout=30)
response.encoding = response.apparent_encoding
print('一级连接状态:%d' % response.status_code)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'lxml')
response.close()
# 提取所有版面信息
for item in soup.select('.dzb-enter-mulu-wrap-nav a'):
url1 = art_base_url + item.get('href')
banmianhao = item.get('title').split(':')[0]
banmianming = item.get('title').split(':')[-1]
try:
response2 = requests.get(url=url1, headers=headers, timeout=30)
except:
time.sleep(10)
response2 = requests.get(url=url1, headers=headers, timeout=30)
response2.encoding = response2.apparent_encoding
print('二级连接状态:%d' % response2.status_code)
if response2.status_code == 200:
soup1 = BeautifulSoup(response2.text, 'lxml')
response2.close()
for item1 in soup1.select('.dzb-enter-benban-wrap div a'):
url2 = art_base_url + item1.get('href')
try:
response3 = requests.get(url=url2, headers=headers, timeout=30)
except:
time.sleep(10)
response3 = requests.get(url=url2, headers=headers, timeout=30)
response3.encoding = response3.apparent_encoding
print('三级连接状态:%d' % response3.status_code)
if response3.status_code == 200:
soup2 = BeautifulSoup(response3.text, 'lxml')
response3.close()
title = soup2.select('.dzb-title-box')[0].text.strip()
pass_list = ['上海地区今明天气', '上海市今明天气预报', '广告']
if title in pass_list: # 筛除每天海今明天气和广告
time.sleep(random.randint(2, 8))
continue
subtitle = soup2.select('.dzb-sub-title-box')[0].text.strip()
# 查找所有注释
comments = soup2.find_all(string=lambda text: isinstance(text, Comment))
author = ""
# 遍历注释,找到包含作者的注释
for comment in comments:
if 'dzb-author-box' in comment:
# 使用 BeautifulSoup 解析注释内容
author_soup = BeautifulSoup(comment, 'html.parser')
author = author_soup.find('span', class_='dzb-author-box').text
pretitle = soup2.select('.dzb-special-title-box')[0].text.strip()
content = parse_html_text(soup2)
shxinminwanbao.insert_one({'banmianhao': banmianhao,
'banmianming': banmianming,
'pretitle': pretitle,
'title': title,
'subtitle': subtitle,
'author': author,
'keywordlist': '',
'detail_url': url2,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': content})
print('上海新民晚报-%s-%s-%s-已完成' % (date_now_s, banmianhao, title))
time.sleep(random.randint(2, 8))
print('上海新民晚报-%s-%s-已完成' % (date_now_s, banmianhao))
print("上海新民晚报-%s-已经完成" % date_now_s)
except Exception as result:
shxinminwanbao.insert_one({'banmianhao': 'empty',
'banmianming': 'empty',
'title': 'empty',
'subtitle': 'empty',
'h3title': 'empty',
'author': 'empty',
'keywordlist': 'empty',
'detail_url': url,
'release_time': date_now,
'insert_timestamp': datetime.today(),
'content': 'empty'})
print(result)
if __name__ == '__main__':
main()
print("爬取完毕!")

View File

@@ -0,0 +1,72 @@
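# Editor's note: descriptive header inferred from the code below (the original file has none).
# This script pages through the policy-document index at https://www.shanghai.gov.cn/xxzfgzwj/
# (index.html plus index_2.html ... index_28.html), follows each document link, extracts the
# body text from #ivs_content, and stores title/subtitle/content/url in the MongoDB collection
# sjzf_zcwj.shanghaizcwj, skipping URLs that are already present.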
import datetime
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/51.0.2704.63 Safari/537.36',
'Connection': 'close'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shanghaizcwj = mydb.shanghaizcwj
base_url = "https://www.shanghai.gov.cn"
def getContent(soup: BeautifulSoup) -> str:
"""
获取文章正文内容
:param soup:
:return:
"""
content: str = ""
for p in soup.select('#ivs_content p'):
para: str = p.text.strip()
if para:
content += para
content += '\n'
return content
def getData():
pages = 28
for i in range(1, pages + 1):
if i == 1:
url = "https://www.shanghai.gov.cn/xxzfgzwj/index.html"
else:
url = f"https://www.shanghai.gov.cn/xxzfgzwj/index_{i}.html"
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding
if response.status_code == 200:
soup = BeautifulSoup(response.text, "lxml")
# print(response.text)
trList = soup.select(".trout-region-list tbody tr")
for item in trList:
data = item.select("a")[0]
title = data.get("title", "")
url = base_url + data.get("href", "")
print(url)
if shanghaizcwj.find_one({"url": url}):
continue
subtitle = data.select_one(".text-color").text.strip()
response2 = requests.get(url=url, headers=headers)
response2.encoding = response2.apparent_encoding
print(response2.status_code)
if response2.status_code == 200:
soup2 = BeautifulSoup(response2.text, "lxml")
content: str = getContent(soup=soup2)
shanghaizcwj.insert_one({
"title": title,
"subtitle": subtitle,
"content": content,
"url": url,
})
time.sleep(random.randint(3, 5))
print(title, "采集完成")
getData()

View File

@@ -0,0 +1,153 @@
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序采集新疆维吾尔自治区人民政府规章库
import datetime
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/51.0.2704.63 Safari/537.36',
'Connection': 'close'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
xinjiangzcwj = mydb.xinjiangzcwj
def getContent(soup: BeautifulSoup) -> str:
"""
获取文章正文内容
:param soup:
:return:
"""
content: str = ""
for p in soup.select('.gknbxq_detail p'):
para: str = p.text.strip()
if para:
content += para
content += '\n'
return content
def getData():
"""程序主函数"""
count = 10000
"""设置单次获取文章数量,可以任意设置正整数"""
dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
"""请求所有文章数据连接"""
dataJson = {
"websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
"channelId": [
"2aceb5d534434a9fb3550295b52a87e5"
],
"domainMetaList": [
{}
],
"pageSize": f"{count}",
"pageNum": 1,
"title": None
}
"""请求参数"""
response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
response.encoding = response.apparent_encoding
print(f"一级链接状态:{response.status_code}")
if response.status_code == 200:
dataList = response.json()["results"]
for item in dataList:
try:
url: str = item["websiteDomain"] + item["url"]
"""文章链接"""
result = xinjiangzcwj.find_one({"url": url})
if result:
continue
typeOneName: str = item["channelName"]
"""文章归类"""
title: str = item["title"]
"""文章标题"""
subTitle: str = item["subTitle"]
"""文章副标题"""
if item["publishedTime"]:
pubtime: float = datetime.datetime.strptime(item["publishedTime"], "%Y-%m-%d").timestamp()
"""发布日期"""
else:
pubtime: float = 0
"""发布日期"""
puborg: str = item["domainMetaList"]["xxgkml"]["resultList"]["fwjg2"]["cnName"]
"""发文机关(自治区)"""
articleType: str = item["domainMetaList"]["xxgkml"]["resultList"]["gwzl2"]["cnName"]
"""公文种类"""
if item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"]:
ptime: float = datetime.datetime.strptime(
item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"],
"%Y-%m-%d").timestamp()
"""成文日期"""
else:
ptime: float = 0
"""成文日期"""
index: str = item["domainMetaList"]["xxgkml"]["resultList"]["syh2"]["cnName"]
"""索引号"""
pcode: str = item["domainMetaList"]["xxgkml"]["resultList"]["wenh2"]["cnName"]
"""文号"""
effectiveness: str = item["domainMetaList"]["xxgkml"]["resultList"]["yxx01"]["cnName"]
"""有效性"""
typeSecondName: str = item["domainMetaList"]["xxgkml"]["resultList"]["wz2"]["cnName"]
"""文种(自治区)"""
year: str = item["domainMetaList"]["xxgkml"]["resultList"]["nianf2"]["cnName"]
"""年份"""
childtype: str = item["domainMetaList"]["xxgkml"]["resultList"]["ztfl2"]["cnName"]
"""主题分类"""
author: str = item["domainMetaList"]["默认元数据集"]["resultList"]["author"]["cnName"]
"""作者"""
source: str = item["domainMetaList"]["默认元数据集"]["resultList"]["source"]["cnName"]
"""来源"""
if item["manuscriptRelatedRes"]:
manuscriptRelatedRes: str = item["websiteDomain"] + item["manuscriptRelatedRes"]
"""附件链接"""
else:
manuscriptRelatedRes: str = ""
"""附件链接"""
response = requests.get(url=url, headers=headers, timeout=60)
response.encoding = response.apparent_encoding
print(f"二级链接状态:{response.status_code}")
if response.status_code == 200:
soup: BeautifulSoup = BeautifulSoup(response.text, "html.parser")
content: str = getContent(soup=soup)
xinjiangzcwj.insert_one(
{
'typeOneName': typeOneName,
'typeSecondName': typeSecondName,
'articleType': articleType,
"title": title,
"subTitle": subTitle,
"childtype": childtype,
"index": index,
"pcode": pcode,
"puborg": puborg,
"ptime": ptime,
"pubtime": pubtime,
"effectiveness": effectiveness,
"author": author,
"year": year,
"manuscriptRelatedRes": manuscriptRelatedRes,
"url": url,
"source": source,
"content": content
}
)
print(f"{typeOneName}--{typeSecondName}--{title}-已完成")
time.sleep(random.randint(3, 8))
except Exception as e:
print(e)
continue
if __name__ == '__main__':
getData()
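
Every metadata field above is read through a four-level chain of dictionary lookups, and any missing key sends the whole article into the except branch. A defensive-accessor sketch (the helper name meta is illustrative, not from the original script):

def meta(item: dict, group: str, field: str, default: str = "") -> str:
    """Return item["domainMetaList"][group]["resultList"][field]["cnName"],
    or default when any level of the nested JSON is missing or empty."""
    try:
        value = item["domainMetaList"][group]["resultList"][field]["cnName"]
    except (KeyError, TypeError):
        return default
    return value if value else default

# e.g. puborg = meta(item, "xxgkml", "fwjg2") replaces the chained [...] lookups.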

View File

@@ -0,0 +1,185 @@
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/28 0:50
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序用于导出新疆维吾尔自治区人民政府规章
import datetime
import os
import time
import pandas as pd
import pymongo
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm
client = pymongo.MongoClient('localhost', 27017)
"""与mongoDB数据库建立连接"""
mydb = client.sjzf_zcwj
"""政策文件存放在数据库的一级目录对象"""
xinjiangzcwj = mydb.xinjiangzcwj
"""政策文件存放对象"""
savePath = ""
"""导出文件存放路径"""
def replace_invalid_chars(text):
"""
替换Window系统和Linux系统文件路径禁止字符统一转换成Html实体编码
"""
invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
"""Window系统和Linux系统文件路径禁止字符列表"""
replace_char = ['&lt;', '&gt;', '&#58;', '&quot;', '&#47;', '&#92;', '&#124;', '&#63;', '&#42;']
"""Window系统和Linux系统文件路径禁止字符替换列表 统一转换成html实体编码"""
for i, char in enumerate(invalid_chars):
text = text.replace(char, replace_char[i])
return text
def analysisTime(timestamp: int) -> str:
"""
处理时间将1970-01-01之前的时间戳正确转换
"""
if timestamp == 0:
return "未知"
if timestamp < 0:
# 计算从 1970-01-01 开始的时间间隔
delta = datetime.timedelta(seconds=abs(timestamp))
date = datetime.datetime(1970, 1, 1) - delta
else:
date = datetime.datetime.fromtimestamp(timestamp)
# 格式化为字符串
return date.strftime('%Y-%m-%d')
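# Editor's example, as a quick check of analysisTime: negative timestamps are subtracted
# from 1970-01-01, so analysisTime(-86400) returns "1969-12-31"; analysisTime(0) returns
# "未知"; positive values go through datetime.fromtimestamp() as usual.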
def saveFile():
num = 0
startTime = time.time()
global savePath
query = {
'typeOneName': "",
'typeSecondName': "",
'articleType': "",
"title": "",
"subTitle": "",
"childtype": "",
"index": "",
"pcode": "",
"puborg": "",
"ptime": "",
"pubtime": "",
"effectiveness": "",
"author": "",
"year": "",
"manuscriptRelatedRes": "",
"url": "",
"source": "",
"content": ""
}
query = {f'{k}': v for k, v in query.items() if v}
"""需要过滤的文章,默认不过滤"""
dataList = list(xinjiangzcwj.find(query))
if not savePath:
savePath = input("请输入数据存放路径:")
totalPath = os.path.join(savePath, "数据统计表.csv")
for data in dataList:
try:
typeOneName = data["typeOneName"]
"""一级分类目录"""
typeSecondName = data["typeSecondName"]
"""二级分类目录"""
articleType = data["articleType"]
"""四级分类目录"""
# 创建目录
output_directory = os.path.join(savePath, typeOneName, typeSecondName)
if not os.path.exists(output_directory):
os.makedirs(output_directory)
doc = Document()
firstLine = doc.add_paragraph()
firstLineText = f"索引号:{data['index']}\t\t有效性:{data['effectiveness']}"
firstLine_run = firstLine.add_run(firstLineText)
firstLine_run.font.size = Pt(12)
firstLine_run.font.name = 'Times New Roman' # 设置标题西文字体
firstLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
secondLine = doc.add_paragraph()
secondLineText = f"发文机关:{data['puborg']}\t\t发文字号:{data['pcode']}"
secondLine_run = secondLine.add_run(secondLineText)
secondLine_run.font.size = Pt(12)
secondLine_run.font.name = 'Times New Roman' # 设置标题西文字体
secondLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
thirdLine = doc.add_paragraph()
thirdLineText = f"标题:{data['title']}"
thirdLine_run = thirdLine.add_run(thirdLineText)
thirdLine_run.font.size = Pt(12)
thirdLine_run.font.name = 'Times New Roman' # 设置标题西文字体
thirdLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
fourLine = doc.add_paragraph()
pubtime = analysisTime(data['pubtime'])
ptime = analysisTime(data['ptime'])
fourLineText = f"成文日期:{ptime}\t\t发布日期:{pubtime}"
fourLine_run = fourLine.add_run(fourLineText)
fourLine_run.font.size = Pt(12)
fourLine_run.font.name = 'Times New Roman' # 设置标题西文字体
fourLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
urlLine = doc.add_paragraph()
urlLineText = f"文章链接:{data['url']}"
urlLine_run = urlLine.add_run(urlLineText)
urlLine_run.font.size = Pt(12)
urlLine_run.font.name = 'Times New Roman' # 设置标题西文字体
urlLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
title = doc.add_paragraph()
title_run = title.add_run(data["title"])
title_run.bold = True
title_run.font.size = Pt(22)
title_run.font.name = 'Times New Roman' # 设置标题西文字体
title_run.element.rPr.rFonts.set(qn('w:eastAsia'), "华文中宋")
title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER # 设置大标题居中对齐
for section in data["content"].split("\n"):
paragraph = doc.add_paragraph()
run = paragraph.add_run("\t" + section)
run.font.size = Pt(16)
run.font.name = "Times New Roman"
run.element.rPr.rFonts.set(qn('w:eastAsia'), "仿宋")
paragraph.paragraph_format.first_line_indent = Cm(0.74)
if data["manuscriptRelatedRes"]:
urlLine = doc.add_paragraph()
urlLineText = f"附件链接:{data['manuscriptRelatedRes']}"
urlLine_run = urlLine.add_run(urlLineText)
urlLine_run.font.size = Pt(12)
urlLine_run.font.name = 'Times New Roman' # 设置标题西文字体
urlLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
if len(data["title"]) > 45:
title_ = data["title"][len(data["title"]) - 30:]
else:
title_ = data["title"]
fileName = f"{replace_invalid_chars(title_)}.docx"
filePath = os.path.join(output_directory, fileName)
doc.save(filePath)
num += 1
print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功!")
except Exception as e:
print(e)
continue
csvData = pd.DataFrame(dataList)
csvData.columns = ["数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题", "文章副标题", "主题分类",
"索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者", "年份",
"附件链接",
"文章链接", "来源", "正文内容"]
csvData.to_csv(totalPath, encoding="utf-8-sig",index_label="序号")
print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")
if __name__ == '__main__':
saveFile()

View File

@@ -0,0 +1,62 @@
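# Editor's note: descriptive header inferred from the code below (the original file has none).
# This script walks the Shenzhen municipal government channels listed in yearList through the
# JSON API at http://www.sz.gov.cn/postmeta/, merges each article's list entry with its full
# JSON record via update_json_data, and inserts the result into the MongoDB collection
# sjzf_zcwj.shenzhenzcwj, skipping articles whose "id" is already stored.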
import asyncio
import datetime
import random
import time
import pymongo
import requests
from httpx import AsyncClient
# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/51.0.2704.63 Safari/537.36', }
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shenzhenzcwj = mydb.shenzhenzcwj
yearList = ['158104', '148604', '141910', '125615', '103604', '101620', '101621', '101622', '101623', '101624',
'101625', '101626', '101627', '101628', '101629', '101630', '101631', '101632', '101633', '101634',
'101635', '101636', '101637', '101638', '146351', '146338', '146325', '146311', '146298', '146285',
'146272', '146205', '146190', '145973', '145972', '145970']
def update_json_data(original_data, new_data):
# 遍历新数据的键值对
for key, value in new_data.items():
# 如果新数据的值不为 None 或者空字符串,更新原数据
if value is not None and value != "":
original_data[key] = value
return original_data
async def getData():
async with AsyncClient(headers=headers, timeout=60, verify=False) as client:
for i in yearList:
url = f"http://www.sz.gov.cn/postmeta/i/{i}.json"
print(url)
response = await client.get(url=url)
response.encoding = response.charset_encoding
print(response.status_code)
if response.status_code == 200:
for item in response.json()["children"]:
url2 = f"http://www.sz.gov.cn/postmeta/i/{item['id']}.json"
print(url2)
response2 = await client.get(url=url2)
response2.encoding = response2.charset_encoding
print(response2.status_code)
if response2.status_code == 200:
for item2 in response2.json()["articles"][1:]:
if shenzhenzcwj.find_one({"id":item2["id"]}):
continue
url3 = f"http://www.sz.gov.cn/postmeta/p/{item2['id'] // 1000000}/{item2['id'] // 1000}/{item2['id']}.json"
response3 = await client.get(url=url3)
response3.encoding = response3.charset_encoding
print(response3.status_code)
if response3.status_code == 200:
data = response3.json()
newData = update_json_data(item2, data)
shenzhenzcwj.insert_one(newData)
print(newData["title"],"采集完成")
await asyncio.sleep(random.randint(2,3))
asyncio.run(getData())