初始化仓库
This commit is contained in:
commit
3bfb57b662
52
.gitignore
vendored
Normal file
52
.gitignore
vendored
Normal file
@ -0,0 +1,52 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
|
||||
config.json
|
||||
# Virtual environment
|
||||
venv/
|
||||
env/
|
||||
.venv/
|
||||
.venv3/
|
||||
.Python
|
||||
*.sqlite3
|
||||
|
||||
# IDE-specific files
|
||||
.idea/
|
||||
.vscode/
|
||||
|
||||
# Compiled source
|
||||
*.com
|
||||
*.class
|
||||
*.dll
|
||||
*.exe
|
||||
*.o
|
||||
*.so
|
||||
|
||||
# Logs and databases
|
||||
*.log
|
||||
*.sql
|
||||
*.sqlite
|
||||
|
||||
# Output files
|
||||
dist/
|
||||
build/
|
||||
*.egg-info/
|
||||
*.egg
|
||||
|
||||
# OS-specific files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Miscellaneous
|
||||
*.bak
|
||||
*.swp
|
||||
*.tmp
|
||||
*.tmp.*
|
||||
*.~*
|
||||
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints/
|
||||
BIN
requirements.txt
Normal file
BIN
requirements.txt
Normal file
Binary file not shown.
186
国内党媒/CrawlZhongguogaigebao.py
Normal file
186
国内党媒/CrawlZhongguogaigebao.py
Normal file
@ -0,0 +1,186 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/06 21:35
|
||||
# @UpdateTime : 2024/11/06 21:35
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlZhongguogaigebao.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集中国改革报版面数据
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2017-09', '%Y-%m')
|
||||
"""中国改革报2017年9月份开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['buweijiguanbao']
|
||||
collection = db['zhongguogaigebao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "zhongguogaigebao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("中国改革报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:",last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select("#ozoom p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
"""
|
||||
:param start_date: 开始日期
|
||||
:param end_date: 结束日期
|
||||
:return: None
|
||||
"""
|
||||
crawl_num = 0
|
||||
# 创建一个列表保存月份
|
||||
months = []
|
||||
# 从开始日期到结束日期,每个月份都添加到列表中
|
||||
current_date = start_date
|
||||
while current_date <= end_date:
|
||||
months.append(current_date)
|
||||
# 增加一个月
|
||||
if current_date.month == 12:
|
||||
current_date = current_date.replace(year=current_date.year + 1, month=1)
|
||||
else:
|
||||
current_date = current_date.replace(month=current_date.month + 1)
|
||||
# 遍历月份列表
|
||||
for month in months:
|
||||
# 构造URL
|
||||
url = f'http://www.cfgw.net.cn/epaper/{month.strftime("%Y%m")}/period.xml'
|
||||
"""http://www.cfgw.net.cn/epaper/201709/period.xml"""
|
||||
print(url)
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
# 发送GET请求
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
# 解析XML
|
||||
soup = BeautifulSoup(response.text, 'xml')
|
||||
for period in soup.find_all("period"):
|
||||
try:
|
||||
period_id = period.get("id")
|
||||
url1 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/node_01.htm"
|
||||
"""http://www.cfgw.net.cn/epaper/201709/05/node_01.htm"""
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".posRelative>a"):
|
||||
url2 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/" + item.get(
|
||||
"href")
|
||||
"""http://www.cfgw.net.cn/epaper/201709/05/node_01/node_01.htm"""
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#articlelist > .clearfix > a"):
|
||||
url3 = f"http://www.cfgw.net.cn/epaper/" + item2.get("href")[6:]
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = response4.charset_encoding
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select("#Title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subtitle = soup4.select("#SubTitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
try:
|
||||
preTitle = soup4.select("#PreTitle")[0].text.strip()
|
||||
except:
|
||||
preTitle = ""
|
||||
try:
|
||||
author = soup4.find("author").text.strip()
|
||||
except:
|
||||
author = ""
|
||||
try:
|
||||
keyword = soup4.find("keyword").text.strip()
|
||||
except:
|
||||
keyword = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subtitle,
|
||||
"preTitle": preTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': keyword,
|
||||
'detail_url': url3,
|
||||
'release_time': month + timedelta(days=int(period_id)-1),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': month + timedelta(days=int(period_id)),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(e)
|
||||
print(f"中国改革报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
282
国内党媒/CrawlZhongguojiaoyubao.py
Normal file
282
国内党媒/CrawlZhongguojiaoyubao.py
Normal file
@ -0,0 +1,282 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/08 21:42
|
||||
# @UpdateTime : 2024/11/08 21:42
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlZhongguojiaoyubao.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集中国教育报数据
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2022-01', '%Y-%m')
|
||||
"""中国教育报2022年1月份开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['buweijiguanbao']
|
||||
collection = db['zhongguojiaoyubao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "zhongguojiaoyubao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("中国教育报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:", last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select(".content_tt p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
"""
|
||||
:param start_date: 开始日期
|
||||
:param end_date: 结束日期
|
||||
:return: None
|
||||
"""
|
||||
crawl_num = 0
|
||||
# 创建一个列表保存月份
|
||||
months = []
|
||||
# 从开始日期到结束日期,每个月份都添加到列表中
|
||||
current_date = start_date
|
||||
while current_date <= end_date:
|
||||
months.append(current_date)
|
||||
# 增加一个月
|
||||
if current_date.month == 12:
|
||||
current_date = current_date.replace(year=current_date.year + 1, month=1)
|
||||
else:
|
||||
current_date = current_date.replace(month=current_date.month + 1)
|
||||
# 遍历月份列表
|
||||
for month in months:
|
||||
# 构造URL
|
||||
url = f'http://paper.jyb.cn/zgjyb/html/{month.strftime("%Y-%m")}/period.xml'
|
||||
"""http://paper.jyb.cn/zgjyb/html/2023-01/period.xml"""
|
||||
print(url)
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'xml')
|
||||
for period in soup.select("period"):
|
||||
period_name = datetime.strptime(period.find("period_name").text.strip(), "%Y-%m-%d")
|
||||
front_page = period.find("front_page").text.strip()
|
||||
try:
|
||||
url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".right_title-name a"):
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get(
|
||||
"href").replace("./","").strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#titleList1 a"):
|
||||
url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get(
|
||||
"href")
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = response4.charset_encoding
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select_one(".title1").text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subTitle = soup4.select(".title2")[0].text.strip()
|
||||
except:
|
||||
subTitle = ""
|
||||
try:
|
||||
author = soup4.select_one(".title3").text.strip()
|
||||
except:
|
||||
author = ""
|
||||
try:
|
||||
perTitle = soup4.select(".title2")[-1].text.strip()
|
||||
except:
|
||||
perTitle = ""
|
||||
try:
|
||||
keywordlist = soup4.find("founder-keyword").text.strip()
|
||||
except:
|
||||
keywordlist = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subTitle,
|
||||
"preTitle": perTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': keywordlist,
|
||||
'detail_url': url3,
|
||||
'release_time': period_name,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
print(e)
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': period_name,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
|
||||
else:
|
||||
url = f"http://paper.jyb.cn/zgjyb/html/{month.strftime('%Y-%m')}/navi.xml"
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'xml')
|
||||
for period in soup.select("calendar"):
|
||||
period_name = datetime.strptime(period.find("date").text.strip(), "%Y-%m-%d")
|
||||
front_page = period.find("url").text.strip()[6:]
|
||||
try:
|
||||
url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".right_title-name a"):
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get(
|
||||
"href").replace("./","").strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#titleList1 a"):
|
||||
url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get(
|
||||
"href")
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = response4.charset_encoding
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select(".article-title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subTitle = soup4.select(".article-subtitle")[0].text.strip()
|
||||
except:
|
||||
subTitle = ""
|
||||
try:
|
||||
author = soup4.select(".article-author")[0].text.strip()
|
||||
except:
|
||||
author = ""
|
||||
try:
|
||||
perTitle = soup4.select(".article-pretitle")[0].text.strip()
|
||||
except:
|
||||
perTitle = ""
|
||||
try:
|
||||
keywordlist = soup4.find("founder-keyword").text.strip()
|
||||
except:
|
||||
keywordlist = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subTitle,
|
||||
"preTitle": perTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': keywordlist,
|
||||
'detail_url': url3,
|
||||
'release_time': period_name,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time':period_name ,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(f"中国教育报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
asyncio.run(main())
|
||||
185
国内党媒/CrawlZhongguojingjidaobao.py
Normal file
185
国内党媒/CrawlZhongguojingjidaobao.py
Normal file
@ -0,0 +1,185 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/08 00:07
|
||||
# @UpdateTime : 2024/11/08 00:07
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlZhongguojingjidaobao.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集中国经济导报数据
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2012-09', '%Y-%m')
|
||||
"""中国经济导报2012年9月份开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['buweijiguanbao']
|
||||
collection = db['zhongguojingjidaobao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "zhongguojingjidaobao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("中国经济导报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:", last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select("#pgcontent"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
"""
|
||||
:param start_date: 开始日期
|
||||
:param end_date: 结束日期
|
||||
:return: None
|
||||
"""
|
||||
crawl_num = 0
|
||||
# 创建一个列表保存月份
|
||||
months = []
|
||||
# 从开始日期到结束日期,每个月份都添加到列表中
|
||||
current_date = start_date
|
||||
while current_date <= end_date:
|
||||
months.append(current_date)
|
||||
# 增加一个月
|
||||
if current_date.month == 12:
|
||||
current_date = current_date.replace(year=current_date.year + 1, month=1)
|
||||
else:
|
||||
current_date = current_date.replace(month=current_date.month + 1)
|
||||
# 遍历月份列表
|
||||
for month in months:
|
||||
# 构造URL
|
||||
url = f'http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime("%Y/%m")}/date.txt'
|
||||
"""http://www.ceh.com.cn/epaper/uniflows/html/2012/09/date.txt"""
|
||||
print(url)
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
# 发送GET请求
|
||||
response = await client.get(url)
|
||||
response.encoding = "gb2312"
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
# 解析XML
|
||||
soup = response.text.split("|")
|
||||
for period in soup:
|
||||
period_id, element = period.split(",")
|
||||
if len(element) < 5:
|
||||
continue
|
||||
try:
|
||||
url1 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/boardurl.htm"
|
||||
"""http://www.ceh.com.cn/epaper/uniflows/html/2012/09/01/boardurl.htm"""
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = "gb2312"
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".board_link td>a"):
|
||||
url2 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + item.get(
|
||||
"href")
|
||||
"""http://www.ceh.com.cn/epaper/uniflows/html/2024/11/07/01/default.htm"""
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = "gb2312"
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#mp_32"):
|
||||
url3 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + \
|
||||
item.get("href").split("/")[0] + "/" + item2.get("href")
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = "gb2312"
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select(".content_title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subtitle = soup4.select(".subtitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
try:
|
||||
preTitle = soup4.select(".yinti_title")[0].text.strip()
|
||||
except:
|
||||
preTitle = ""
|
||||
try:
|
||||
author = soup4.select(".others")[0].text.strip()
|
||||
except:
|
||||
author = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subtitle,
|
||||
"preTitle": preTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url3,
|
||||
'release_time': month + timedelta(days=int(period_id) - 1),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': month + timedelta(days=int(period_id)),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(e)
|
||||
print(f"中国经济导报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
BIN
国内党媒/v1.0各部委的机关报纸和专业报纸名录信息20241104.xlsx
Normal file
BIN
国内党媒/v1.0各部委的机关报纸和专业报纸名录信息20241104.xlsx
Normal file
Binary file not shown.
145
地方政策/报刊/CrawlAnhui.py
Normal file
145
地方政策/报刊/CrawlAnhui.py
Normal file
@ -0,0 +1,145 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/08 20:29
|
||||
# @UpdateTime : 2024/11/08 20:29
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlAnhui.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集安徽日报数字报数据
|
||||
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2017-09-29', '%Y-%m-%d')
|
||||
"""安徽日报报2018年09月29日开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['dfdm_sjribao']
|
||||
collection = db['anhuiribao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "anhuiribao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("安徽日报报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:", last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select(".content p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m/%d')
|
||||
base_url = "https://szb.ahnews.com.cn/ahrb/layout/" + date_now_s + '/'
|
||||
url = base_url + 'node_01.html'
|
||||
"""https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html"""
|
||||
try:
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
print(url)
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'lxml')
|
||||
for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
|
||||
url1 = base_url + item.get("href")
|
||||
print(url1)
|
||||
response2= await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item2 in soup2.select(".newslist a"):
|
||||
url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
|
||||
"""https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
|
||||
if await collection.find_one({"detail_url": url2}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
content = await getContent(soup3)
|
||||
try:
|
||||
title = soup3.select(".newsdetatit h3")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subTitle= soup3.select(".newsdetatext p")[0].text.strip()
|
||||
except:
|
||||
subTitle = ""
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subTitle,
|
||||
"preTitle": "",
|
||||
"author": "",
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"安徽日报---{date_now_s}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
print(e)
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(f"安徽日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
140
地方政策/报刊/CrawlGuizhou.py
Normal file
140
地方政策/报刊/CrawlGuizhou.py
Normal file
@ -0,0 +1,140 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2022/12/27 14:15
|
||||
# @UpdateTime : 2023/11/08 16:30
|
||||
# @Author : Haochen Zhong
|
||||
# @File : CrawlGuizhou.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集贵州日报数字报板面数据
|
||||
import random
|
||||
import time
|
||||
from datetime import timedelta, datetime
|
||||
|
||||
import pymongo
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
|
||||
"""贵州日报数字报2022-01-01开始有数据纪录"""
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
guizhouribao = mydb.guizhouribao
|
||||
# 设置随机时间
|
||||
sleeptime = random.randint(2, 15)
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "guizhouribao" in collist: # 检测集合是否存在
|
||||
print("贵州集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = guizhouribao.find_one(sort=[('release_time', -1)])[
|
||||
'release_time'] # 或者find().sort('_id', -1).limit(1)
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def get_content(soup3):
|
||||
content = ""
|
||||
for p in soup3.select("#ozoom p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m/%d')
|
||||
base_url = "http://szb.gzrbs.com.cn/pc/layout/" + date_now_s + "/"
|
||||
url = base_url + "node_01.html"
|
||||
# http://szb.gzrbs.com.cn/pc/layout/202201/01/node_01.html
|
||||
try:
|
||||
response = requests.get(url=url, headers=headers, timeout=(30, 45))
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select(".btn-block"):
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
url1 = base_url + item.get("href")
|
||||
response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select(".resultList a"):
|
||||
title = item2.text.strip()
|
||||
url2 = "http://szb.gzrbs.com.cn/pc/" + item2.get("href")[9:]
|
||||
# http://szb.gzrbs.com.cn/pc/cont/202201/02/content_42202.html
|
||||
response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
try:
|
||||
title = soup3.select("#Title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subtitle = soup3.select("#SubTitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
try:
|
||||
preTitle = soup3.select("#PreTitle")[0].text.strip()
|
||||
except:
|
||||
preTitle = ""
|
||||
content = get_content(soup3)
|
||||
guizhouribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'preTitle': preTitle,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'author': '',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
crawl_num += 1
|
||||
print(f"贵州日报-{date_now_s}-{banmianming}-{title}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"贵州日报-{date_now_s}-{banmianming}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"贵州日报-{date_now_s}-已完成")
|
||||
except Exception as result:
|
||||
guizhouribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"贵州日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
162
地方政策/报刊/CrawlHainan.py
Normal file
162
地方政策/报刊/CrawlHainan.py
Normal file
@ -0,0 +1,162 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024-01-17 14:24:59
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlHainan.py
|
||||
# @Software : PyCharm
|
||||
# @Comment :
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from datetime import timedelta, datetime
|
||||
import time
|
||||
import pymongo
|
||||
import random
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2008-02-29', '%Y-%m-%d')
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
hainanribao = mydb.hainanribao
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "hainanribao" in collist: # 检测集合是否存在
|
||||
print("海南日报集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = hainanribao.find_one(sort=[('release_time', -1)])['release_time']
|
||||
print(f'数据库截止时间{db_time}')
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
print("数据库不存在,建立数据库")
|
||||
# 爬取网页并建立数据库
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
# 解析网页正文
|
||||
def parse_html_text(soup):
|
||||
"""
|
||||
:param html: html字符串
|
||||
:return: 正文 string
|
||||
"""
|
||||
content = ''
|
||||
if soup.select('#ozoom'):
|
||||
content = soup.select('#ozoom')[0].text.strip()
|
||||
return content
|
||||
|
||||
|
||||
def parse_subtitle(soup):
|
||||
item = soup.select('.font02')
|
||||
if re.findall(r'article-subtitle>-->(.*?)<!--', str(item)):
|
||||
subtitle = re.findall(r'article-subtitle>-->(.*?)<!--', str(item))[0]
|
||||
else:
|
||||
subtitle = ''
|
||||
return subtitle
|
||||
|
||||
|
||||
def parse_h3title(soup):
|
||||
item = soup.select('.font02')
|
||||
if re.findall(r'article-pretitle>-->(.*?)<!--', str(item)):
|
||||
h3title = re.findall(r'article-pretitle>-->(.*?)<!--', str(item))[0]
|
||||
else:
|
||||
h3title = ''
|
||||
return h3title
|
||||
|
||||
|
||||
def parse_author(soup):
|
||||
item = soup.select('.font02')
|
||||
if re.findall(r'article-subtitle>-->(.*?)<!--', str(item)):
|
||||
author = re.findall(r'article-subtitle>-->(.*?)<!--', str(item))[0]
|
||||
else:
|
||||
author = ''
|
||||
return author
|
||||
|
||||
|
||||
# 爬取网页并建立数据库
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y-%m/%d')
|
||||
base_url = "http://news.hndaily.cn/html/" + date_now_s + '/'
|
||||
url = base_url + 'node_1.htm'
|
||||
# 进入首页
|
||||
try:
|
||||
response = requests.get(url, headers=headers)
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f'一级连接状态{response.status_code}')
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select('#pageLink'):
|
||||
banmianhao = item.text.split(":")[0].strip()
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
if banmianming == "广告":
|
||||
continue
|
||||
url1 = base_url + item.get('href')
|
||||
response2 = requests.get(url1, headers=headers)
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f'二级连接状态{response2.status_code}')
|
||||
if response2.status_code == 200:
|
||||
soup1 = BeautifulSoup(response2.text, "lxml")
|
||||
for item1 in soup1.select('#main-ed-articlenav-list tr td div a'):
|
||||
detail_url = base_url + item1.get('href')
|
||||
print(detail_url)
|
||||
title = item1.text.strip()
|
||||
response3 = requests.get(detail_url, headers=headers)
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f'三级连接状态:{response3.status_code}')
|
||||
if response3.status_code == 200:
|
||||
soup2 = BeautifulSoup(response3.text, "lxml")
|
||||
try:
|
||||
title = soup2.select('.font01')[0].text.strip()
|
||||
except IndexError:
|
||||
pass
|
||||
subtitle = parse_subtitle(soup2)
|
||||
h3title = parse_h3title(soup2)
|
||||
author = parse_author(soup2)
|
||||
content = parse_html_text(soup2)
|
||||
hainanribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'h3title': h3title,
|
||||
'author': author,
|
||||
'keywordlist': '',
|
||||
'detail_url': detail_url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
print(f"海南日报-{date_now_s}-{banmianhao}-{banmianming}-{title}已经完成")
|
||||
crawl_num += 1
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"海南日报-{date_now_s}-{banmianhao}-{banmianming}-已经完成")
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"海南日报-{date_now_s}-已经完成")
|
||||
except Exception as result:
|
||||
hainanribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'h3title': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"海南日报采集完毕,本次共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
133
地方政策/报刊/CrawlHenan.py
Normal file
133
地方政策/报刊/CrawlHenan.py
Normal file
@ -0,0 +1,133 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024-03-08 10:18:55
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlHenan.py
|
||||
# @Software : PyCharm
|
||||
# @Comment :采集河南日报数字报版面数据
|
||||
import datetime
|
||||
import random
|
||||
import time
|
||||
|
||||
import pymongo
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
start_date = datetime.datetime.strptime('2007-10-13', '%Y-%m-%d')
|
||||
"""采集开始时间"""
|
||||
end_date = datetime.datetime.today()
|
||||
"""采集结束时间"""
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
|
||||
"""自定义请求头"""
|
||||
# 创建数据库
|
||||
dbclient = pymongo.MongoClient('localhost', 27017)
|
||||
"""连接数据库"""
|
||||
mydb = dbclient.dfdm_sjribao
|
||||
henanribao = mydb.henanribao
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "henanribao" in collist: # 检测集合是否存在
|
||||
print("河南集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = henanribao.find_one(sort=[('release_time', -1)])['release_time']
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def getContent(soup: BeautifulSoup):
|
||||
content = ''
|
||||
for p in soup.select('#articleContent p'):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days): # gu:时间长度
|
||||
date_now = start_date + datetime.timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y-%m/%d')
|
||||
base_url = "http://newpaper.dahe.cn/hnrb/html/" + date_now_s + '/'
|
||||
url = base_url + 'node_1.htm'
|
||||
# http://newpaper.dahe.cn/hnrb/html/2024-03/08/node_1.htm
|
||||
print(url)
|
||||
try:
|
||||
response = requests.get(url, headers, timeout=60)
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f"一级链接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select(".layout-catalogue-item>a:nth-child(1)"):
|
||||
banmianhao = item.text.split(":")[0]
|
||||
banmianming = item.text.split(":")[-1]
|
||||
url1 = base_url + item.get("href")
|
||||
response2 = requests.get(url1, headers)
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f"二级链接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select(".news-item a"):
|
||||
title = item2.get("title", "").strip()
|
||||
url2 = base_url + item2.get("href")
|
||||
response3 = requests.get(url2, headers)
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
content = getContent(soup3)
|
||||
try:
|
||||
preTitle = soup3.select(".headline")[0].text.strip()
|
||||
except Exception as e:
|
||||
preTitle = ""
|
||||
try:
|
||||
subtitle = soup3.select(".subtitle")[0].test.strip()
|
||||
except Exception as e:
|
||||
subtitle = ""
|
||||
henanribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'preTitle': preTitle,
|
||||
'author': '',
|
||||
'keywordlist': '',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.datetime.today(),
|
||||
'content': content})
|
||||
crawl_num += 1
|
||||
print(f"河南日报-{date_now_s}-{banmianhao}-{title}---采集成功!")
|
||||
time.sleep(random.randint(5, 10))
|
||||
print(f"河南日报-{date_now_s}-{banmianhao}---采集成功!")
|
||||
print(f"河南日报-{date_now_s}---采集成功!")
|
||||
except Exception as result:
|
||||
henanribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"河南日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
140
地方政策/报刊/CrawlNingxia.py
Normal file
140
地方政策/报刊/CrawlNingxia.py
Normal file
@ -0,0 +1,140 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2022/12/29 13:48
|
||||
# @Author : Haochen Zhong
|
||||
# @File : CrawlNingxia.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集宁夏日报版面数据
|
||||
|
||||
import requests
|
||||
import time
|
||||
import pymongo
|
||||
import random
|
||||
from datetime import timedelta, datetime
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2022-01-31', '%Y-%m-%d')
|
||||
"""宁夏日报2022-02-01开始有数据"""
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
ningxiaribao = mydb.ningxiaribao
|
||||
# 设置随机时间
|
||||
sleeptime = random.randint(2, 10)
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "ningxiaribao" in collist: # 检测集合是否存在
|
||||
print("宁夏集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = ningxiaribao.find_one(sort=[('release_time', -1)])[
|
||||
'release_time'] # 或者find().sort('_id', -1).limit(1)
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新!')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def get_content(soup3):
|
||||
content = ""
|
||||
for p in soup3.select("#ozoom p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days): # gu:时间长度
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m/%d')
|
||||
base_url = "https://szb.nxrb.cn/nxrb/pc/layout/" + date_now_s + "/"
|
||||
url = base_url + "node_01.html"
|
||||
# https://szb.nxrb.cn/nxrb/pc/layout/202202/01/node_01.html
|
||||
try:
|
||||
response = requests.get(url=url, headers=headers, timeout=(30, 45))
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f"一级连接状态: {response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select(".nav-list .btn-block"):
|
||||
banmianhao = item.text.split(":")[0].strip()
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
url1 = base_url + item.get("href")
|
||||
response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select(".news-list .resultList a"):
|
||||
url_title = item2.text.strip()
|
||||
url2 = "https://szb.nxrb.cn/nxrb/pc/" + item2.get("href")[9:]
|
||||
print(url2)
|
||||
response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
try:
|
||||
pretitle = soup3.select("#PreTitle")[0].text.strip()
|
||||
except:
|
||||
pretitle = ""
|
||||
try:
|
||||
title = soup3.select("#Title")[0].text.strip()
|
||||
except:
|
||||
title = url_title
|
||||
try:
|
||||
subtitle = soup3.select("SubTitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
content = get_content(soup3)
|
||||
ningxiaribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'h3title': pretitle,
|
||||
'author': '',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
crawl_num += 1
|
||||
print(f"宁夏日报-{date_now_s}-{banmianhao}-{title}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"宁夏日报-{date_now_s}-{banmianhao}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"宁夏日报-{date_now_s}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
except Exception as result:
|
||||
ningxiaribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'h3title': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"宁夏日报采集完成,成功采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
137
地方政策/报刊/CrawlSiChuan.py
Normal file
137
地方政策/报刊/CrawlSiChuan.py
Normal file
@ -0,0 +1,137 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/01/09 14:15
|
||||
# @UpdateTime : 2024/01/09 16:30
|
||||
# @Author : Haochen Zhong
|
||||
# @File : CrawlSiChuan.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集四川日报数字报板面数据
|
||||
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from datetime import timedelta, datetime
|
||||
import time
|
||||
import pymongo
|
||||
import random
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
sichuanribao = mydb.sichuanribao
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "sichuanribao" in collist: # 检测集合是否存在
|
||||
print("四川日报集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = sichuanribao.find_one(sort=[('release_time', -1)])[
|
||||
'release_time'] # 或者find().sort('_id', -1).limit(1)
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def getContent(soup):
|
||||
content = ''
|
||||
for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(2) p'):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getSubtitle(soup):
|
||||
subtitle = ''
|
||||
if soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
|
||||
for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
subtitle += para
|
||||
subtitle += '\n'
|
||||
return subtitle
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m%d')
|
||||
base_url = "https://epaper.scdaily.cn/shtml/scrb/"
|
||||
url = base_url + date_now_s + '/index.shtml'
|
||||
try:
|
||||
response = requests.get(url, headers)
|
||||
print(f"一级链接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
response.encoding = response.apparent_encoding
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select("#main > div.main_r > ul:nth-child(2) > li:nth-child(2) a"):
|
||||
banmianhao = item.text.split(":")[0]
|
||||
banmianming = item.text.split(":")[-1]
|
||||
url1 = "https://epaper.scdaily.cn" + item.get("href")
|
||||
response2 = requests.get(url1, headers)
|
||||
print(f"二级链接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
response2.encoding = response2.apparent_encoding
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select("#main > div.main_r > ul:nth-child(3) > li:nth-child(2) a"):
|
||||
url2 = "https://epaper.scdaily.cn" + item2.get("href")
|
||||
title = item2.get("title")
|
||||
response3 = requests.get(url2, headers)
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
response3.encoding = response3.apparent_encoding
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
content = getContent(soup3)
|
||||
subtitle = getSubtitle(soup3)
|
||||
sichuanribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'h3title': '',
|
||||
'author': '',
|
||||
'keywordlist': '',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
print(f"四川日报--{date_now_s}-{banmianhao}-{title}----已完成")
|
||||
crawl_num += 1
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"四川日报--{date_now_s}-{banmianhao}----已完成")
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"四川日报--{date_now_s}-----已完成")
|
||||
time.sleep(random.randint(3, 10))
|
||||
except Exception as result:
|
||||
sichuanribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"四川日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
167
地方政策/报刊/CrawlXinminwanbao.py
Normal file
167
地方政策/报刊/CrawlXinminwanbao.py
Normal file
@ -0,0 +1,167 @@
# _*_ coding : UTF-8 _*_
# @Time : 2022/6/17 8:50
# @Author : Haochen Zhong
# @File : CrawlXinminwanbao.py
# @Software : PyCharm
# @Comment : 本程序用于抓取上海新民晚报数据
from bs4 import BeautifulSoup, Comment
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random

start_date = datetime.strptime('2018-12-31', '%Y-%m-%d')  # 抓取上海新民晚报从2019-01-01到至今的数据
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_qitaguanbao
shxinminwanbao = mydb.shxinminwanbao


# 设置随机时间


def main():
    # 判断数据库是否存在
    collist = mydb.list_collection_names()
    if "shxinminwanbao" in collist:  # 检测集合是否存在
        print("上海新民晚报集合存在,更新数据库")
        # 数据库最新一条内容的时间
        db_time = shxinminwanbao.find_one(sort=[('release_time', -1)])['release_time']
        print('数据库截止时间%s' % db_time)
        # 输入更新数据库时间
        input_time = datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('数据库无需更新')
    else:
        # 爬取网页并建立数据库
        print('数据库不存在,建立数据库!')
        getData(start_date, end_date)


def parse_html_text(soup2):
    img_list = soup2.select('.dzb-enter-desc-box p img')
    if img_list:
        img = '图片链接:\n'
        for i in img_list:
            img_url = 'https:' + i.get('src')
            img += img_url
            img += '\n'
        content = img + '正文内容:\n'
        for p in soup2.select('.dzb-enter-desc-box p'):
            para = p.text.split(' ')
            for x in para:
                if x != '' and x != '\\n\\n':
                    content += x.strip()
                    content += '\n'
    else:
        content = ''
        for p in soup2.select('.dzb-enter-desc-box p'):
            para = p.text.split(' ')
            for x in para:
                if x.strip() != '' and x != '\\n\\n':
                    content += x.strip()
                    content += '\n'
    return content


def getData(start_date, end_date):
    for i in range((end_date - start_date).days):
        date_now = start_date + timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y-%m-%d')
        base_url = "https://paper.xinmin.cn/html/xmwb/" + date_now_s + '/'
        url = base_url + '1.html'
        art_base_url = 'https://paper.xinmin.cn'
        # 进入首页
        try:
            try:
                response = requests.get(url=url, headers=headers, timeout=30)
            except:
                time.sleep(10)
                response = requests.get(url=url, headers=headers, timeout=30)
            response.encoding = response.apparent_encoding
            print('一级连接状态:%d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'lxml')
                response.close()
                # 提取所有版面信息
                for item in soup.select('.dzb-enter-mulu-wrap-nav a'):
                    url1 = art_base_url + item.get('href')
                    banmianhao = item.get('title').split(':')[0]
                    banmianming = item.get('title').split(':')[-1]
                    try:
                        response2 = requests.get(url=url1, headers=headers, timeout=30)
                    except:
                        time.sleep(10)
                        response2 = requests.get(url=url1, headers=headers, timeout=30)
                    response2.encoding = response2.apparent_encoding
                    print('二级连接状态:%d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, 'lxml')
                        response2.close()
                        for item1 in soup1.select('.dzb-enter-benban-wrap div a'):
                            url2 = art_base_url + item1.get('href')
                            try:
                                response3 = requests.get(url=url2, headers=headers, timeout=30)
                            except:
                                time.sleep(10)
                                response3 = requests.get(url=url2, headers=headers, timeout=30)
                            response3.encoding = response3.apparent_encoding
                            print('三级连接状态:%d' % response3.status_code)
                            if response3.status_code == 200:
                                soup2 = BeautifulSoup(response3.text, 'lxml')
                                response3.close()
                                title = soup2.select('.dzb-title-box')[0].text.strip()
                                pass_list = ['上海地区今明天气', '上海市今明天气预报', '广告']
                                if title in pass_list:  # 筛除每天上海今明天气和广告
                                    time.sleep(random.randint(2, 8))
                                    continue
                                subtitle = soup2.select('.dzb-sub-title-box')[0].text.strip()
                                # 查找所有注释
                                comments = soup2.find_all(string=lambda text: isinstance(text, Comment))
                                author = ""
                                # 遍历注释,找到包含作者的注释
                                for comment in comments:
                                    if 'dzb-author-box' in comment:
                                        # 使用 BeautifulSoup 解析注释内容
                                        author_soup = BeautifulSoup(comment, 'html.parser')
                                        author = author_soup.find('span', class_='dzb-author-box').text
                                pretitle = soup2.select('.dzb-special-title-box')[0].text.strip()
                                content = parse_html_text(soup2)
                                shxinminwanbao.insert_one({'banmianhao': banmianhao,
                                                           'banmianming': banmianming,
                                                           'pretitle': pretitle,
                                                           'title': title,
                                                           'subtitle': subtitle,
                                                           'author': author,
                                                           'keywordlist': '',
                                                           'detail_url': url2,
                                                           'release_time': date_now,
                                                           'insert_timestamp': datetime.today(),
                                                           'content': content})
                                print('上海新民晚报-%s-%s-%s-已完成' % (date_now_s, banmianhao, title))
                                time.sleep(random.randint(2, 8))
                        print('上海新民晚报-%s-%s-已完成' % (date_now_s, banmianhao))
                print("上海新民晚报-%s-已经完成" % date_now_s)
        except Exception as result:
            shxinminwanbao.insert_one({'banmianhao': 'empty',
                                       'banmianming': 'empty',
                                       'title': 'empty',
                                       'subtitle': 'empty',
                                       'h3title': 'empty',
                                       'author': 'empty',
                                       'keywordlist': 'empty',
                                       'detail_url': url,
                                       'release_time': date_now,
                                       'insert_timestamp': datetime.today(),
                                       'content': 'empty'})
            print(result)


if __name__ == '__main__':
    main()
    print("爬取完毕!")
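The author field on the Xinmin pages is hidden inside an HTML comment, which is why the script scans Comment nodes rather than regular tags. A minimal standalone sketch of that technique, using a made-up HTML fragment (the real pages differ), looks like this:

# Minimal sketch: extracting a value that lives inside an HTML comment.
# The HTML fragment below is invented for illustration only.
from bs4 import BeautifulSoup, Comment

html = '<div><!-- <span class="dzb-author-box">本报记者 张三</span> --></div>'
soup = BeautifulSoup(html, 'html.parser')

author = ""
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    if 'dzb-author-box' in comment:
        # Parse the comment body itself as HTML to reach the span inside it
        author_soup = BeautifulSoup(comment, 'html.parser')
        author = author_soup.find('span', class_='dzb-author-box').text

print(author)  # 本报记者 张三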
72
地方政策/政策/上海/CrawlShanghaiZhengce.py
Normal file
@ -0,0 +1,72 @@
import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36',
           'Connection': 'close'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shanghaizcwj = mydb.shanghaizcwj
base_url = "https://www.shanghai.gov.cn"


def getContent(soup: BeautifulSoup) -> str:
    """
    获取文章正文内容
    :param soup:
    :return:
    """
    content: str = ""
    for p in soup.select('#ivs_content p'):
        para: str = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def getData():
    pages = 28
    for i in range(1, pages + 1):
        if i == 1:
            url = "https://www.shanghai.gov.cn/xxzfgzwj/index.html"
        else:
            url = f"https://www.shanghai.gov.cn/xxzfgzwj/index_{i}.html"
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml")
            # print(response.text)
            trList = soup.select(".trout-region-list tbody tr")
            for item in trList:
                data = item.select("a")[0]
                title = data.get("title", "")
                url = base_url + data.get("href", "")
                print(url)
                if shanghaizcwj.find_one({"url": url}):
                    continue
                subtitle = data.select_one(".text-color").text.strip()
                response2 = requests.get(url=url, headers=headers)
                response2.encoding = response2.apparent_encoding
                print(response2.status_code)
                if response2.status_code == 200:
                    soup2 = BeautifulSoup(response2.text, "lxml")
                    content: str = getContent(soup=soup2)
                    shanghaizcwj.insert_one({
                        "title": title,
                        "subtitle": subtitle,
                        "content": content,
                        "url": url,
                    })
                    time.sleep(random.randint(3, 5))
                    print(title, "采集完成")


getData()
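The script above deduplicates by calling find_one on the document URL before inserting. If duplicate protection is also wanted at write time, one option is a unique index on the url field; this is a suggestion for illustration, not something the script currently does.

# Optional hardening (not in the original script): enforce URL uniqueness at the database level.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
shanghaizcwj = client.sjzf_zcwj.shanghaizcwj

# With a unique index, insert_one raises DuplicateKeyError instead of silently storing duplicates.
shanghaizcwj.create_index([("url", pymongo.ASCENDING)], unique=True)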
153
地方政策/政策/新疆/crawl/Crawlxjzfgz.py
Normal file
@ -0,0 +1,153 @@
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Crawlxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序采集新疆维吾尔自治区人民政府规章库
import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36',
           'Connection': 'close'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
xinjiangzcwj = mydb.xinjiangzcwj


def getContent(soup: BeautifulSoup) -> str:
    """
    获取文章正文内容
    :param soup:
    :return:
    """
    content: str = ""
    for p in soup.select('.gknbxq_detail p'):
        para: str = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def getData():
    """程序主函数"""
    count = 10000
    """设置单次获取文章数量,可以任意设置正整数"""
    dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
    """请求所有文章数据连接"""
    dataJson = {
        "websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
        "channelId": [
            "2aceb5d534434a9fb3550295b52a87e5"
        ],
        "domainMetaList": [
            {}
        ],
        "pageSize": f"{count}",
        "pageNum": 1,
        "title": None
    }
    """请求参数"""
    response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
    response.encoding = response.apparent_encoding
    print(f"一级链接状态:{response.status_code}")
    if response.status_code == 200:
        dataList = response.json()["results"]
        for item in dataList:
            try:
                url: str = item["websiteDomain"] + item["url"]
                """文章链接"""
                result = xinjiangzcwj.find_one({"url": url})
                if result:
                    continue
                typeOneName: str = item["channelName"]
                """文章归类"""
                title: str = item["title"]
                """文章标题"""
                subTitle: str = item["subTitle"]
                """文章副标题"""
                if item["publishedTime"]:
                    pubtime: float = datetime.datetime.strptime(item["publishedTime"], "%Y-%m-%d").timestamp()
                    """发布日期"""
                else:
                    pubtime: float = 0
                    """发布日期"""
                puborg: str = item["domainMetaList"]["xxgkml"]["resultList"]["fwjg2"]["cnName"]
                """发文机关(自治区)"""
                articleType: str = item["domainMetaList"]["xxgkml"]["resultList"]["gwzl2"]["cnName"]
                """公文种类"""
                if item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"]:
                    ptime: float = datetime.datetime.strptime(
                        item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"],
                        "%Y-%m-%d").timestamp()
                    """成文日期"""
                else:
                    ptime: float = 0
                    """成文日期"""
                index: str = item["domainMetaList"]["xxgkml"]["resultList"]["syh2"]["cnName"]
                """索引号"""
                pcode: str = item["domainMetaList"]["xxgkml"]["resultList"]["wenh2"]["cnName"]
                """文号"""
                effectiveness: str = item["domainMetaList"]["xxgkml"]["resultList"]["yxx01"]["cnName"]
                """有效性"""
                typeSecondName: str = item["domainMetaList"]["xxgkml"]["resultList"]["wz2"]["cnName"]
                """文种(自治区)"""
                year: str = item["domainMetaList"]["xxgkml"]["resultList"]["nianf2"]["cnName"]
                """年份"""
                childtype: str = item["domainMetaList"]["xxgkml"]["resultList"]["ztfl2"]["cnName"]
                """主题分类"""
                author: str = item["domainMetaList"]["默认元数据集"]["resultList"]["author"]["cnName"]
                """作者"""
                source: str = item["domainMetaList"]["默认元数据集"]["resultList"]["source"]["cnName"]
                """来源"""
                if item["manuscriptRelatedRes"]:
                    manuscriptRelatedRes: str = item["websiteDomain"] + item["manuscriptRelatedRes"]
                    """附件链接"""
                else:
                    manuscriptRelatedRes: str = ""
                    """附件链接"""
                response = requests.get(url=url, headers=headers, timeout=60)
                response.encoding = response.apparent_encoding
                print(f"二级链接状态:{response.status_code}")
                if response.status_code == 200:
                    soup: BeautifulSoup = BeautifulSoup(response.text, "html.parser")
                    content: str = getContent(soup=soup)
                    xinjiangzcwj.insert_one(
                        {
                            'typeOneName': typeOneName,
                            'typeSecondName': typeSecondName,
                            'articleType': articleType,
                            "title": title,
                            "subTitle": subTitle,
                            "childtype": childtype,
                            "index": index,
                            "pcode": pcode,
                            "puborg": puborg,
                            "ptime": ptime,
                            "pubtime": pubtime,
                            "effectiveness": effectiveness,
                            "author": author,
                            "year": year,
                            "manuscriptRelatedRes": manuscriptRelatedRes,
                            "url": url,
                            "source": source,
                            "content": content
                        }
                    )
                    print(f"{typeOneName}--{typeSecondName}--{title}-已完成")
                    time.sleep(random.randint(3, 8))
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    getData()
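Every metadata field above is read with chained bracket lookups, so a single missing key sends the whole article into the except branch. A hedged alternative (a hypothetical helper, not used by the script) is a small getter that walks the nested dictionaries and falls back to a default:

# Illustrative helper (hypothetical, not part of Crawlxjzfgz.py): tolerant lookup of nested metadata.
def get_meta(item, *path, default=""):
    """Walk nested dicts along `path`, returning `default` as soon as a key is missing."""
    current = item
    for key in path:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


# Example with the same key path the crawler uses for the issuing organ (sample data invented here):
item = {"domainMetaList": {"xxgkml": {"resultList": {"fwjg2": {"cnName": "自治区人民政府"}}}}}
print(get_meta(item, "domainMetaList", "xxgkml", "resultList", "fwjg2", "cnName"))  # 自治区人民政府
print(get_meta(item, "domainMetaList", "xxgkml", "resultList", "wenh2", "cnName"))  # "" (missing key)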
185
地方政策/政策/新疆/export/Exportxjzfgz.py
Normal file
@ -0,0 +1,185 @@
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/28 0:50
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序用于导出新疆维吾尔自治区人民政府规章

import datetime
import os
import time

import pandas as pd
import pymongo
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm

client = pymongo.MongoClient('localhost', 27017)
"""与mongoDB数据库建立连接"""
mydb = client.sjzf_zcwj
"""政策文件存放在数据库的一级目录对象"""
xinjiangzcwj = mydb.xinjiangzcwj
"""政策文件存放对象"""

savePath = ""
"""导出文件存放路径"""


def replace_invalid_chars(text):
    """
    替换Windows系统和Linux系统文件路径禁止字符,统一转换成HTML实体编码
    """
    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
    """Windows系统和Linux系统文件路径禁止字符列表"""
    replace_char = ['&lt;', '&gt;', '&#58;', '&quot;', '&#47;', '&#92;', '&#124;', '&#63;', '&#42;']
    """Windows系统和Linux系统文件路径禁止字符替换列表 统一转换成HTML实体编码"""

    for i, char in enumerate(invalid_chars):
        text = text.replace(char, replace_char[i])
    return text


def analysisTime(timestamp: int) -> str:
    """
    处理时间,将1970-01-01之前的时间戳正确转换
    """
    if timestamp == 0:
        return "未知"
    if timestamp < 0:
        # 计算从 1970-01-01 开始的时间间隔
        delta = datetime.timedelta(seconds=abs(timestamp))
        date = datetime.datetime(1970, 1, 1) - delta
    else:
        date = datetime.datetime.fromtimestamp(timestamp)
    # 格式化为字符串
    return date.strftime('%Y-%m-%d')


def saveFile():
    num = 0
    startTime = time.time()
    global savePath
    query = {
        'typeOneName': "",
        'typeSecondName': "",
        'articleType': "",
        "title": "",
        "subTitle": "",
        "childtype": "",
        "index": "",
        "pcode": "",
        "puborg": "",
        "ptime": "",
        "pubtime": "",
        "effectiveness": "",
        "author": "",
        "year": "",
        "manuscriptRelatedRes": "",
        "url": "",
        "source": "",
        "content": ""
    }
    query = {f'{k}': v for k, v in query.items() if v}
    """需要过滤的文章,默认不过滤"""
    dataList = list(xinjiangzcwj.find(query))
    if not savePath:
        savePath = input("请输入数据存放路径:")
    totalPath = os.path.join(savePath, "数据统计表.csv")
    for data in dataList:
        try:
            typeOneName = data["typeOneName"]
            """一级分类目录"""
            typeSecondName = data["typeSecondName"]
            """二级分类目录"""
            articleType = data["articleType"]
            """四级分类目录"""
            # 创建目录
            output_directory = os.path.join(savePath, typeOneName, typeSecondName)
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)
            doc = Document()
            firstLine = doc.add_paragraph()
            firstLineText = f"索引号:{data['index']}\t\t有效性:{data['effectiveness']}"
            firstLine_run = firstLine.add_run(firstLineText)
            firstLine_run.font.size = Pt(12)
            firstLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            firstLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            secondLine = doc.add_paragraph()
            secondLineText = f"发文机关:{data['puborg']}\t\t发文字号:{data['pcode']}"
            secondLine_run = secondLine.add_run(secondLineText)
            secondLine_run.font.size = Pt(12)
            secondLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            secondLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            thirdLine = doc.add_paragraph()
            thirdLineText = f"标题:{data['title']}"
            thirdLine_run = thirdLine.add_run(thirdLineText)
            thirdLine_run.font.size = Pt(12)
            thirdLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            thirdLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            fourLine = doc.add_paragraph()
            pubtime = analysisTime(data['pubtime'])
            ptime = analysisTime(data['ptime'])
            fourLineText = f"成文日期:{ptime}\t\t发布日期:{pubtime}"
            fourLine_run = fourLine.add_run(fourLineText)
            fourLine_run.font.size = Pt(12)
            fourLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            fourLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            urlLine = doc.add_paragraph()
            urlLineText = f"文章链接:{data['url']}"
            urlLine_run = urlLine.add_run(urlLineText)
            urlLine_run.font.size = Pt(12)
            urlLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            urlLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            title = doc.add_paragraph()
            title_run = title.add_run(data["title"])
            title_run.bold = True
            title_run.font.size = Pt(22)
            title_run.font.name = 'Times New Roman'  # 设置标题西文字体
            title_run.element.rPr.rFonts.set(qn('w:eastAsia'), "华文中宋")
            title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER  # 设置大标题居中对齐

            for section in data["content"].split("\n"):
                paragraph = doc.add_paragraph()
                run = paragraph.add_run("\t" + section)
                run.font.size = Pt(16)
                run.font.name = "Times New Roman"
                run.element.rPr.rFonts.set(qn('w:eastAsia'), "仿宋")
                paragraph.paragraph_format.first_line_indent = Cm(0.74)  # 设置首行缩进

            if data["manuscriptRelatedRes"]:
                urlLine = doc.add_paragraph()
                urlLineText = f"附件链接:{data['manuscriptRelatedRes']}"
                urlLine_run = urlLine.add_run(urlLineText)
                urlLine_run.font.size = Pt(12)
                urlLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
                urlLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
            if len(data["title"]) > 45:
                title_ = data["title"][len(data["title"]) - 30:]
            else:
                title_ = data["title"]
            fileName = f"{replace_invalid_chars(title_)}.docx"
            filePath = os.path.join(output_directory, fileName)
            doc.save(filePath)
            num += 1
            print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功!")
        except Exception as e:
            print(e)
            continue
    csvData = pd.DataFrame(dataList)
    csvData.columns = ["数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题", "文章副标题", "主题分类",
                       "索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者", "年份",
                       "附件链接",
                       "文章链接", "来源", "正文内容"]
    csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
    print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")


if __name__ == '__main__':
    saveFile()
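A quick sanity check of analysisTime, with illustrative values only; the import assumes the script above is importable as Exportxjzfgz from the current directory, which is an assumption about how it is deployed.

# Illustrative usage of analysisTime (module path is an assumption).
from Exportxjzfgz import analysisTime

print(analysisTime(0))           # "未知" for missing dates
print(analysisTime(-86400))      # one day before 1970-01-01 -> "1969-12-31"
print(analysisTime(1693180800))  # positive timestamps go through fromtimestamp(); exact date depends on the local timezone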
62
地方政策/政策/深圳/CrawlShenZhen.py
Normal file
@ -0,0 +1,62 @@
import asyncio
import datetime
import random
import time

import pymongo
import requests
from httpx import AsyncClient

# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36', }
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shenzhenzcwj = mydb.shenzhenzcwj

yearList = ['158104', '148604', '141910', '125615', '103604', '101620', '101621', '101622', '101623', '101624',
            '101625', '101626', '101627', '101628', '101629', '101630', '101631', '101632', '101633', '101634',
            '101635', '101636', '101637', '101638', '146351', '146338', '146325', '146311', '146298', '146285',
            '146272', '146205', '146190', '145973', '145972', '145970']


def update_json_data(original_data, new_data):
    # 遍历新数据的键值对
    for key, value in new_data.items():
        # 如果新数据的值不为 None 或者空字符串,更新原数据
        if value is not None and value != "":
            original_data[key] = value
    return original_data


async def getData():
    async with AsyncClient(headers=headers, timeout=60, verify=False) as client:
        for i in yearList:
            url = f"http://www.sz.gov.cn/postmeta/i/{i}.json"
            print(url)
            response = await client.get(url=url)
            response.encoding = response.charset_encoding
            print(response.status_code)
            if response.status_code == 200:
                for item in response.json()["children"]:
                    url2 = f"http://www.sz.gov.cn/postmeta/i/{item['id']}.json"
                    print(url2)
                    response2 = await client.get(url=url2)
                    response2.encoding = response2.charset_encoding
                    print(response2.status_code)
                    if response2.status_code == 200:
                        for item2 in response2.json()["articles"][1:]:
                            if shenzhenzcwj.find_one({"id": item2["id"]}):
                                continue
                            url3 = f"http://www.sz.gov.cn/postmeta/p/{item2['id'] // 1000000}/{item2['id'] // 1000}/{item2['id']}.json"
                            response3 = await client.get(url=url3)
                            response3.encoding = response3.charset_encoding
                            print(response3.status_code)
                            if response3.status_code == 200:
                                data = response3.json()
                                newData = update_json_data(item2, data)
                                shenzhenzcwj.insert_one(newData)
                                print(newData["title"], "采集完成")
                                await asyncio.sleep(random.randint(2, 3))


asyncio.run(getData())
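The article JSON above lives at a path derived from the numeric article id: id // 1000000 and id // 1000 become directory segments before the id itself. A small sketch of that mapping, with a made-up id value for illustration:

# Sketch of the postmeta URL scheme used above; the id is invented for illustration.
def postmeta_url(article_id: int) -> str:
    return f"http://www.sz.gov.cn/postmeta/p/{article_id // 1000000}/{article_id // 1000}/{article_id}.json"


print(postmeta_url(10203040))  # http://www.sz.gov.cn/postmeta/p/10/10203/10203040.json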