154 lines
6.2 KiB
Python
Raw Normal View History

2024-11-09 17:00:30 +08:00
# -*- coding: utf-8 -*-
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
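# @Comment : This program collects the government regulations database of the People's Government of the Xinjiang Uygur Autonomous Region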
import datetime
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
# Request headers that imitate a desktop browser visit.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/51.0.2704.63 Safari/537.36'
    ),
    'Connection': 'close',
}

# MongoDB handles: local server -> database "sjzf_zcwj" -> collection "xinjiangzcwj".
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client["sjzf_zcwj"]
xinjiangzcwj = mydb["xinjiangzcwj"]
def getContent(soup: BeautifulSoup) -> str:
    """
    Extract the body text of an article page.

    :param soup: parsed article page
    :return: the stripped text of every non-empty ``.gknbxq_detail p`` element,
        each one followed by a newline; an empty string when there is none
    """
    stripped = (node.text.strip() for node in soup.select('.gknbxq_detail p'))
    # Blank paragraphs contribute nothing, not even a newline.
    return "".join(f"{text}\n" for text in stripped if text)
def _metaName(item: dict, group: str, field: str) -> str:
    """
    Look up the ``cnName`` of one metadata field of an article item.

    :param item: one entry of the ``results`` list returned by the listing interface
    :param group: key under ``domainMetaList`` (for example ``"xxgkml"``)
    :param field: key under that group's ``resultList``
    :return: the field's ``cnName`` value
    :raises KeyError: when the group or the field is missing from the item
    """
    return item["domainMetaList"][group]["resultList"][field]["cnName"]


def _toTimestamp(dateText):
    """
    Convert a ``%Y-%m-%d`` date string into a timestamp.

    :param dateText: date string, or a falsy value when no date is given
    :return: ``timestamp()`` of the parsed naive datetime, or ``0`` when
        *dateText* is falsy
    :raises ValueError: when a non-empty *dateText* does not match ``%Y-%m-%d``
    """
    if not dateText:
        return 0
    return datetime.datetime.strptime(dateText, "%Y-%m-%d").timestamp()


def _saveArticle(item: dict) -> None:
    """
    Download one article and insert it into the ``xinjiangzcwj`` collection.

    Nothing is stored when a document with the same ``url`` already exists or
    when the article page does not answer with HTTP 200. All metadata is read
    before the page is requested, so an item with a missing field fails
    without any download.

    :param item: one entry of the ``results`` list returned by the listing interface
    :raises Exception: lookup, parsing, network and database errors are not
        handled here; they propagate to the caller
    """
    # Article link; also used as the de-duplication key.
    url: str = item["websiteDomain"] + item["url"]
    if xinjiangzcwj.find_one({"url": url}):
        return

    typeOneName: str = item["channelName"]  # article category
    title: str = item["title"]  # article title
    subTitle: str = item["subTitle"]  # article subtitle
    pubtime = _toTimestamp(item["publishedTime"])  # publication date
    puborg: str = _metaName(item, "xxgkml", "fwjg2")  # issuing authority (autonomous region)
    articleType: str = _metaName(item, "xxgkml", "gwzl2")  # kind of official document
    ptime = _toTimestamp(_metaName(item, "xxgkml", "cwrq2"))  # date the document was written
    index: str = _metaName(item, "xxgkml", "syh2")  # index number
    pcode: str = _metaName(item, "xxgkml", "wenh2")  # document number
    effectiveness: str = _metaName(item, "xxgkml", "yxx01")  # validity
    typeSecondName: str = _metaName(item, "xxgkml", "wz2")  # document genre (autonomous region)
    year: str = _metaName(item, "xxgkml", "nianf2")  # year
    childtype: str = _metaName(item, "xxgkml", "ztfl2")  # subject classification
    author: str = _metaName(item, "默认元数据集", "author")  # author
    source: str = _metaName(item, "默认元数据集", "source")  # source
    # Attachment link; empty string when the item has no attachment.
    if item["manuscriptRelatedRes"]:
        manuscriptRelatedRes: str = item["websiteDomain"] + item["manuscriptRelatedRes"]
    else:
        manuscriptRelatedRes: str = ""

    page = requests.get(url=url, headers=headers, timeout=60)
    page.encoding = page.apparent_encoding
    print(f"二级链接状态:{page.status_code}")
    if page.status_code != 200:
        return

    soup: BeautifulSoup = BeautifulSoup(page.text, "html.parser")
    content: str = getContent(soup=soup)
    xinjiangzcwj.insert_one(
        {
            'typeOneName': typeOneName,
            'typeSecondName': typeSecondName,
            'articleType': articleType,
            "title": title,
            "subTitle": subTitle,
            "childtype": childtype,
            "index": index,
            "pcode": pcode,
            "puborg": puborg,
            "ptime": ptime,
            "pubtime": pubtime,
            "effectiveness": effectiveness,
            "author": author,
            "year": year,
            "manuscriptRelatedRes": manuscriptRelatedRes,
            "url": url,
            "source": source,
            "content": content
        }
    )
    print(f"{typeOneName}--{typeSecondName}--{title}-已完成")
    # Random 3-8 second pause after each stored article.
    time.sleep(random.randint(3, 8))


def getData():
    """
    Main routine of the program.

    Posts one listing request to the site's CMS interface, then walks the
    returned ``results`` and stores every article that is not yet in the
    ``xinjiangzcwj`` collection. A failure on one article is printed and the
    loop goes on with the next one. Nothing is done when the listing request
    does not answer with HTTP 200.
    """
    # Number of articles requested in a single call; any positive integer works.
    count = 10000
    # Endpoint that returns the data of all articles.
    dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
    # Request parameters.
    dataJson = {
        "websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
        "channelId": [
            "2aceb5d534434a9fb3550295b52a87e5"
        ],
        "domainMetaList": [
            {}
        ],
        "pageSize": f"{count}",
        "pageNum": 1,
        "title": None
    }
    response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
    response.encoding = response.apparent_encoding
    print(f"一级链接状态:{response.status_code}")
    if response.status_code != 200:
        return

    for item in response.json()["results"]:
        try:
            _saveArticle(item)
        except Exception as e:
            # Best-effort crawl: report the failure and carry on with the next article.
            print(e)
if __name__ == '__main__':
getData()