154 lines
6.2 KiB
Python
154 lines
6.2 KiB
Python
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : This program scrapes the regulations database of the People's Government of the Xinjiang Uygur Autonomous Region
|
|
import datetime
|
|
import random
|
|
import time
|
|
|
|
import pymongo
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Request headers that imitate a desktop browser; sent with every HTTP call.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.63 Safari/537.36"
    ),
    "Connection": "close",
}
|
|
# Database handles: MongoDB client on localhost:27017, the ``sjzf_zcwj``
# database, and the ``xinjiangzcwj`` collection that receives the articles.
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client["sjzf_zcwj"]
xinjiangzcwj = mydb["xinjiangzcwj"]
|
|
|
|
|
|
def getContent(soup: "BeautifulSoup") -> str:
    """
    Extract the article body text from a parsed detail page.

    Every element matched by the CSS selector ``.gknbxq_detail p`` contributes
    its whitespace-stripped text; paragraphs that are empty after stripping
    are skipped.

    :param soup: parsed HTML document of the article page
    :return: the non-empty paragraphs, each terminated by a newline, or an
        empty string when no paragraph has text
    """
    paragraphs = (p.text.strip() for p in soup.select('.gknbxq_detail p'))
    # Assemble the result in one pass instead of growing a string with ``+=``.
    return "".join(f"{text}\n" for text in paragraphs if text)
|
|
|
|
|
|
def _parseDate(value) -> float:
    """
    Convert a ``YYYY-MM-DD`` date string into a POSIX timestamp.

    :param value: date string taken from the listing entry; may be empty or ``None``
    :return: timestamp of that date (the naive datetime is interpreted in the
        local time zone), or ``0`` when *value* is falsy
    :raises ValueError: if *value* is non-empty but not in ``YYYY-MM-DD`` form
    """
    if not value:
        return 0
    return datetime.datetime.strptime(value, "%Y-%m-%d").timestamp()


def _metaValue(item: dict, group: str, field: str):
    """
    Read ``item["domainMetaList"][group]["resultList"][field]["cnName"]``.

    :raises KeyError: if any level of that path is missing
    """
    return item["domainMetaList"][group]["resultList"][field]["cnName"]


def _buildRecord(item: dict, url: str) -> dict:
    """
    Assemble the document stored for one listing entry, without the body text.

    :param item: one element of the listing endpoint's ``results`` array
    :param url: absolute article URL already computed by the caller
    :return: dict holding every stored field except ``content``, in storage order
    :raises KeyError: if an expected key is absent from *item*
    """
    attachment = item["manuscriptRelatedRes"]
    return {
        'typeOneName': item["channelName"],                               # article category
        'typeSecondName': _metaValue(item, "xxgkml", "wz2"),              # document genre (autonomous region)
        'articleType': _metaValue(item, "xxgkml", "gwzl2"),               # kind of official document
        "title": item["title"],                                           # article title
        "subTitle": item["subTitle"],                                     # article subtitle
        "childtype": _metaValue(item, "xxgkml", "ztfl2"),                 # subject classification
        "index": _metaValue(item, "xxgkml", "syh2"),                      # index number
        "pcode": _metaValue(item, "xxgkml", "wenh2"),                     # document number
        "puborg": _metaValue(item, "xxgkml", "fwjg2"),                    # issuing authority (autonomous region)
        "ptime": _parseDate(_metaValue(item, "xxgkml", "cwrq2")),         # date the document was written
        "pubtime": _parseDate(item["publishedTime"]),                     # publication date
        "effectiveness": _metaValue(item, "xxgkml", "yxx01"),             # validity status
        "author": _metaValue(item, "默认元数据集", "author"),               # author
        "year": _metaValue(item, "xxgkml", "nianf2"),                     # year
        # Attachment link, made absolute; empty string when there is none.
        "manuscriptRelatedRes": item["websiteDomain"] + attachment if attachment else "",
        "url": url,                                                       # article link
        "source": _metaValue(item, "默认元数据集", "source"),               # source
    }


def getData(count: int = 10000):
    """
    Program entry point: scrape the Xinjiang government regulations listing
    into the ``xinjiangzcwj`` MongoDB collection.

    One POST request fetches up to *count* listing entries. For every entry
    whose URL is not yet stored, the metadata is read from the listing, the
    article page is downloaded, its body text is extracted with
    ``getContent`` and the combined document is inserted. A failure on one
    entry is printed and the loop moves on to the next entry.

    :param count: number of articles requested from the listing endpoint in
        the single call; any positive integer
    :return: None
    """
    # Endpoint that returns the data of all articles.
    dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
    # Request parameters.
    dataJson = {
        "websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
        "channelId": [
            "2aceb5d534434a9fb3550295b52a87e5"
        ],
        "domainMetaList": [
            {}
        ],
        "pageSize": f"{count}",
        "pageNum": 1,
        "title": None
    }
    response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
    response.encoding = response.apparent_encoding
    print(f"一级链接状态:{response.status_code}")
    if response.status_code != 200:
        return

    for item in response.json()["results"]:
        try:
            url: str = item["websiteDomain"] + item["url"]
            # Skip articles that were stored by an earlier run.
            if xinjiangzcwj.find_one({"url": url}):
                continue
            # Read the metadata first so a malformed entry costs no page request.
            record = _buildRecord(item, url)

            detail = requests.get(url=url, headers=headers, timeout=60)
            detail.encoding = detail.apparent_encoding
            print(f"二级链接状态:{detail.status_code}")
            if detail.status_code == 200:
                soup: BeautifulSoup = BeautifulSoup(detail.text, "html.parser")
                record["content"] = getContent(soup=soup)
                xinjiangzcwj.insert_one(record)
                print(f"{record['typeOneName']}--{record['typeSecondName']}--{record['title']}-已完成")
            # Pause a random 3-8 seconds after each page request.
            time.sleep(random.randint(3, 8))
        except Exception as e:
            # Best-effort crawl: report the problem and go on to the next entry.
            print(e)
|
|
|
|
|
|
# Start the scraper only when this file is executed directly as a script.
if __name__ == "__main__":
    getData()
|