# _*_ coding : UTF-8 _*_
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序采集新疆维吾尔自治区人民政府规章库
import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# Browser-like headers so the government site serves normal pages.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36', 'Connection': 'close'}

# MongoDB collection: used both for URL-based de-duplication and for
# storing the scraped documents.
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
xinjiangzcwj = mydb.xinjiangzcwj


def getContent(soup: BeautifulSoup) -> str:
    """Return the article body text from a parsed detail page.

    Every non-empty paragraph under ``.gknbxq_detail p`` is kept and
    terminated with a newline, so a non-empty result ends with a newline —
    identical to the original concatenation loop (this exact string is
    what gets stored in MongoDB).

    :param soup: parsed HTML of an article detail page
    :return: newline-terminated paragraphs joined together, or "" when
        no matching paragraphs exist
    """
    paragraphs = (p.text.strip() for p in soup.select('.gknbxq_detail p'))
    # str.join instead of repeated += — linear, not quadratic.
    return "".join(f"{para}\n" for para in paragraphs if para)


def getData(count: int = 10000) -> None:
    """Crawl the Xinjiang regulation library into MongoDB.

    Posts a single listing query (``pageSize=count``), then for each
    listed article fetches the detail page, extracts the body text and
    inserts one document. Articles whose URL is already in the
    collection are skipped. Per-item failures (missing metadata keys,
    request errors, unexpected date strings) are printed and the item
    is skipped, so one bad record cannot abort the whole run.

    :param count: number of articles requested in the one listing call
        (previously a hard-coded constant; any positive int works)
    """
    dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
    # Request parameters for the listing endpoint. Note: the API takes
    # domainMetaList as a list in the request but returns it as a dict.
    dataJson = {
        "websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
        "channelId": [
            "2aceb5d534434a9fb3550295b52a87e5"
        ],
        "domainMetaList": [
            {}
        ],
        "pageSize": f"{count}",
        "pageNum": 1,
        "title": None
    }
    response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
    response.encoding = response.apparent_encoding
    print(f"一级链接状态:{response.status_code}")
    if response.status_code != 200:
        return
    dataList = response.json()["results"]
    for item in dataList:
        try:
            # Article URL — also the de-duplication key in MongoDB.
            url: str = item["websiteDomain"] + item["url"]
            if xinjiangzcwj.find_one({"url": url}):
                continue
            typeOneName: str = item["channelName"]  # category (channel name)
            title: str = item["title"]              # article title
            subTitle: str = item["subTitle"]        # article subtitle
            if item["publishedTime"]:
                # NOTE(review): assumes "YYYY-MM-DD"; a value carrying a
                # time part would raise here and skip the item — confirm
                # the API's date format.
                pubtime: float = datetime.datetime.strptime(
                    item["publishedTime"], "%Y-%m-%d").timestamp()
            else:
                pubtime = 0  # publish date unknown
            # Hoist the deeply nested metadata dict looked up repeatedly.
            meta = item["domainMetaList"]["xxgkml"]["resultList"]
            puborg: str = meta["fwjg2"]["cnName"]       # issuing agency
            articleType: str = meta["gwzl2"]["cnName"]  # document kind
            if meta["cwrq2"]["cnName"]:
                # Date the document was written ("成文日期").
                ptime: float = datetime.datetime.strptime(
                    meta["cwrq2"]["cnName"], "%Y-%m-%d").timestamp()
            else:
                ptime = 0
            index: str = meta["syh2"]["cnName"]           # index number
            pcode: str = meta["wenh2"]["cnName"]          # document number
            effectiveness: str = meta["yxx01"]["cnName"]  # validity status
            typeSecondName: str = meta["wz2"]["cnName"]   # document type
            year: str = meta["nianf2"]["cnName"]          # year
            childtype: str = meta["ztfl2"]["cnName"]      # subject class
            defaultMeta = item["domainMetaList"]["默认元数据集"]["resultList"]
            author: str = defaultMeta["author"]["cnName"]
            source: str = defaultMeta["source"]["cnName"]
            if item["manuscriptRelatedRes"]:
                # Attachment link, made absolute with the site domain.
                manuscriptRelatedRes: str = item["websiteDomain"] + item["manuscriptRelatedRes"]
            else:
                manuscriptRelatedRes = ""
            # Distinct name so the listing `response` above is not clobbered.
            detailResponse = requests.get(url=url, headers=headers, timeout=60)
            detailResponse.encoding = detailResponse.apparent_encoding
            print(f"二级链接状态:{detailResponse.status_code}")
            if detailResponse.status_code == 200:
                soup = BeautifulSoup(detailResponse.text, "html.parser")
                content: str = getContent(soup=soup)
                xinjiangzcwj.insert_one(
                    {
                        'typeOneName': typeOneName,
                        'typeSecondName': typeSecondName,
                        'articleType': articleType,
                        "title": title,
                        "subTitle": subTitle,
                        "childtype": childtype,
                        "index": index,
                        "pcode": pcode,
                        "puborg": puborg,
                        "ptime": ptime,
                        "pubtime": pubtime,
                        "effectiveness": effectiveness,
                        "author": author,
                        "year": year,
                        "manuscriptRelatedRes": manuscriptRelatedRes,
                        "url": url,
                        "source": source,
                        "content": content
                    }
                )
                print(f"{typeOneName}--{typeSecondName}--{title}-已完成")
                # Polite random delay between article fetches.
                time.sleep(random.randint(3, 8))
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(e)
            continue


if __name__ == '__main__':
    getData()