# guoneimeitishujucaiji/地方政策/政策/新疆/crawl/Crawlxjzfgz.py

# _*_ coding : UTF-8 _*_
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Crawlxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序采集新疆维吾尔自治区人民政府规章库
import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# Request headers that make the crawler look like a desktop browser visit.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.63 Safari/537.36"
    ),
    "Connection": "close",
}
# MongoDB handles: local server -> database `sjzf_zcwj` -> collection `xinjiangzcwj`.
client = pymongo.MongoClient(host="localhost", port=27017)
mydb = client["sjzf_zcwj"]
xinjiangzcwj = mydb["xinjiangzcwj"]


def getContent(soup: BeautifulSoup) -> str:
    """
    Extract the article body text from a parsed article page.

    Takes the text of every element matched by the CSS selector
    ``.gknbxq_detail p``, strips surrounding whitespace, drops paragraphs
    that are empty after stripping, and ends each kept paragraph with a
    newline character.

    :param soup: parsed HTML document of the article page
    :return: the kept paragraphs concatenated in document order, each one
             followed by a newline; an empty string when nothing is kept
    """
    stripped = (p.text.strip() for p in soup.select('.gknbxq_detail p'))
    # A single join avoids rebuilding the string on every paragraph, which
    # the previous repeated `+=` did.
    return "".join(f"{para}\n" for para in stripped if para)


def _toTimestamp(dateText):
    """
    Convert a ``YYYY-MM-DD`` date string into a POSIX timestamp.

    :param dateText: date string in ``%Y-%m-%d`` format; may be empty or None
    :return: timestamp of that date (the naive datetime is interpreted in
             the local time zone), or the integer 0 when ``dateText`` is falsy
    :raises ValueError: if ``dateText`` is non-empty but not ``%Y-%m-%d``
    """
    if not dateText:
        # Integer 0 (not 0.0) is the "no date" marker, so stored documents
        # keep the same value type as before.
        return 0
    return datetime.datetime.strptime(dateText, "%Y-%m-%d").timestamp()


def getData():
    """
    Crawl the rules listing and store every article not yet in MongoDB.

    Sends one POST to the CMS listing interface. For each returned item whose
    ``url`` is not already present in the ``xinjiangzcwj`` collection, the
    item's metadata is read from the listing entry, the article page is
    downloaded, its body is extracted with ``getContent`` and one document is
    inserted. Any exception raised while handling a single item is printed
    and that item is skipped; exceptions from the listing request itself are
    not caught here.

    :return: None
    """
    # Number of articles requested in the single listing call; any positive
    # integer may be used.
    count = 10000
    # Endpoint that returns the data of all articles.
    dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
    # Request parameters.
    dataJson = {
        "websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
        "channelId": [
            "2aceb5d534434a9fb3550295b52a87e5"
        ],
        "domainMetaList": [
            {}
        ],
        "pageSize": f"{count}",
        "pageNum": 1,
        "title": None
    }
    listResponse = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
    listResponse.encoding = listResponse.apparent_encoding
    print(f"一级链接状态：{listResponse.status_code}")
    if listResponse.status_code != 200:
        return
    # `or []` guards against a null "results" field, which would otherwise
    # abort with a TypeError when iterated.
    dataList = listResponse.json()["results"] or []
    for item in dataList:
        try:
            # Article link; also used as the de-duplication key.
            url: str = item["websiteDomain"] + item["url"]
            if xinjiangzcwj.find_one({"url": url}):
                continue
            # Both metadata groups are resolved once instead of re-walking
            # the nested path for every field.
            xxgkml = item["domainMetaList"]["xxgkml"]["resultList"]
            defaultMeta = item["domainMetaList"]["默认元数据集"]["resultList"]
            attachment = item["manuscriptRelatedRes"]
            document = {
                'typeOneName': item["channelName"],  # article category
                'typeSecondName': xxgkml["wz2"]["cnName"],  # document type (autonomous region)
                'articleType': xxgkml["gwzl2"]["cnName"],  # kind of official document
                "title": item["title"],  # article title
                "subTitle": item["subTitle"],  # article subtitle
                "childtype": xxgkml["ztfl2"]["cnName"],  # subject classification
                "index": xxgkml["syh2"]["cnName"],  # index number
                "pcode": xxgkml["wenh2"]["cnName"],  # document number
                "puborg": xxgkml["fwjg2"]["cnName"],  # issuing authority (autonomous region)
                "ptime": _toTimestamp(xxgkml["cwrq2"]["cnName"]),  # date the document was written
                "pubtime": _toTimestamp(item["publishedTime"]),  # publication date
                "effectiveness": xxgkml["yxx01"]["cnName"],  # validity
                "author": defaultMeta["author"]["cnName"],  # author
                "year": xxgkml["nianf2"]["cnName"],  # year
                # Attachment link; empty string when the item has none.
                "manuscriptRelatedRes": item["websiteDomain"] + attachment if attachment else "",
                "url": url,
                "source": defaultMeta["source"]["cnName"],  # source
            }
            # Named separately from the listing response so the two are
            # never confused.
            pageResponse = requests.get(url=url, headers=headers, timeout=60)
            pageResponse.encoding = pageResponse.apparent_encoding
            print(f"二级链接状态：{pageResponse.status_code}")
            if pageResponse.status_code == 200:
                soup: BeautifulSoup = BeautifulSoup(pageResponse.text, "html.parser")
                # Added last so the stored key order matches earlier documents.
                document["content"] = getContent(soup=soup)
                xinjiangzcwj.insert_one(document)
                print(f"{document['typeOneName']}--{document['typeSecondName']}--{document['title']}-已完成")
                # Random pause between stored articles to throttle the crawl.
                time.sleep(random.randint(3, 8))
        except Exception as e:
            # Best-effort crawl: one bad item must not stop the run. The
            # item's url and the exception repr are printed so the failure
            # can be traced (a bare KeyError message is only the key name).
            itemRef = item.get("url") if isinstance(item, dict) else item
            print(f"文章采集失败：{itemRef} -> {e!r}")


# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    getData()