# guoneimeitishujucaiji/地方政策/政策/新疆/crawl/Crawlxjzfgz.py

# -*- coding: utf-8 -*-
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Crawlxjzfgz.py
# @Software : PyCharm
# @Comment : Scrapes the regulations database of the People's Government of the Xinjiang Uygur Autonomous Region
import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent with every HTTP call made by this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/51.0.2704.63 Safari/537.36'
    ),
    'Connection': 'close',
}
# MongoDB handles: local server -> database "sjzf_zcwj" -> collection "xinjiangzcwj".
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client['sjzf_zcwj']
xinjiangzcwj = mydb['xinjiangzcwj']


def getContent(soup: BeautifulSoup) -> str:
    """
    Extract the article body text from a parsed detail page.

    Every element matched by the CSS selector ``.gknbxq_detail p`` contributes
    its whitespace-stripped text; paragraphs that are empty after stripping
    are skipped.

    :param soup: parsed detail page
    :return: the kept paragraphs, each followed by a newline ("" if none)
    """
    stripped = (node.text.strip() for node in soup.select('.gknbxq_detail p'))
    return "".join(f"{text}\n" for text in stripped if text)


def getData():
    """
    Crawl the regulation list and store every not-yet-saved article in MongoDB.

    Posts one query to the site's manuscript-list interface, then for each
    returned item: skips it when its URL is already present in the
    ``xinjiangzcwj`` collection, reads the metadata fields from the list entry,
    downloads the detail page, extracts the body with ``getContent`` and
    inserts one document. Any exception raised while handling an item is
    printed and that item is skipped. Nothing is returned.
    """

    def toTimestamp(raw):
        # "YYYY-MM-DD" string -> timestamp of the parsed (naive) datetime;
        # a falsy value (missing date) is stored as 0.
        return datetime.datetime.strptime(raw, "%Y-%m-%d").timestamp() if raw else 0

    # Number of articles requested by the single list query; any positive integer.
    count = 10000
    # Interface that returns the data of all articles.
    dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
    # Request parameters.
    dataJson = {
        "websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
        "channelId": ["2aceb5d534434a9fb3550295b52a87e5"],
        "domainMetaList": [{}],
        "pageSize": str(count),
        "pageNum": 1,
        "title": None,
    }

    response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
    response.encoding = response.apparent_encoding
    print(f"一级链接状态：{response.status_code}")
    if response.status_code != 200:
        return

    for item in response.json()["results"]:
        try:
            # Article link; also the key used to detect already-stored articles.
            url = item["websiteDomain"] + item["url"]
            if xinjiangzcwj.find_one({"url": url}):
                continue

            channelName = item["channelName"]  # article category
            title = item["title"]  # article title
            subTitle = item["subTitle"]  # article subtitle
            pubtime = toTimestamp(item["publishedTime"])  # publication date

            # Fields read from the "xxgkml" metadata set of the list entry.
            xxgkml = item["domainMetaList"]["xxgkml"]["resultList"]
            puborg = xxgkml["fwjg2"]["cnName"]  # issuing authority (autonomous region)
            articleType = xxgkml["gwzl2"]["cnName"]  # kind of official document
            ptime = toTimestamp(xxgkml["cwrq2"]["cnName"])  # date the document was written
            indexNo = xxgkml["syh2"]["cnName"]  # index number
            pcode = xxgkml["wenh2"]["cnName"]  # document number
            effectiveness = xxgkml["yxx01"]["cnName"]  # validity
            docGenre = xxgkml["wz2"]["cnName"]  # document genre (autonomous region)
            year = xxgkml["nianf2"]["cnName"]  # year
            childtype = xxgkml["ztfl2"]["cnName"]  # subject classification

            # Fields read from the default metadata set.
            defaults = item["domainMetaList"]["默认元数据集"]["resultList"]
            author = defaults["author"]["cnName"]  # author
            source = defaults["source"]["cnName"]  # source

            # Attachment link: prefixed with the site domain, or "" when there is none.
            attachment = item["manuscriptRelatedRes"]
            attachmentUrl = (item["websiteDomain"] + attachment) if attachment else ""

            page = requests.get(url=url, headers=headers, timeout=60)
            page.encoding = page.apparent_encoding
            print(f"二级链接状态：{page.status_code}")
            if page.status_code != 200:
                continue

            content = getContent(soup=BeautifulSoup(page.text, "html.parser"))
            document = {
                'typeOneName': channelName,
                'typeSecondName': docGenre,
                'articleType': articleType,
                "title": title,
                "subTitle": subTitle,
                "childtype": childtype,
                "index": indexNo,
                "pcode": pcode,
                "puborg": puborg,
                "ptime": ptime,
                "pubtime": pubtime,
                "effectiveness": effectiveness,
                "author": author,
                "year": year,
                "manuscriptRelatedRes": attachmentUrl,
                "url": url,
                "source": source,
                "content": content,
            }
            xinjiangzcwj.insert_one(document)
            print(f"{channelName}--{docGenre}--{title}-已完成")
            # Random 3-8 second pause after each stored article.
            time.sleep(random.randint(3, 8))
        except Exception as e:
            # Best effort: report the problem and move on to the next item.
            print(e)


# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    getData()
初始化仓库 2024-11-09 17:00:30 +08:00			`# __ coding : UTF-8 __`
			`# @Time : 2023/8/27 22:28`
			`# @Author : Haochen Zhong`
			`# @File : Exportxjzfgz.py`
			`# @Software : PyCharm`
			`# @Comment : 本程序采集新疆维吾尔自治区人民政府规章库`
			`import datetime`
			`import random`
			`import time`

			`import pymongo`
			`import requests`
			`from bs4 import BeautifulSoup`

			`# 模拟用户访问`
			`headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '`
			`'Chrome/51.0.2704.63 Safari/537.36',`
			`'Connection': 'close'}`
			`# 创建数据库`
			`client = pymongo.MongoClient('localhost', 27017)`
			`mydb = client.sjzf_zcwj`
			`xinjiangzcwj = mydb.xinjiangzcwj`


			`def getContent(soup: BeautifulSoup) -> str:`
			`"""`
			`获取文章正文内容`
			`:param soup:`
			`:return:`
			`"""`
			`content: str = ""`
			`for p in soup.select('.gknbxq_detail p'):`
			`para: str = p.text.strip()`
			`if para:`
			`content += para`
			`content += '\n'`
			`return content`


			`def getData():`
			`"""程序主函数"""`
			`count = 10000`
			`"""设置单次获取文章数量，可以任意设置正整数"""`
			`dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"`
			`"""请求所有文章数据连接"""`
			`dataJson = {`
			`"websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",`
			`"channelId": [`
			`"2aceb5d534434a9fb3550295b52a87e5"`
			`],`
			`"domainMetaList": [`
			`{}`
			`],`
			`"pageSize": f"{count}",`
			`"pageNum": 1,`
			`"title": None`
			`}`
			`"""请求参数"""`
			`response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)`
			`response.encoding = response.apparent_encoding`
			`print(f"一级链接状态：{response.status_code}")`
			`if response.status_code == 200:`
			`dataList = response.json()["results"]`
			`for item in dataList:`
			`try:`
			`url: str = item["websiteDomain"] + item["url"]`
			`"""文章链接"""`
			`result = xinjiangzcwj.find_one({"url": url})`
			`if result:`
			`continue`
			`typeOneName: str = item["channelName"]`
			`"""文章归类"""`
			`title: str = item["title"]`
			`"""文章标题"""`
			`subTitle: str = item["subTitle"]`
			`"""文章副标题"""`
			`if item["publishedTime"]:`
			`pubtime: float = datetime.datetime.strptime(item["publishedTime"], "%Y-%m-%d").timestamp()`
			`"""发布日期"""`
			`else:`
			`pubtime: float = 0`
			`"""发布日期"""`
			`puborg: str = item["domainMetaList"]["xxgkml"]["resultList"]["fwjg2"]["cnName"]`
			`"""发文机关（自治区）"""`
			`articleType: str = item["domainMetaList"]["xxgkml"]["resultList"]["gwzl2"]["cnName"]`
			`"""公文种类"""`
			`if item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"]:`
			`ptime: float = datetime.datetime.strptime(`
			`item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"],`
			`"%Y-%m-%d").timestamp()`
			`"""成文日期"""`
			`else:`
			`ptime: float = 0`
			`"""成文日期"""`
			`index: str = item["domainMetaList"]["xxgkml"]["resultList"]["syh2"]["cnName"]`
			`"""索引号"""`
			`pcode: str = item["domainMetaList"]["xxgkml"]["resultList"]["wenh2"]["cnName"]`
			`"""文号"""`
			`effectiveness: str = item["domainMetaList"]["xxgkml"]["resultList"]["yxx01"]["cnName"]`
			`"""有效性"""`
			`typeSecondName: str = item["domainMetaList"]["xxgkml"]["resultList"]["wz2"]["cnName"]`
			`"""文种（自治区）"""`
			`year: str = item["domainMetaList"]["xxgkml"]["resultList"]["nianf2"]["cnName"]`
			`"""年份"""`
			`childtype: str = item["domainMetaList"]["xxgkml"]["resultList"]["ztfl2"]["cnName"]`
			`"""主题分类"""`
			`author: str = item["domainMetaList"]["默认元数据集"]["resultList"]["author"]["cnName"]`
			`"""作者"""`
			`source: str = item["domainMetaList"]["默认元数据集"]["resultList"]["source"]["cnName"]`
			`"""来源"""`
			`if item["manuscriptRelatedRes"]:`
			`manuscriptRelatedRes: str = item["websiteDomain"] + item["manuscriptRelatedRes"]`
			`"""附件链接"""`
			`else:`
			`manuscriptRelatedRes: str = ""`
			`"""附件链接"""`
			`response = requests.get(url=url, headers=headers, timeout=60)`
			`response.encoding = response.apparent_encoding`
			`print(f"二级链接状态：{response.status_code}")`
			`if response.status_code == 200:`
			`soup: BeautifulSoup = BeautifulSoup(response.text, "html.parser")`
			`content: str = getContent(soup=soup)`
			`xinjiangzcwj.insert_one(`
			`{`
			`'typeOneName': typeOneName,`
			`'typeSecondName': typeSecondName,`
			`'articleType': articleType,`
			`"title": title,`
			`"subTitle": subTitle,`
			`"childtype": childtype,`
			`"index": index,`
			`"pcode": pcode,`
			`"puborg": puborg,`
			`"ptime": ptime,`
			`"pubtime": pubtime,`
			`"effectiveness": effectiveness,`
			`"author": author,`
			`"year": year,`
			`"manuscriptRelatedRes": manuscriptRelatedRes,`
			`"url": url,`
			`"source": source,`
			`"content": content`
			`}`
			`)`
			`print(f"{typeOneName}--{typeSecondName}--{title}-已完成")`
			`time.sleep(random.randint(3, 8))`
			`except Exception as e:`
			`print(e)`
			`continue`


			`if __name__ == '__main__':`
			`getData()`