初始化仓库
This commit is contained in:
commit
3bfb57b662
52
.gitignore
vendored
Normal file
52
.gitignore
vendored
Normal file
@ -0,0 +1,52 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
|
||||
config.json
|
||||
# Virtual environment
|
||||
venv/
|
||||
env/
|
||||
.venv/
|
||||
.venv3/
|
||||
.Python
|
||||
*.sqlite3
|
||||
|
||||
# IDE-specific files
|
||||
.idea/
|
||||
.vscode/
|
||||
|
||||
# Compiled source
|
||||
*.com
|
||||
*.class
|
||||
*.dll
|
||||
*.exe
|
||||
*.o
|
||||
*.so
|
||||
|
||||
# Logs and databases
|
||||
*.log
|
||||
*.sql
|
||||
*.sqlite
|
||||
|
||||
# Output files
|
||||
dist/
|
||||
build/
|
||||
*.egg-info/
|
||||
*.egg
|
||||
|
||||
# OS-specific files
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Miscellaneous
|
||||
*.bak
|
||||
*.swp
|
||||
*.tmp
|
||||
*.tmp.*
|
||||
*.~*
|
||||
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints/
|
||||
BIN
requirements.txt
Normal file
BIN
requirements.txt
Normal file
Binary file not shown.
186
国内党媒/CrawlZhongguogaigebao.py
Normal file
186
国内党媒/CrawlZhongguogaigebao.py
Normal file
@ -0,0 +1,186 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/06 21:35
|
||||
# @UpdateTime : 2024/11/06 21:35
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlZhongguogaigebao.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集中国改革报版面数据
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2017-09', '%Y-%m')
|
||||
"""中国改革报2017年9月份开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['buweijiguanbao']
|
||||
collection = db['zhongguogaigebao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "zhongguogaigebao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("中国改革报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:",last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select("#ozoom p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
"""
|
||||
:param start_date: 开始日期
|
||||
:param end_date: 结束日期
|
||||
:return: None
|
||||
"""
|
||||
crawl_num = 0
|
||||
# 创建一个列表保存月份
|
||||
months = []
|
||||
# 从开始日期到结束日期,每个月份都添加到列表中
|
||||
current_date = start_date
|
||||
while current_date <= end_date:
|
||||
months.append(current_date)
|
||||
# 增加一个月
|
||||
if current_date.month == 12:
|
||||
current_date = current_date.replace(year=current_date.year + 1, month=1)
|
||||
else:
|
||||
current_date = current_date.replace(month=current_date.month + 1)
|
||||
# 遍历月份列表
|
||||
for month in months:
|
||||
# 构造URL
|
||||
url = f'http://www.cfgw.net.cn/epaper/{month.strftime("%Y%m")}/period.xml'
|
||||
"""http://www.cfgw.net.cn/epaper/201709/period.xml"""
|
||||
print(url)
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
# 发送GET请求
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
# 解析XML
|
||||
soup = BeautifulSoup(response.text, 'xml')
|
||||
for period in soup.find_all("period"):
|
||||
try:
|
||||
period_id = period.get("id")
|
||||
url1 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/node_01.htm"
|
||||
"""http://www.cfgw.net.cn/epaper/201709/05/node_01.htm"""
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".posRelative>a"):
|
||||
url2 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/" + item.get(
|
||||
"href")
|
||||
"""http://www.cfgw.net.cn/epaper/201709/05/node_01/node_01.htm"""
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#articlelist > .clearfix > a"):
|
||||
url3 = f"http://www.cfgw.net.cn/epaper/" + item2.get("href")[6:]
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = response4.charset_encoding
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select("#Title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subtitle = soup4.select("#SubTitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
try:
|
||||
preTitle = soup4.select("#PreTitle")[0].text.strip()
|
||||
except:
|
||||
preTitle = ""
|
||||
try:
|
||||
author = soup4.find("author").text.strip()
|
||||
except:
|
||||
author = ""
|
||||
try:
|
||||
keyword = soup4.find("keyword").text.strip()
|
||||
except:
|
||||
keyword = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subtitle,
|
||||
"preTitle": preTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': keyword,
|
||||
'detail_url': url3,
|
||||
'release_time': month + timedelta(days=int(period_id)-1),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国改革报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': month + timedelta(days=int(period_id)),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(e)
|
||||
print(f"中国改革报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
282
国内党媒/CrawlZhongguojiaoyubao.py
Normal file
282
国内党媒/CrawlZhongguojiaoyubao.py
Normal file
@ -0,0 +1,282 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/08 21:42
|
||||
# @UpdateTime : 2024/11/08 21:42
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlZhongguojiaoyubao.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集中国教育报数据
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2022-01', '%Y-%m')
|
||||
"""中国教育报2022年1月份开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['buweijiguanbao']
|
||||
collection = db['zhongguojiaoyubao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "zhongguojiaoyubao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("中国教育报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:", last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select(".content_tt p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
"""
|
||||
:param start_date: 开始日期
|
||||
:param end_date: 结束日期
|
||||
:return: None
|
||||
"""
|
||||
crawl_num = 0
|
||||
# 创建一个列表保存月份
|
||||
months = []
|
||||
# 从开始日期到结束日期,每个月份都添加到列表中
|
||||
current_date = start_date
|
||||
while current_date <= end_date:
|
||||
months.append(current_date)
|
||||
# 增加一个月
|
||||
if current_date.month == 12:
|
||||
current_date = current_date.replace(year=current_date.year + 1, month=1)
|
||||
else:
|
||||
current_date = current_date.replace(month=current_date.month + 1)
|
||||
# 遍历月份列表
|
||||
for month in months:
|
||||
# 构造URL
|
||||
url = f'http://paper.jyb.cn/zgjyb/html/{month.strftime("%Y-%m")}/period.xml'
|
||||
"""http://paper.jyb.cn/zgjyb/html/2023-01/period.xml"""
|
||||
print(url)
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'xml')
|
||||
for period in soup.select("period"):
|
||||
period_name = datetime.strptime(period.find("period_name").text.strip(), "%Y-%m-%d")
|
||||
front_page = period.find("front_page").text.strip()
|
||||
try:
|
||||
url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".right_title-name a"):
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get(
|
||||
"href").replace("./","").strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#titleList1 a"):
|
||||
url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get(
|
||||
"href")
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = response4.charset_encoding
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select_one(".title1").text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subTitle = soup4.select(".title2")[0].text.strip()
|
||||
except:
|
||||
subTitle = ""
|
||||
try:
|
||||
author = soup4.select_one(".title3").text.strip()
|
||||
except:
|
||||
author = ""
|
||||
try:
|
||||
perTitle = soup4.select(".title2")[-1].text.strip()
|
||||
except:
|
||||
perTitle = ""
|
||||
try:
|
||||
keywordlist = soup4.find("founder-keyword").text.strip()
|
||||
except:
|
||||
keywordlist = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subTitle,
|
||||
"preTitle": perTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': keywordlist,
|
||||
'detail_url': url3,
|
||||
'release_time': period_name,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
print(e)
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': period_name,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
|
||||
else:
|
||||
url = f"http://paper.jyb.cn/zgjyb/html/{month.strftime('%Y-%m')}/navi.xml"
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'xml')
|
||||
for period in soup.select("calendar"):
|
||||
period_name = datetime.strptime(period.find("date").text.strip(), "%Y-%m-%d")
|
||||
front_page = period.find("url").text.strip()[6:]
|
||||
try:
|
||||
url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}"
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".right_title-name a"):
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get(
|
||||
"href").replace("./","").strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#titleList1 a"):
|
||||
url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get(
|
||||
"href")
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = response4.charset_encoding
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select(".article-title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subTitle = soup4.select(".article-subtitle")[0].text.strip()
|
||||
except:
|
||||
subTitle = ""
|
||||
try:
|
||||
author = soup4.select(".article-author")[0].text.strip()
|
||||
except:
|
||||
author = ""
|
||||
try:
|
||||
perTitle = soup4.select(".article-pretitle")[0].text.strip()
|
||||
except:
|
||||
perTitle = ""
|
||||
try:
|
||||
keywordlist = soup4.find("founder-keyword").text.strip()
|
||||
except:
|
||||
keywordlist = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subTitle,
|
||||
"preTitle": perTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': keywordlist,
|
||||
'detail_url': url3,
|
||||
'release_time': period_name,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time':period_name ,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(f"中国教育报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
asyncio.run(main())
|
||||
185
国内党媒/CrawlZhongguojingjidaobao.py
Normal file
185
国内党媒/CrawlZhongguojingjidaobao.py
Normal file
@ -0,0 +1,185 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/08 00:07
|
||||
# @UpdateTime : 2024/11/08 00:07
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlZhongguojingjidaobao.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集中国经济导报数据
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2012-09', '%Y-%m')
|
||||
"""中国经济导报2012年9月份开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['buweijiguanbao']
|
||||
collection = db['zhongguojingjidaobao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "zhongguojingjidaobao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("中国经济导报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:", last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select("#pgcontent"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
"""
|
||||
:param start_date: 开始日期
|
||||
:param end_date: 结束日期
|
||||
:return: None
|
||||
"""
|
||||
crawl_num = 0
|
||||
# 创建一个列表保存月份
|
||||
months = []
|
||||
# 从开始日期到结束日期,每个月份都添加到列表中
|
||||
current_date = start_date
|
||||
while current_date <= end_date:
|
||||
months.append(current_date)
|
||||
# 增加一个月
|
||||
if current_date.month == 12:
|
||||
current_date = current_date.replace(year=current_date.year + 1, month=1)
|
||||
else:
|
||||
current_date = current_date.replace(month=current_date.month + 1)
|
||||
# 遍历月份列表
|
||||
for month in months:
|
||||
# 构造URL
|
||||
url = f'http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime("%Y/%m")}/date.txt'
|
||||
"""http://www.ceh.com.cn/epaper/uniflows/html/2012/09/date.txt"""
|
||||
print(url)
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
# 发送GET请求
|
||||
response = await client.get(url)
|
||||
response.encoding = "gb2312"
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
# 解析XML
|
||||
soup = response.text.split("|")
|
||||
for period in soup:
|
||||
period_id, element = period.split(",")
|
||||
if len(element) < 5:
|
||||
continue
|
||||
try:
|
||||
url1 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/boardurl.htm"
|
||||
"""http://www.ceh.com.cn/epaper/uniflows/html/2012/09/01/boardurl.htm"""
|
||||
print(url1)
|
||||
response2 = await client.get(url1)
|
||||
response2.encoding = "gb2312"
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item in soup2.select(".board_link td>a"):
|
||||
url2 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + item.get(
|
||||
"href")
|
||||
"""http://www.ceh.com.cn/epaper/uniflows/html/2024/11/07/01/default.htm"""
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = "gb2312"
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
for item2 in soup3.select("#mp_32"):
|
||||
url3 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + \
|
||||
item.get("href").split("/")[0] + "/" + item2.get("href")
|
||||
if await collection.find_one({"detail_url": url3}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url3)
|
||||
response4 = await client.get(url3)
|
||||
response4.encoding = "gb2312"
|
||||
print(f"四级连接状态:{response4.status_code}")
|
||||
if response4.status_code == 200:
|
||||
soup4 = BeautifulSoup(response4.text, 'lxml')
|
||||
try:
|
||||
title = soup4.select(".content_title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subtitle = soup4.select(".subtitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
try:
|
||||
preTitle = soup4.select(".yinti_title")[0].text.strip()
|
||||
except:
|
||||
preTitle = ""
|
||||
try:
|
||||
author = soup4.select(".others")[0].text.strip()
|
||||
except:
|
||||
author = ""
|
||||
content = await getContent(soup4)
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subtitle,
|
||||
"preTitle": preTitle,
|
||||
"author": author,
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url3,
|
||||
'release_time': month + timedelta(days=int(period_id) - 1),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(
|
||||
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(
|
||||
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': month + timedelta(days=int(period_id)),
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(e)
|
||||
print(f"中国经济导报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
BIN
国内党媒/v1.0各部委的机关报纸和专业报纸名录信息20241104.xlsx
Normal file
BIN
国内党媒/v1.0各部委的机关报纸和专业报纸名录信息20241104.xlsx
Normal file
Binary file not shown.
145
地方政策/报刊/CrawlAnhui.py
Normal file
145
地方政策/报刊/CrawlAnhui.py
Normal file
@ -0,0 +1,145 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/11/08 20:29
|
||||
# @UpdateTime : 2024/11/08 20:29
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlAnhui.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集安徽日报数字报数据
|
||||
|
||||
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from httpx import AsyncClient
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
start_date = datetime.strptime('2017-09-29', '%Y-%m-%d')
|
||||
"""安徽日报报2018年09月29日开始有数据"""
|
||||
end_date = datetime.today()
|
||||
"""截止到今天"""
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
|
||||
# 链接数据库
|
||||
client = AsyncIOMotorClient('mongodb://localhost:27017')
|
||||
db = client['dfdm_sjribao']
|
||||
collection = db['anhuiribao']
|
||||
|
||||
|
||||
async def main():
|
||||
collection_names = await db.list_collection_names()
|
||||
# 判断数据表是否存在
|
||||
if "anhuiribao" not in collection_names:
|
||||
# 如果不存在,则从2017年9月开始爬取
|
||||
print("安徽日报报数据表不存在,开始采集!")
|
||||
await getData(start_date, end_date)
|
||||
else:
|
||||
# 如果存在,则从数据库中获取最后一条记录的日期
|
||||
last_record = await collection.find_one({}, sort=[('release_time', -1)])
|
||||
last_date_str = last_record['release_time']
|
||||
print("数据库截止时间:", last_date_str)
|
||||
await getData(last_date_str, end_date)
|
||||
|
||||
|
||||
async def getContent(soup: BeautifulSoup) -> str:
|
||||
"""
|
||||
:param soup: BeautifulSoup对象
|
||||
:return: 文章内容
|
||||
"""
|
||||
content = ""
|
||||
for p in soup.select(".content p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
async def getData(start_date: datetime, end_date: datetime):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m/%d')
|
||||
base_url = "https://szb.ahnews.com.cn/ahrb/layout/" + date_now_s + '/'
|
||||
url = base_url + 'node_01.html'
|
||||
"""https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html"""
|
||||
try:
|
||||
async with AsyncClient(headers=headers, timeout=60) as client:
|
||||
print(url)
|
||||
response = await client.get(url)
|
||||
response.encoding = response.charset_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, 'lxml')
|
||||
for item in soup.select(".Chunkiconlist p > a:nth-child(1)"):
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip()
|
||||
url1 = base_url + item.get("href")
|
||||
print(url1)
|
||||
response2= await client.get(url1)
|
||||
response2.encoding = response2.charset_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, 'lxml')
|
||||
for item2 in soup2.select(".newslist a"):
|
||||
url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:]
|
||||
"""https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html"""
|
||||
if await collection.find_one({"detail_url": url2}, {"_id": False}):
|
||||
continue
|
||||
title = item2.text.strip()
|
||||
print(url2)
|
||||
response3 = await client.get(url2)
|
||||
response3.encoding = response3.charset_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, 'lxml')
|
||||
content = await getContent(soup3)
|
||||
try:
|
||||
title = soup3.select(".newsdetatit h3")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subTitle= soup3.select(".newsdetatext p")[0].text.strip()
|
||||
except:
|
||||
subTitle = ""
|
||||
await collection.insert_one({
|
||||
"title": title,
|
||||
"subtitle": subTitle,
|
||||
"preTitle": "",
|
||||
"author": "",
|
||||
"banmianming": banmianming,
|
||||
"banmianhao": banmianhao,
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content
|
||||
})
|
||||
crawl_num += 1
|
||||
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
print(f"安徽日报---{date_now_s}-----采集完成!")
|
||||
await asyncio.sleep(random.randint(5, 15))
|
||||
except Exception as e:
|
||||
print(e)
|
||||
await collection.insert_one(
|
||||
{'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'}
|
||||
)
|
||||
print(f"安徽日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
140
地方政策/报刊/CrawlGuizhou.py
Normal file
140
地方政策/报刊/CrawlGuizhou.py
Normal file
@ -0,0 +1,140 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2022/12/27 14:15
|
||||
# @UpdateTime : 2023/11/08 16:30
|
||||
# @Author : Haochen Zhong
|
||||
# @File : CrawlGuizhou.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集贵州日报数字报板面数据
|
||||
import random
|
||||
import time
|
||||
from datetime import timedelta, datetime
|
||||
|
||||
import pymongo
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
|
||||
"""贵州日报数字报2022-01-01开始有数据纪录"""
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
guizhouribao = mydb.guizhouribao
|
||||
# 设置随机时间
|
||||
sleeptime = random.randint(2, 15)
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "guizhouribao" in collist: # 检测集合是否存在
|
||||
print("贵州集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = guizhouribao.find_one(sort=[('release_time', -1)])[
|
||||
'release_time'] # 或者find().sort('_id', -1).limit(1)
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def get_content(soup3):
|
||||
content = ""
|
||||
for p in soup3.select("#ozoom p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m/%d')
|
||||
base_url = "http://szb.gzrbs.com.cn/pc/layout/" + date_now_s + "/"
|
||||
url = base_url + "node_01.html"
|
||||
# http://szb.gzrbs.com.cn/pc/layout/202201/01/node_01.html
|
||||
try:
|
||||
response = requests.get(url=url, headers=headers, timeout=(30, 45))
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f"一级连接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select(".btn-block"):
|
||||
banmianming = item.text.split(":")[-1]
|
||||
banmianhao = item.text.split(":")[0]
|
||||
url1 = base_url + item.get("href")
|
||||
response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select(".resultList a"):
|
||||
title = item2.text.strip()
|
||||
url2 = "http://szb.gzrbs.com.cn/pc/" + item2.get("href")[9:]
|
||||
# http://szb.gzrbs.com.cn/pc/cont/202201/02/content_42202.html
|
||||
response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
try:
|
||||
title = soup3.select("#Title")[0].text.strip()
|
||||
except:
|
||||
title = title
|
||||
try:
|
||||
subtitle = soup3.select("#SubTitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
try:
|
||||
preTitle = soup3.select("#PreTitle")[0].text.strip()
|
||||
except:
|
||||
preTitle = ""
|
||||
content = get_content(soup3)
|
||||
guizhouribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'preTitle': preTitle,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'author': '',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
crawl_num += 1
|
||||
print(f"贵州日报-{date_now_s}-{banmianming}-{title}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"贵州日报-{date_now_s}-{banmianming}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"贵州日报-{date_now_s}-已完成")
|
||||
except Exception as result:
|
||||
guizhouribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"贵州日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
162
地方政策/报刊/CrawlHainan.py
Normal file
162
地方政策/报刊/CrawlHainan.py
Normal file
@ -0,0 +1,162 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024-01-17 14:24:59
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlHainan.py
|
||||
# @Software : PyCharm
|
||||
# @Comment :
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from datetime import timedelta, datetime
|
||||
import time
|
||||
import pymongo
|
||||
import random
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2008-02-29', '%Y-%m-%d')
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
hainanribao = mydb.hainanribao
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "hainanribao" in collist: # 检测集合是否存在
|
||||
print("海南日报集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = hainanribao.find_one(sort=[('release_time', -1)])['release_time']
|
||||
print(f'数据库截止时间{db_time}')
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
print("数据库不存在,建立数据库")
|
||||
# 爬取网页并建立数据库
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
# 解析网页正文
|
||||
def parse_html_text(soup):
|
||||
"""
|
||||
:param html: html字符串
|
||||
:return: 正文 string
|
||||
"""
|
||||
content = ''
|
||||
if soup.select('#ozoom'):
|
||||
content = soup.select('#ozoom')[0].text.strip()
|
||||
return content
|
||||
|
||||
|
||||
def parse_subtitle(soup):
|
||||
item = soup.select('.font02')
|
||||
if re.findall(r'article-subtitle>-->(.*?)<!--', str(item)):
|
||||
subtitle = re.findall(r'article-subtitle>-->(.*?)<!--', str(item))[0]
|
||||
else:
|
||||
subtitle = ''
|
||||
return subtitle
|
||||
|
||||
|
||||
def parse_h3title(soup):
|
||||
item = soup.select('.font02')
|
||||
if re.findall(r'article-pretitle>-->(.*?)<!--', str(item)):
|
||||
h3title = re.findall(r'article-pretitle>-->(.*?)<!--', str(item))[0]
|
||||
else:
|
||||
h3title = ''
|
||||
return h3title
|
||||
|
||||
|
||||
def parse_author(soup):
|
||||
item = soup.select('.font02')
|
||||
if re.findall(r'article-subtitle>-->(.*?)<!--', str(item)):
|
||||
author = re.findall(r'article-subtitle>-->(.*?)<!--', str(item))[0]
|
||||
else:
|
||||
author = ''
|
||||
return author
|
||||
|
||||
|
||||
# 爬取网页并建立数据库
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y-%m/%d')
|
||||
base_url = "http://news.hndaily.cn/html/" + date_now_s + '/'
|
||||
url = base_url + 'node_1.htm'
|
||||
# 进入首页
|
||||
try:
|
||||
response = requests.get(url, headers=headers)
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f'一级连接状态{response.status_code}')
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select('#pageLink'):
|
||||
banmianhao = item.text.split(":")[0].strip()
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
if banmianming == "广告":
|
||||
continue
|
||||
url1 = base_url + item.get('href')
|
||||
response2 = requests.get(url1, headers=headers)
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f'二级连接状态{response2.status_code}')
|
||||
if response2.status_code == 200:
|
||||
soup1 = BeautifulSoup(response2.text, "lxml")
|
||||
for item1 in soup1.select('#main-ed-articlenav-list tr td div a'):
|
||||
detail_url = base_url + item1.get('href')
|
||||
print(detail_url)
|
||||
title = item1.text.strip()
|
||||
response3 = requests.get(detail_url, headers=headers)
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f'三级连接状态:{response3.status_code}')
|
||||
if response3.status_code == 200:
|
||||
soup2 = BeautifulSoup(response3.text, "lxml")
|
||||
try:
|
||||
title = soup2.select('.font01')[0].text.strip()
|
||||
except IndexError:
|
||||
pass
|
||||
subtitle = parse_subtitle(soup2)
|
||||
h3title = parse_h3title(soup2)
|
||||
author = parse_author(soup2)
|
||||
content = parse_html_text(soup2)
|
||||
hainanribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'h3title': h3title,
|
||||
'author': author,
|
||||
'keywordlist': '',
|
||||
'detail_url': detail_url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
print(f"海南日报-{date_now_s}-{banmianhao}-{banmianming}-{title}已经完成")
|
||||
crawl_num += 1
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"海南日报-{date_now_s}-{banmianhao}-{banmianming}-已经完成")
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"海南日报-{date_now_s}-已经完成")
|
||||
except Exception as result:
|
||||
hainanribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'h3title': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"海南日报采集完毕,本次共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
133
地方政策/报刊/CrawlHenan.py
Normal file
133
地方政策/报刊/CrawlHenan.py
Normal file
@ -0,0 +1,133 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024-03-08 10:18:55
|
||||
# @Author : haochen zhong
|
||||
# @File : CrawlHenan.py
|
||||
# @Software : PyCharm
|
||||
# @Comment :采集河南日报数字报版面数据
|
||||
import datetime
|
||||
import random
|
||||
import time
|
||||
|
||||
import pymongo
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
start_date = datetime.datetime.strptime('2007-10-13', '%Y-%m-%d')
|
||||
"""采集开始时间"""
|
||||
end_date = datetime.datetime.today()
|
||||
"""采集结束时间"""
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
|
||||
"""自定义请求头"""
|
||||
# 创建数据库
|
||||
dbclient = pymongo.MongoClient('localhost', 27017)
|
||||
"""连接数据库"""
|
||||
mydb = dbclient.dfdm_sjribao
|
||||
henanribao = mydb.henanribao
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "henanribao" in collist: # 检测集合是否存在
|
||||
print("河南集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = henanribao.find_one(sort=[('release_time', -1)])['release_time']
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def getContent(soup: BeautifulSoup):
|
||||
content = ''
|
||||
for p in soup.select('#articleContent p'):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days): # gu:时间长度
|
||||
date_now = start_date + datetime.timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y-%m/%d')
|
||||
base_url = "http://newpaper.dahe.cn/hnrb/html/" + date_now_s + '/'
|
||||
url = base_url + 'node_1.htm'
|
||||
# http://newpaper.dahe.cn/hnrb/html/2024-03/08/node_1.htm
|
||||
print(url)
|
||||
try:
|
||||
response = requests.get(url, headers, timeout=60)
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f"一级链接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select(".layout-catalogue-item>a:nth-child(1)"):
|
||||
banmianhao = item.text.split(":")[0]
|
||||
banmianming = item.text.split(":")[-1]
|
||||
url1 = base_url + item.get("href")
|
||||
response2 = requests.get(url1, headers)
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f"二级链接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select(".news-item a"):
|
||||
title = item2.get("title", "").strip()
|
||||
url2 = base_url + item2.get("href")
|
||||
response3 = requests.get(url2, headers)
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
content = getContent(soup3)
|
||||
try:
|
||||
preTitle = soup3.select(".headline")[0].text.strip()
|
||||
except Exception as e:
|
||||
preTitle = ""
|
||||
try:
|
||||
subtitle = soup3.select(".subtitle")[0].test.strip()
|
||||
except Exception as e:
|
||||
subtitle = ""
|
||||
henanribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'preTitle': preTitle,
|
||||
'author': '',
|
||||
'keywordlist': '',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.datetime.today(),
|
||||
'content': content})
|
||||
crawl_num += 1
|
||||
print(f"河南日报-{date_now_s}-{banmianhao}-{title}---采集成功!")
|
||||
time.sleep(random.randint(5, 10))
|
||||
print(f"河南日报-{date_now_s}-{banmianhao}---采集成功!")
|
||||
print(f"河南日报-{date_now_s}---采集成功!")
|
||||
except Exception as result:
|
||||
henanribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"河南日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
140
地方政策/报刊/CrawlNingxia.py
Normal file
140
地方政策/报刊/CrawlNingxia.py
Normal file
@ -0,0 +1,140 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2022/12/29 13:48
|
||||
# @Author : Haochen Zhong
|
||||
# @File : CrawlNingxia.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集宁夏日报版面数据
|
||||
|
||||
import requests
|
||||
import time
|
||||
import pymongo
|
||||
import random
|
||||
from datetime import timedelta, datetime
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2022-01-31', '%Y-%m-%d')
|
||||
"""宁夏日报2022-02-01开始有数据"""
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
ningxiaribao = mydb.ningxiaribao
|
||||
# 设置随机时间
|
||||
sleeptime = random.randint(2, 10)
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "ningxiaribao" in collist: # 检测集合是否存在
|
||||
print("宁夏集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = ningxiaribao.find_one(sort=[('release_time', -1)])[
|
||||
'release_time'] # 或者find().sort('_id', -1).limit(1)
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新!')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def get_content(soup3):
|
||||
content = ""
|
||||
for p in soup3.select("#ozoom p"):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days): # gu:时间长度
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m/%d')
|
||||
base_url = "https://szb.nxrb.cn/nxrb/pc/layout/" + date_now_s + "/"
|
||||
url = base_url + "node_01.html"
|
||||
# https://szb.nxrb.cn/nxrb/pc/layout/202202/01/node_01.html
|
||||
try:
|
||||
response = requests.get(url=url, headers=headers, timeout=(30, 45))
|
||||
response.encoding = response.apparent_encoding
|
||||
print(f"一级连接状态: {response.status_code}")
|
||||
if response.status_code == 200:
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select(".nav-list .btn-block"):
|
||||
banmianhao = item.text.split(":")[0].strip()
|
||||
banmianming = item.text.split(":")[-1].strip()
|
||||
url1 = base_url + item.get("href")
|
||||
response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
|
||||
response2.encoding = response2.apparent_encoding
|
||||
print(f"二级连接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select(".news-list .resultList a"):
|
||||
url_title = item2.text.strip()
|
||||
url2 = "https://szb.nxrb.cn/nxrb/pc/" + item2.get("href")[9:]
|
||||
print(url2)
|
||||
response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
|
||||
response3.encoding = response3.apparent_encoding
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
try:
|
||||
pretitle = soup3.select("#PreTitle")[0].text.strip()
|
||||
except:
|
||||
pretitle = ""
|
||||
try:
|
||||
title = soup3.select("#Title")[0].text.strip()
|
||||
except:
|
||||
title = url_title
|
||||
try:
|
||||
subtitle = soup3.select("SubTitle")[0].text.strip()
|
||||
except:
|
||||
subtitle = ""
|
||||
content = get_content(soup3)
|
||||
ningxiaribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'h3title': pretitle,
|
||||
'author': '',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
crawl_num += 1
|
||||
print(f"宁夏日报-{date_now_s}-{banmianhao}-{title}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"宁夏日报-{date_now_s}-{banmianhao}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
print(f"宁夏日报-{date_now_s}-已完成")
|
||||
time.sleep(sleeptime)
|
||||
except Exception as result:
|
||||
ningxiaribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'h3title': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"宁夏日报采集完成,成功采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
137
地方政策/报刊/CrawlSiChuan.py
Normal file
137
地方政策/报刊/CrawlSiChuan.py
Normal file
@ -0,0 +1,137 @@
|
||||
# _*_ coding : UTF-8 _*_
|
||||
# @Time : 2024/01/09 14:15
|
||||
# @UpdateTime : 2024/01/09 16:30
|
||||
# @Author : Haochen Zhong
|
||||
# @File : CrawlSiChuan.py
|
||||
# @Software : PyCharm
|
||||
# @Comment : 本程序采集四川日报数字报板面数据
|
||||
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
from datetime import timedelta, datetime
|
||||
import time
|
||||
import pymongo
|
||||
import random
|
||||
|
||||
# 数据库起止时间
|
||||
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
|
||||
end_date = datetime.today()
|
||||
headers = {
|
||||
'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
|
||||
# 创建数据库
|
||||
client = pymongo.MongoClient('localhost', 27017)
|
||||
mydb = client.dfdm_sjribao
|
||||
sichuanribao = mydb.sichuanribao
|
||||
|
||||
|
||||
def main():
|
||||
# 判断数据库是否存在
|
||||
collist = mydb.list_collection_names()
|
||||
if "sichuanribao" in collist: # 检测集合是否存在
|
||||
print("四川日报集合存在,更新数据库")
|
||||
# 数据库最新一条内容的时间
|
||||
db_time = sichuanribao.find_one(sort=[('release_time', -1)])[
|
||||
'release_time'] # 或者find().sort('_id', -1).limit(1)
|
||||
print('数据库截止时间%s' % db_time)
|
||||
# 输入更新数据库时间
|
||||
input_time = datetime.today()
|
||||
if db_time < input_time:
|
||||
getData(db_time, input_time)
|
||||
else:
|
||||
print('数据库无需更新')
|
||||
else:
|
||||
# 爬取网页并建立数据库
|
||||
print("数据库不存在,建立数据库!")
|
||||
getData(start_date, end_date)
|
||||
|
||||
|
||||
def getContent(soup):
|
||||
content = ''
|
||||
for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(2) p'):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
content += para
|
||||
content += '\n'
|
||||
return content
|
||||
|
||||
|
||||
def getSubtitle(soup):
|
||||
subtitle = ''
|
||||
if soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
|
||||
for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
|
||||
para = p.text.strip()
|
||||
if para:
|
||||
subtitle += para
|
||||
subtitle += '\n'
|
||||
return subtitle
|
||||
|
||||
|
||||
def getData(start_date, end_date):
|
||||
crawl_num = 0
|
||||
for i in range((end_date - start_date).days):
|
||||
date_now = start_date + timedelta(days=i + 1)
|
||||
date_now_s = date_now.strftime('%Y%m%d')
|
||||
base_url = "https://epaper.scdaily.cn/shtml/scrb/"
|
||||
url = base_url + date_now_s + '/index.shtml'
|
||||
try:
|
||||
response = requests.get(url, headers)
|
||||
print(f"一级链接状态:{response.status_code}")
|
||||
if response.status_code == 200:
|
||||
response.encoding = response.apparent_encoding
|
||||
soup = BeautifulSoup(response.text, "lxml")
|
||||
for item in soup.select("#main > div.main_r > ul:nth-child(2) > li:nth-child(2) a"):
|
||||
banmianhao = item.text.split(":")[0]
|
||||
banmianming = item.text.split(":")[-1]
|
||||
url1 = "https://epaper.scdaily.cn" + item.get("href")
|
||||
response2 = requests.get(url1, headers)
|
||||
print(f"二级链接状态:{response2.status_code}")
|
||||
if response2.status_code == 200:
|
||||
response2.encoding = response2.apparent_encoding
|
||||
soup2 = BeautifulSoup(response2.text, "lxml")
|
||||
for item2 in soup2.select("#main > div.main_r > ul:nth-child(3) > li:nth-child(2) a"):
|
||||
url2 = "https://epaper.scdaily.cn" + item2.get("href")
|
||||
title = item2.get("title")
|
||||
response3 = requests.get(url2, headers)
|
||||
print(f"三级连接状态:{response3.status_code}")
|
||||
if response3.status_code == 200:
|
||||
response3.encoding = response3.apparent_encoding
|
||||
soup3 = BeautifulSoup(response3.text, "lxml")
|
||||
content = getContent(soup3)
|
||||
subtitle = getSubtitle(soup3)
|
||||
sichuanribao.insert_one({'banmianhao': banmianhao,
|
||||
'banmianming': banmianming,
|
||||
'title': title,
|
||||
'subtitle': subtitle,
|
||||
'h3title': '',
|
||||
'author': '',
|
||||
'keywordlist': '',
|
||||
'detail_url': url2,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': content})
|
||||
print(f"四川日报--{date_now_s}-{banmianhao}-{title}----已完成")
|
||||
crawl_num += 1
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"四川日报--{date_now_s}-{banmianhao}----已完成")
|
||||
time.sleep(random.randint(3, 10))
|
||||
print(f"四川日报--{date_now_s}-----已完成")
|
||||
time.sleep(random.randint(3, 10))
|
||||
except Exception as result:
|
||||
sichuanribao.insert_one({'banmianhao': 'empty',
|
||||
'banmianming': 'empty',
|
||||
'preTitle': 'empty',
|
||||
'title': 'empty',
|
||||
'subtitle': 'empty',
|
||||
'author': 'empty',
|
||||
'keywordlist': 'empty',
|
||||
'detail_url': url,
|
||||
'release_time': date_now,
|
||||
'insert_timestamp': datetime.today(),
|
||||
'content': 'empty'})
|
||||
print(result)
|
||||
print(f"四川日报采集完毕,共采集{crawl_num}条数据!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
167
地方政策/报刊/CrawlXinminwanbao.py
Normal file
167
地方政策/报刊/CrawlXinminwanbao.py
Normal file
@ -0,0 +1,167 @@
# _*_ coding : UTF-8 _*_
# @Time : 2022/6/17 8:50
# @Author : Haochen Zhong
# @File : CrawlXinminwanbao.py
# @Software : PyCharm
# @Comment : 本程序用于抓取上海新民晚报数据
from bs4 import BeautifulSoup, Comment
import requests
from datetime import timedelta, datetime
import time
import pymongo
import random

start_date = datetime.strptime('2018-12-31', '%Y-%m-%d')  # 抓取上海新民晚报从2019-01-01到至今的数据
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_qitaguanbao
shxinminwanbao = mydb.shxinminwanbao


# 设置随机时间


def main():
    # 判断数据库是否存在
    collist = mydb.list_collection_names()
    if "shxinminwanbao" in collist:  # 检测集合是否存在
        print("上海新民晚报集合存在,更新数据库")
        # 数据库最新一条内容的时间
        db_time = shxinminwanbao.find_one(sort=[('release_time', -1)])['release_time']
        print('数据库截止时间%s' % db_time)
        # 输入更新数据库时间
        input_time = datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('数据库无需更新')
    else:
        # 爬取网页并建立数据库
        print('数据库不存在,建立数据库!')
        getData(start_date, end_date)


def parse_html_text(soup2):
    img_list = soup2.select('.dzb-enter-desc-box p img')
    if img_list:
        img = '图片链接:\n'
        for i in img_list:
            img_url = 'https:' + i.get('src')
            img += img_url
            img += '\n'
        content = img + '正文内容:\n'
        for p in soup2.select('.dzb-enter-desc-box p'):
            para = p.text.split(' ')
            for x in para:
                if x != '' and x != '\\n\\n':
                    content += x.strip()
                    content += '\n'
    else:
        content = ''
        for p in soup2.select('.dzb-enter-desc-box p'):
            para = p.text.split(' ')
            for x in para:
                if x.strip() != '' and x != '\\n\\n':
                    content += x.strip()
                    content += '\n'
    return content


def getData(start_date, end_date):
    for i in range((end_date - start_date).days):
        date_now = start_date + timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y-%m-%d')
        base_url = "https://paper.xinmin.cn/html/xmwb/" + date_now_s + '/'
        url = base_url + '1.html'
        art_base_url = 'https://paper.xinmin.cn'
        # 进入首页
        try:
            try:
                response = requests.get(url=url, headers=headers, timeout=30)
            except:
                time.sleep(10)
                response = requests.get(url=url, headers=headers, timeout=30)
            response.encoding = response.apparent_encoding
            print('一级连接状态:%d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'lxml')
                response.close()
                # 提取所有版面信息
                for item in soup.select('.dzb-enter-mulu-wrap-nav a'):
                    url1 = art_base_url + item.get('href')
                    banmianhao = item.get('title').split(':')[0]
                    banmianming = item.get('title').split(':')[-1]
                    try:
                        response2 = requests.get(url=url1, headers=headers, timeout=30)
                    except:
                        time.sleep(10)
                        response2 = requests.get(url=url1, headers=headers, timeout=30)
                    response2.encoding = response2.apparent_encoding
                    print('二级连接状态:%d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, 'lxml')
                        response2.close()
                        for item1 in soup1.select('.dzb-enter-benban-wrap div a'):
                            url2 = art_base_url + item1.get('href')
                            try:
                                response3 = requests.get(url=url2, headers=headers, timeout=30)
                            except:
                                time.sleep(10)
                                response3 = requests.get(url=url2, headers=headers, timeout=30)
                            response3.encoding = response3.apparent_encoding
                            print('三级连接状态:%d' % response3.status_code)
                            if response3.status_code == 200:
                                soup2 = BeautifulSoup(response3.text, 'lxml')
                                response3.close()
                                title = soup2.select('.dzb-title-box')[0].text.strip()
                                pass_list = ['上海地区今明天气', '上海市今明天气预报', '广告']
                                if title in pass_list:  # 筛除每天上海今明天气和广告
                                    time.sleep(random.randint(2, 8))
                                    continue
                                subtitle = soup2.select('.dzb-sub-title-box')[0].text.strip()
                                # 查找所有注释
                                comments = soup2.find_all(string=lambda text: isinstance(text, Comment))
                                author = ""
                                # 遍历注释,找到包含作者的注释
                                for comment in comments:
                                    if 'dzb-author-box' in comment:
                                        # 使用 BeautifulSoup 解析注释内容
                                        author_soup = BeautifulSoup(comment, 'html.parser')
                                        author = author_soup.find('span', class_='dzb-author-box').text
                                pretitle = soup2.select('.dzb-special-title-box')[0].text.strip()
                                content = parse_html_text(soup2)
                                shxinminwanbao.insert_one({'banmianhao': banmianhao,
                                                           'banmianming': banmianming,
                                                           'pretitle': pretitle,
                                                           'title': title,
                                                           'subtitle': subtitle,
                                                           'author': author,
                                                           'keywordlist': '',
                                                           'detail_url': url2,
                                                           'release_time': date_now,
                                                           'insert_timestamp': datetime.today(),
                                                           'content': content})
                                print('上海新民晚报-%s-%s-%s-已完成' % (date_now_s, banmianhao, title))
                                time.sleep(random.randint(2, 8))
                        print('上海新民晚报-%s-%s-已完成' % (date_now_s, banmianhao))
                print("上海新民晚报-%s-已经完成" % date_now_s)
        except Exception as result:
            shxinminwanbao.insert_one({'banmianhao': 'empty',
                                       'banmianming': 'empty',
                                       'title': 'empty',
                                       'subtitle': 'empty',
                                       'h3title': 'empty',
                                       'author': 'empty',
                                       'keywordlist': 'empty',
                                       'detail_url': url,
                                       'release_time': date_now,
                                       'insert_timestamp': datetime.today(),
                                       'content': 'empty'})
            print(result)


if __name__ == '__main__':
    main()
    print("爬取完毕!")
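The author field on the Xinmin pages is hidden inside an HTML comment, which is why the script scans Comment nodes rather than regular tags. A minimal standalone sketch of that technique, using a made-up HTML fragment (the real pages differ), looks like this:

# Minimal sketch: extracting a value that lives inside an HTML comment.
# The HTML fragment below is invented for illustration only.
from bs4 import BeautifulSoup, Comment

html = '<div><!-- <span class="dzb-author-box">本报记者 张三</span> --></div>'
soup = BeautifulSoup(html, 'html.parser')

author = ""
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    if 'dzb-author-box' in comment:
        # Parse the comment body itself as HTML to reach the span inside it
        author_soup = BeautifulSoup(comment, 'html.parser')
        author = author_soup.find('span', class_='dzb-author-box').text

print(author)  # 本报记者 张三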
72
地方政策/政策/上海/CrawlShanghaiZhengce.py
Normal file
@ -0,0 +1,72 @@
import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36',
           'Connection': 'close'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shanghaizcwj = mydb.shanghaizcwj
base_url = "https://www.shanghai.gov.cn"


def getContent(soup: BeautifulSoup) -> str:
    """
    获取文章正文内容
    :param soup:
    :return:
    """
    content: str = ""
    for p in soup.select('#ivs_content p'):
        para: str = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def getData():
    pages = 28
    for i in range(1, pages + 1):
        if i == 1:
            url = "https://www.shanghai.gov.cn/xxzfgzwj/index.html"
        else:
            url = f"https://www.shanghai.gov.cn/xxzfgzwj/index_{i}.html"
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml")
            # print(response.text)
            trList = soup.select(".trout-region-list tbody tr")
            for item in trList:
                data = item.select("a")[0]
                title = data.get("title", "")
                url = base_url + data.get("href", "")
                print(url)
                if shanghaizcwj.find_one({"url": url}):
                    continue
                subtitle = data.select_one(".text-color").text.strip()
                response2 = requests.get(url=url, headers=headers)
                response2.encoding = response2.apparent_encoding
                print(response2.status_code)
                if response2.status_code == 200:
                    soup2 = BeautifulSoup(response2.text, "lxml")
                    content: str = getContent(soup=soup2)
                    shanghaizcwj.insert_one({
                        "title": title,
                        "subtitle": subtitle,
                        "content": content,
                        "url": url,
                    })
                    time.sleep(random.randint(3, 5))
                    print(title, "采集完成")


getData()
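The script above deduplicates by calling find_one on the document URL before inserting. If duplicate protection is also wanted at write time, one option is a unique index on the url field; this is a suggestion for illustration, not something the script currently does.

# Optional hardening (not in the original script): enforce URL uniqueness at the database level.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
shanghaizcwj = client.sjzf_zcwj.shanghaizcwj

# With a unique index, insert_one raises DuplicateKeyError instead of silently storing duplicates.
shanghaizcwj.create_index([("url", pymongo.ASCENDING)], unique=True)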
153
地方政策/政策/新疆/crawl/Crawlxjzfgz.py
Normal file
@ -0,0 +1,153 @@
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/27 22:28
# @Author : Haochen Zhong
# @File : Crawlxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序采集新疆维吾尔自治区人民政府规章库
import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36',
           'Connection': 'close'}
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
xinjiangzcwj = mydb.xinjiangzcwj


def getContent(soup: BeautifulSoup) -> str:
    """
    获取文章正文内容
    :param soup:
    :return:
    """
    content: str = ""
    for p in soup.select('.gknbxq_detail p'):
        para: str = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def getData():
    """程序主函数"""
    count = 10000
    """设置单次获取文章数量,可以任意设置正整数"""
    dataUrl = "https://www.xinjiang.gov.cn/interface-cms/qryManuscriptByWebsiteId"
    """请求所有文章数据连接"""
    dataJson = {
        "websiteId": "2a4092ca8c2a4255bfec9f13f114aba6",
        "channelId": [
            "2aceb5d534434a9fb3550295b52a87e5"
        ],
        "domainMetaList": [
            {}
        ],
        "pageSize": f"{count}",
        "pageNum": 1,
        "title": None
    }
    """请求参数"""
    response = requests.post(url=dataUrl, headers=headers, json=dataJson, timeout=60)
    response.encoding = response.apparent_encoding
    print(f"一级链接状态:{response.status_code}")
    if response.status_code == 200:
        dataList = response.json()["results"]
        for item in dataList:
            try:
                url: str = item["websiteDomain"] + item["url"]
                """文章链接"""
                result = xinjiangzcwj.find_one({"url": url})
                if result:
                    continue
                typeOneName: str = item["channelName"]
                """文章归类"""
                title: str = item["title"]
                """文章标题"""
                subTitle: str = item["subTitle"]
                """文章副标题"""
                if item["publishedTime"]:
                    pubtime: float = datetime.datetime.strptime(item["publishedTime"], "%Y-%m-%d").timestamp()
                    """发布日期"""
                else:
                    pubtime: float = 0
                    """发布日期"""
                puborg: str = item["domainMetaList"]["xxgkml"]["resultList"]["fwjg2"]["cnName"]
                """发文机关(自治区)"""
                articleType: str = item["domainMetaList"]["xxgkml"]["resultList"]["gwzl2"]["cnName"]
                """公文种类"""
                if item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"]:
                    ptime: float = datetime.datetime.strptime(
                        item["domainMetaList"]["xxgkml"]["resultList"]["cwrq2"]["cnName"],
                        "%Y-%m-%d").timestamp()
                    """成文日期"""
                else:
                    ptime: float = 0
                    """成文日期"""
                index: str = item["domainMetaList"]["xxgkml"]["resultList"]["syh2"]["cnName"]
                """索引号"""
                pcode: str = item["domainMetaList"]["xxgkml"]["resultList"]["wenh2"]["cnName"]
                """文号"""
                effectiveness: str = item["domainMetaList"]["xxgkml"]["resultList"]["yxx01"]["cnName"]
                """有效性"""
                typeSecondName: str = item["domainMetaList"]["xxgkml"]["resultList"]["wz2"]["cnName"]
                """文种(自治区)"""
                year: str = item["domainMetaList"]["xxgkml"]["resultList"]["nianf2"]["cnName"]
                """年份"""
                childtype: str = item["domainMetaList"]["xxgkml"]["resultList"]["ztfl2"]["cnName"]
                """主题分类"""
                author: str = item["domainMetaList"]["默认元数据集"]["resultList"]["author"]["cnName"]
                """作者"""
                source: str = item["domainMetaList"]["默认元数据集"]["resultList"]["source"]["cnName"]
                """来源"""
                if item["manuscriptRelatedRes"]:
                    manuscriptRelatedRes: str = item["websiteDomain"] + item["manuscriptRelatedRes"]
                    """附件链接"""
                else:
                    manuscriptRelatedRes: str = ""
                    """附件链接"""
                response = requests.get(url=url, headers=headers, timeout=60)
                response.encoding = response.apparent_encoding
                print(f"二级链接状态:{response.status_code}")
                if response.status_code == 200:
                    soup: BeautifulSoup = BeautifulSoup(response.text, "html.parser")
                    content: str = getContent(soup=soup)
                    xinjiangzcwj.insert_one(
                        {
                            'typeOneName': typeOneName,
                            'typeSecondName': typeSecondName,
                            'articleType': articleType,
                            "title": title,
                            "subTitle": subTitle,
                            "childtype": childtype,
                            "index": index,
                            "pcode": pcode,
                            "puborg": puborg,
                            "ptime": ptime,
                            "pubtime": pubtime,
                            "effectiveness": effectiveness,
                            "author": author,
                            "year": year,
                            "manuscriptRelatedRes": manuscriptRelatedRes,
                            "url": url,
                            "source": source,
                            "content": content
                        }
                    )
                    print(f"{typeOneName}--{typeSecondName}--{title}-已完成")
                    time.sleep(random.randint(3, 8))
            except Exception as e:
                print(e)
                continue


if __name__ == '__main__':
    getData()
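Every metadata field above is read with chained bracket lookups, so a single missing key sends the whole article into the except branch. A hedged alternative (a hypothetical helper, not used by the script) is a small getter that walks the nested dictionaries and falls back to a default:

# Illustrative helper (hypothetical, not part of Crawlxjzfgz.py): tolerant lookup of nested metadata.
def get_meta(item, *path, default=""):
    """Walk nested dicts along `path`, returning `default` as soon as a key is missing."""
    current = item
    for key in path:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


# Example with the same key path the crawler uses for the issuing organ (sample data invented here):
item = {"domainMetaList": {"xxgkml": {"resultList": {"fwjg2": {"cnName": "自治区人民政府"}}}}}
print(get_meta(item, "domainMetaList", "xxgkml", "resultList", "fwjg2", "cnName"))  # 自治区人民政府
print(get_meta(item, "domainMetaList", "xxgkml", "resultList", "wenh2", "cnName"))  # "" (missing key)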
185
地方政策/政策/新疆/export/Exportxjzfgz.py
Normal file
@ -0,0 +1,185 @@
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/28 0:50
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序用于导出新疆维吾尔自治区人民政府规章

import datetime
import os
import time

import pandas as pd
import pymongo
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm

client = pymongo.MongoClient('localhost', 27017)
"""与mongoDB数据库建立连接"""
mydb = client.sjzf_zcwj
"""政策文件存放在数据库的一级目录对象"""
xinjiangzcwj = mydb.xinjiangzcwj
"""政策文件存放对象"""

savePath = ""
"""导出文件存放路径"""


def replace_invalid_chars(text):
    """
    替换Windows系统和Linux系统文件路径禁止字符,统一转换成HTML实体编码
    """
    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
    """Windows系统和Linux系统文件路径禁止字符列表"""
    replace_char = ['&lt;', '&gt;', '&#58;', '&quot;', '&#47;', '&#92;', '&#124;', '&#63;', '&#42;']
    """Windows系统和Linux系统文件路径禁止字符替换列表 统一转换成HTML实体编码"""

    for i, char in enumerate(invalid_chars):
        text = text.replace(char, replace_char[i])
    return text


def analysisTime(timestamp: int) -> str:
    """
    处理时间,将1970-01-01之前的时间戳正确转换
    """
    if timestamp == 0:
        return "未知"
    if timestamp < 0:
        # 计算从 1970-01-01 开始的时间间隔
        delta = datetime.timedelta(seconds=abs(timestamp))
        date = datetime.datetime(1970, 1, 1) - delta
    else:
        date = datetime.datetime.fromtimestamp(timestamp)
    # 格式化为字符串
    return date.strftime('%Y-%m-%d')


def saveFile():
    num = 0
    startTime = time.time()
    global savePath
    query = {
        'typeOneName': "",
        'typeSecondName': "",
        'articleType': "",
        "title": "",
        "subTitle": "",
        "childtype": "",
        "index": "",
        "pcode": "",
        "puborg": "",
        "ptime": "",
        "pubtime": "",
        "effectiveness": "",
        "author": "",
        "year": "",
        "manuscriptRelatedRes": "",
        "url": "",
        "source": "",
        "content": ""
    }
    query = {f'{k}': v for k, v in query.items() if v}
    """需要过滤的文章,默认不过滤"""
    dataList = list(xinjiangzcwj.find(query))
    if not savePath:
        savePath = input("请输入数据存放路径:")
    totalPath = os.path.join(savePath, "数据统计表.csv")
    for data in dataList:
        try:
            typeOneName = data["typeOneName"]
            """一级分类目录"""
            typeSecondName = data["typeSecondName"]
            """二级分类目录"""
            articleType = data["articleType"]
            """四级分类目录"""
            # 创建目录
            output_directory = os.path.join(savePath, typeOneName, typeSecondName)
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)
            doc = Document()
            firstLine = doc.add_paragraph()
            firstLineText = f"索引号:{data['index']}\t\t有效性:{data['effectiveness']}"
            firstLine_run = firstLine.add_run(firstLineText)
            firstLine_run.font.size = Pt(12)
            firstLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            firstLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            secondLine = doc.add_paragraph()
            secondLineText = f"发文机关:{data['puborg']}\t\t发文字号:{data['pcode']}"
            secondLine_run = secondLine.add_run(secondLineText)
            secondLine_run.font.size = Pt(12)
            secondLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            secondLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            thirdLine = doc.add_paragraph()
            thirdLineText = f"标题:{data['title']}"
            thirdLine_run = thirdLine.add_run(thirdLineText)
            thirdLine_run.font.size = Pt(12)
            thirdLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            thirdLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            fourLine = doc.add_paragraph()
            pubtime = analysisTime(data['pubtime'])
            ptime = analysisTime(data['ptime'])
            fourLineText = f"成文日期:{ptime}\t\t发布日期:{pubtime}"
            fourLine_run = fourLine.add_run(fourLineText)
            fourLine_run.font.size = Pt(12)
            fourLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            fourLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            urlLine = doc.add_paragraph()
            urlLineText = f"文章链接:{data['url']}"
            urlLine_run = urlLine.add_run(urlLineText)
            urlLine_run.font.size = Pt(12)
            urlLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
            urlLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")

            title = doc.add_paragraph()
            title_run = title.add_run(data["title"])
            title_run.bold = True
            title_run.font.size = Pt(22)
            title_run.font.name = 'Times New Roman'  # 设置标题西文字体
            title_run.element.rPr.rFonts.set(qn('w:eastAsia'), "华文中宋")
            title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER  # 设置大标题居中对齐

            for section in data["content"].split("\n"):
                paragraph = doc.add_paragraph()
                run = paragraph.add_run("\t" + section)
                run.font.size = Pt(16)
                run.font.name = "Times New Roman"
                run.element.rPr.rFonts.set(qn('w:eastAsia'), "仿宋")
                paragraph.paragraph_format.first_line_indent = Cm(0.74)  # 设置首行缩进

            if data["manuscriptRelatedRes"]:
                urlLine = doc.add_paragraph()
                urlLineText = f"附件链接:{data['manuscriptRelatedRes']}"
                urlLine_run = urlLine.add_run(urlLineText)
                urlLine_run.font.size = Pt(12)
                urlLine_run.font.name = 'Times New Roman'  # 设置标题西文字体
                urlLine_run.element.rPr.rFonts.set(qn('w:eastAsia'), "楷体")
            if len(data["title"]) > 45:
                title_ = data["title"][len(data["title"]) - 30:]
            else:
                title_ = data["title"]
            fileName = f"{replace_invalid_chars(title_)}.docx"
            filePath = os.path.join(output_directory, fileName)
            doc.save(filePath)
            num += 1
            print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功!")
        except Exception as e:
            print(e)
            continue
    csvData = pd.DataFrame(dataList)
    csvData.columns = ["数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题", "文章副标题", "主题分类",
                       "索引号", "文号", "发文机关(自治区)", "成文日期时间戳", "发布日期时间戳", "有效性", "作者", "年份",
                       "附件链接",
                       "文章链接", "来源", "正文内容"]
    csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
    print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")


if __name__ == '__main__':
    saveFile()
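A quick sanity check of analysisTime, with illustrative values only; the import assumes the script above is importable as Exportxjzfgz from the current directory, which is an assumption about how it is deployed.

# Illustrative usage of analysisTime (module path is an assumption).
from Exportxjzfgz import analysisTime

print(analysisTime(0))           # "未知" for missing dates
print(analysisTime(-86400))      # one day before 1970-01-01 -> "1969-12-31"
print(analysisTime(1693180800))  # positive timestamps go through fromtimestamp(); exact date depends on the local timezone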
62
地方政策/政策/深圳/CrawlShenZhen.py
Normal file
@ -0,0 +1,62 @@
import asyncio
import datetime
import random
import time

import pymongo
import requests
from httpx import AsyncClient

# 模拟用户访问
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36', }
# 创建数据库
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shenzhenzcwj = mydb.shenzhenzcwj

yearList = ['158104', '148604', '141910', '125615', '103604', '101620', '101621', '101622', '101623', '101624',
            '101625', '101626', '101627', '101628', '101629', '101630', '101631', '101632', '101633', '101634',
            '101635', '101636', '101637', '101638', '146351', '146338', '146325', '146311', '146298', '146285',
            '146272', '146205', '146190', '145973', '145972', '145970']


def update_json_data(original_data, new_data):
    # 遍历新数据的键值对
    for key, value in new_data.items():
        # 如果新数据的值不为 None 或者空字符串,更新原数据
        if value is not None and value != "":
            original_data[key] = value
    return original_data


async def getData():
    async with AsyncClient(headers=headers, timeout=60, verify=False) as client:
        for i in yearList:
            url = f"http://www.sz.gov.cn/postmeta/i/{i}.json"
            print(url)
            response = await client.get(url=url)
            response.encoding = response.charset_encoding
            print(response.status_code)
            if response.status_code == 200:
                for item in response.json()["children"]:
                    url2 = f"http://www.sz.gov.cn/postmeta/i/{item['id']}.json"
                    print(url2)
                    response2 = await client.get(url=url2)
                    response2.encoding = response2.charset_encoding
                    print(response2.status_code)
                    if response2.status_code == 200:
                        for item2 in response2.json()["articles"][1:]:
                            if shenzhenzcwj.find_one({"id": item2["id"]}):
                                continue
                            url3 = f"http://www.sz.gov.cn/postmeta/p/{item2['id'] // 1000000}/{item2['id'] // 1000}/{item2['id']}.json"
                            response3 = await client.get(url=url3)
                            response3.encoding = response3.charset_encoding
                            print(response3.status_code)
                            if response3.status_code == 200:
                                data = response3.json()
                                newData = update_json_data(item2, data)
                                shenzhenzcwj.insert_one(newData)
                                print(newData["title"], "采集完成")
                                await asyncio.sleep(random.randint(2, 3))


asyncio.run(getData())
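The article JSON above lives at a path derived from the numeric article id: id // 1000000 and id // 1000 become directory segments before the id itself. A small sketch of that mapping, with a made-up id value for illustration:

# Sketch of the postmeta URL scheme used above; the id is invented for illustration.
def postmeta_url(article_id: int) -> str:
    return f"http://www.sz.gov.cn/postmeta/p/{article_id // 1000000}/{article_id // 1000}/{article_id}.json"


print(postmeta_url(10203040))  # http://www.sz.gov.cn/postmeta/p/10/10203/10203040.json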