From 44f8f2e1fc617c9bbf3a8e050632a026ba4ff3da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9A=93=E6=9C=88=E5=BD=92=E5=B0=98?=
Date: Fri, 7 Feb 2025 23:07:43 +0800
Subject: [PATCH] fix: repair the Hongqi Wengao and Qiushi article-collection
 link errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 红旗/CrawlHqwg.py   | 266 ++++++++++++++++++++++++++++++++++++++++++++
 红旗/CrawlQiushi.py | 244 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 510 insertions(+)
 create mode 100644 红旗/CrawlHqwg.py
 create mode 100644 红旗/CrawlQiushi.py

diff --git a/红旗/CrawlHqwg.py b/红旗/CrawlHqwg.py
new file mode 100644
index 0000000..4df04d1
--- /dev/null
+++ b/红旗/CrawlHqwg.py
@@ -0,0 +1,266 @@
+# -*- coding: utf-8 -*-
+# @Time : 2021/12/2 20:34
+# @Author : Hongshuang Gu
+# @File : CrawlHqwg.py
+# @Software : PyCharm
+import asyncio
+import random
+import re
+from datetime import datetime
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+from motor.motor_asyncio import AsyncIOMotorClient
+
+# Connect to MongoDB
+client = AsyncIOMotorClient('mongodb://localhost:27017')
+db = client['zydm']
+collection = db['hqwg']
+# Default request headers sent with every crawl request
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
+
+
+async def main():
+    # If the collection already exists, update it incrementally; otherwise build it from scratch
+    collist = await db.list_collection_names()
+    if "hqwg" in collist:
+        print("Collection hqwg exists; updating the database")
+        searchRes = await collection.find({}).to_list(length=None)
+        Res = pd.DataFrame(searchRes)
+        # Deduplicated titles of the articles already stored
+        h1 = Res['title'].drop_duplicates().reset_index()
+        await upDate(h1)
+    else:
+        await getDate()
+
+
+# Extract the article body
+def parse_html_text(soup):
+    """
+    :param soup: parsed BeautifulSoup document
+    :return: body text as a single string
+    """
+    content = ''
+    for p in soup.select('.highlight p'):
+        para = p.text.strip()
+        if para:
+            content += para
+            content += '\n'
+    return content
+
+
+def parse_author(soup):
+    all_name = soup.select('.headtitle') or soup.select('.metadata')
+    # Guard against pages without an author line instead of indexing blindly
+    found = re.findall(r'作者:(.*)', str(all_name)) if all_name else []
+    return found[0] if found else ''
+
+
+def parse_time(soup):
+    if soup.select('.pubtime'):
+        str_time = soup.select('.pubtime')
+        release_time = datetime.strptime(str_time[0].text.strip(), '%Y-%m-%d %H:%M:%S')
+    else:
+        str_time = soup.select('.headtitle span') or soup.select('.metadata')
+        find_time = re.findall(r'([0-9]{4}年[0-9]{2}月[0-9]{2}日 [0-9]{2}:[0-9]{2}:[0-9]{2})', str(str_time))
+        release_time = datetime.strptime(find_time[0].strip(), '%Y年%m月%d日 %H:%M:%S')
+    return release_time
+
+
+# Crawl the site and build the collection from scratch
+async def getDate():
+    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    # Resolve relative links against the site root
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《红旗文稿》' in item1.text:
+                                # e.g. "《红旗文稿》第23期" -> issue number "23"
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '红旗文稿' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        response4 = await client.get(link)
+                                        response4.encoding = response4.charset_encoding
+                                        print('Level-4 link status %d' % response4.status_code)
+                                        if response4.status_code == 200:
+                                            soup3 = BeautifulSoup(response4.text, "lxml")
+                                            if soup3.select('h1'):
+                                                release_time = parse_time(soup3)
+                                                content = parse_html_text(soup3)
+                                                author = parse_author(soup3)
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': author,
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': release_time,
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': content})
+                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            else:
+                                                # The page is a JS stub; follow the window.location.href redirect
+                                                real_page = soup3.select('script')
+                                                real_url = re.findall(r'window.location.href="(.*?)"', str(real_page))
+                                                response5 = await client.get(real_url[0])
+                                                response5.encoding = response5.charset_encoding
+                                                print('Level-5 link status %d' % response5.status_code)
+                                                if response5.status_code == 200:
+                                                    soup4 = BeautifulSoup(response5.text, "lxml")
+                                                    release_time = parse_time(soup4)
+                                                    content = parse_html_text(soup4)
+                                                    author = parse_author(soup4)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                        await asyncio.sleep(random.randint(5, 20))
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+# Incremental update: fetch only articles whose titles are not yet stored
+async def upDate(h1):
+    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《红旗文稿》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '红旗文稿' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        # regex=False: titles may contain regex metacharacters such as '('
+                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
+                                            response4 = await client.get(link)
+                                            response4.encoding = response4.charset_encoding
+                                            print('Level-4 link status %d' % response4.status_code)
+                                            if response4.status_code == 200:
+                                                soup3 = BeautifulSoup(response4.text, "lxml")
+                                                if soup3.select('h1'):
+                                                    release_time = parse_time(soup3)
+                                                    content = parse_html_text(soup3)
+                                                    author = parse_author(soup3)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                                else:
+                                                    # JS stub page; follow the window.location.href redirect
+                                                    real_page = soup3.select('script')
+                                                    real_url = re.findall(r'window.location.href="(.*?)"',
+                                                                          str(real_page))
+                                                    response5 = await client.get(real_url[0])
+                                                    response5.encoding = response5.charset_encoding
+                                                    print('Level-5 link status %d' % response5.status_code)
+                                                    if response5.status_code == 200:
+                                                        soup4 = BeautifulSoup(response5.text, "lxml")
+                                                        release_time = parse_time(soup4)
+                                                        content = parse_html_text(soup4)
+                                                        author = parse_author(soup4)
+                                                        await collection.insert_one({'banmianhao': banmianhao,
+                                                                                     'banmianming': banmianming,
+                                                                                     'title': title,
+                                                                                     'subtitle': 'empty',
+                                                                                     'author': author,
+                                                                                     'keywordlist': 'empty',
+                                                                                     'detail_url': link,
+                                                                                     'release_time': release_time,
+                                                                                     'insert_timestamp': datetime.today(),
+                                                                                     'content': content})
+                                                        print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            # One polite pause per newly fetched article
+                                            await asyncio.sleep(random.randint(5, 20))
+                                        else:
+                                            print('%s already exists' % title)
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+    print("Crawl finished!")
diff --git a/红旗/CrawlQiushi.py b/红旗/CrawlQiushi.py
new file mode 100644
index 0000000..a5dfaf1
--- /dev/null
+++ b/红旗/CrawlQiushi.py
@@ -0,0 +1,244 @@
+# -*- coding: utf-8 -*-
+# @Time : 2021/12/2 20:34
+# @Author : Hongshuang Gu
+# @File : CrawlQiushi.py
+# @Software : PyCharm
+
+
+import asyncio
+import random
+from datetime import datetime
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+from motor.motor_asyncio import AsyncIOMotorClient
+
+# Connect to MongoDB
+client = AsyncIOMotorClient('mongodb://localhost:27017')
+db = client['zydm']
+collection = db['qiushi']
+# Default request headers sent with every crawl request
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
+
+
+async def main():
+    # If the collection already exists, update it incrementally; otherwise build it from scratch
+    collist = await db.list_collection_names()
+    if "qiushi" in collist:
+        print("Collection qiushi exists; updating the database")
+        searchRes = await collection.find({}).to_list(length=None)
+        Res = pd.DataFrame(searchRes)
+        # Deduplicated titles of the articles already stored
+        h1 = Res['title'].drop_duplicates().reset_index()
+        await upDate(h1)
+    else:
+        await getDate()
+
+
+# Extract the article body
+def parse_html_text(soup):
+    """
+    :param soup: parsed BeautifulSoup document
+    :return: body text as a single string
+    """
+    content = ''
+    for p in soup.select('.highlight p'):
+        para = p.text.strip()
+        if para:
+            content += para
+            content += '\n'
+    return content
+
+
+def parse_author(soup):
+    all_name = soup.select('.appellation')
+    if all_name:
+        name = all_name[-1].text
+    else:
+        name = ''
+    return name
+
+
+# Crawl the site and build the collection from scratch
+async def getDate():
+    url = "http://www.qstheory.cn/qs/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    # Resolve relative links against the site root
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《求是》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '求是' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        response4 = await client.get(link)
+                                        response4.encoding = response4.charset_encoding
+                                        print('Level-4 link status %d' % response4.status_code)
+                                        if response4.status_code == 200:
+                                            soup3 = BeautifulSoup(response4.text, "lxml")
+                                            if soup3.select('h1'):
+                                                author = parse_author(soup3)
+                                                if soup3.select('.pubtime'):
+                                                    str_time = soup3.select('.pubtime')[0].text.strip()
+                                                    release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
+                                                else:
+                                                    str_time = soup3.select('.headtitle span')[0].text.strip()
+                                                    release_time = datetime.strptime(str_time, '%Y年%m月%d日 %H:%M:%S')
+                                                content = parse_html_text(soup3)
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': author,
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': release_time,
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': content})
+                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            else:
+                                                # No <h1>: store a placeholder so the gap is visible
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': 'empty',
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': 'empty',
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': 'empty'})
+                                                print("%s has no content" % title)
+                                        await asyncio.sleep(random.randint(5, 20))
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+# Incremental update: fetch only articles whose titles are not yet stored
+async def upDate(h1):
+    url = "http://www.qstheory.cn/qs/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《求是》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '求是' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        # regex=False: titles may contain regex metacharacters such as '('
+                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
+                                            response4 = await client.get(link)
+                                            response4.encoding = response4.charset_encoding
+                                            print('Level-4 link status %d' % response4.status_code)
+                                            if response4.status_code == 200:
+                                                soup3 = BeautifulSoup(response4.text, "lxml")
+                                                if soup3.select('h1'):
+                                                    author = parse_author(soup3)
+                                                    if soup3.select('.pubtime'):
+                                                        str_time = soup3.select('.pubtime')[0].text.strip()
+                                                        release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
+                                                    else:
+                                                        str_time = soup3.select('.headtitle span')[0].text.strip()
+                                                        release_time = datetime.strptime(str_time,
+                                                                                         '%Y年%m月%d日 %H:%M:%S')
+                                                    content = parse_html_text(soup3)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                                else:
+                                                    # No <h1>: store a placeholder so the gap is visible
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': 'empty',
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': 'empty',
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': 'empty'})
+                                                    print("%s has no content" % title)
+                                            await asyncio.sleep(random.randint(5, 20))
+                                        else:
+                                            print('%s already exists' % title)
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+    print("Crawl finished!")
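
For reference, the incremental-update path in both files reduces to one membership test against the titles already stored, and the regex=False flag is the load-bearing detail: pandas' str.contains treats its argument as a regular expression by default, so a title containing '(' or '*' raises re.error, trips the outer except, and aborts the whole run. A minimal standalone sketch of that check follows; the sample titles and the is_new helper are illustrative, not part of the patch.

# Standalone sketch of the title-dedup check used by upDate(); the sample
# DataFrame and the is_new() helper are illustrative only.
import pandas as pd

stored = pd.DataFrame({'title': ['正确认识和把握共同富裕', '新发展阶段(笔谈)']})

def is_new(title, h1):
    # regex=False keeps titles with '(', '*', '?' from being parsed as regexes
    return h1[h1['title'].str.contains(title, regex=False)].empty

print(is_new('全过程人民民主', stored))    # True: not stored yet, would be crawled
print(is_new('新发展阶段(笔谈)', stored))  # False: already stored, skipped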
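
Both scripts expose the same main() entry point, which either builds its collection from scratch or updates it in place. A minimal sketch of driving them together, assuming the 红旗/ directory is importable and MongoDB is listening on localhost:27017; the run_all wrapper is hypothetical, not part of the patch.

# Hypothetical driver running both crawlers in sequence; assumes the two
# modules are on sys.path and a local MongoDB instance is reachable.
import asyncio

import CrawlHqwg
import CrawlQiushi

async def run_all():
    await CrawlHqwg.main()    # builds or updates db['zydm']['hqwg']
    await CrawlQiushi.main()  # builds or updates db['zydm']['qiushi']

asyncio.run(run_all())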