From 44f8f2e1fc617c9bbf3a8e050632a026ba4ff3da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9A=93=E6=9C=88=E5=BD=92=E5=B0=98?=
Date: Fri, 7 Feb 2025 23:07:43 +0800
Subject: [PATCH] fix: repair the Hongqi Wengao and Qiushi article-collection
 link errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 红旗/CrawlHqwg.py   | 266 ++++++++++++++++++++++++++++++++++++++++++++
 红旗/CrawlQiushi.py | 244 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 510 insertions(+)
 create mode 100644 红旗/CrawlHqwg.py
 create mode 100644 红旗/CrawlQiushi.py

diff --git a/红旗/CrawlHqwg.py b/红旗/CrawlHqwg.py
new file mode 100644
index 0000000..4df04d1
--- /dev/null
+++ b/红旗/CrawlHqwg.py
@@ -0,0 +1,266 @@
+# -*- coding: utf-8 -*-
+# @Time : 2021/12/2 20:34
+# @Author : Hongshuang Gu
+# @File : CrawlHqwg.py
+# @Software : PyCharm
+import asyncio
+import random
+import re
+from datetime import datetime
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+from motor.motor_asyncio import AsyncIOMotorClient
+
+# Connect to MongoDB
+client = AsyncIOMotorClient('mongodb://localhost:27017')
+db = client['zydm']
+collection = db['hqwg']
+# Default request headers sent with every crawl request
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
+
+
+async def main():
+    # If the collection already exists, update it incrementally; otherwise build it from scratch
+    collist = await db.list_collection_names()
+    if "hqwg" in collist:
+        print("Collection hqwg exists; updating the database")
+        searchRes = await collection.find({}).to_list(length=None)
+        Res = pd.DataFrame(searchRes)
+        # Deduplicated titles of the articles already stored
+        h1 = Res['title'].drop_duplicates().reset_index()
+        await upDate(h1)
+    else:
+        await getDate()
+
+
+# Extract the article body
+def parse_html_text(soup):
+    """
+    :param soup: parsed BeautifulSoup document
+    :return: body text as a single string
+    """
+    content = ''
+    for p in soup.select('.highlight p'):
+        para = p.text.strip()
+        if para:
+            content += para
+            content += '\n'
+    return content
+
+
+def parse_author(soup):
+    all_name = soup.select('.headtitle') or soup.select('.metadata')
+    # Guard against pages without an author line instead of indexing blindly
+    found = re.findall(r'作者:(.*)', str(all_name)) if all_name else []
+    return found[0] if found else ''
+
+
+def parse_time(soup):
+    if soup.select('.pubtime'):
+        str_time = soup.select('.pubtime')
+        release_time = datetime.strptime(str_time[0].text.strip(), '%Y-%m-%d %H:%M:%S')
+    else:
+        str_time = soup.select('.headtitle span') or soup.select('.metadata')
+        find_time = re.findall(r'([0-9]{4}年[0-9]{2}月[0-9]{2}日 [0-9]{2}:[0-9]{2}:[0-9]{2})', str(str_time))
+        release_time = datetime.strptime(find_time[0].strip(), '%Y年%m月%d日 %H:%M:%S')
+    return release_time
+
+
+# Crawl the site and build the collection from scratch
+async def getDate():
+    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    # Resolve relative links against the site root
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《红旗文稿》' in item1.text:
+                                # e.g. "《红旗文稿》第23期" -> issue number "23"
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '红旗文稿' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        response4 = await client.get(link)
+                                        response4.encoding = response4.charset_encoding
+                                        print('Level-4 link status %d' % response4.status_code)
+                                        if response4.status_code == 200:
+                                            soup3 = BeautifulSoup(response4.text, "lxml")
+                                            if soup3.select('h1'):
+                                                release_time = parse_time(soup3)
+                                                content = parse_html_text(soup3)
+                                                author = parse_author(soup3)
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': author,
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': release_time,
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': content})
+                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            else:
+                                                # The page is a JS stub; follow the window.location.href redirect
+                                                real_page = soup3.select('script')
+                                                real_url = re.findall(r'window.location.href="(.*?)"', str(real_page))
+                                                response5 = await client.get(real_url[0])
+                                                response5.encoding = response5.charset_encoding
+                                                print('Level-5 link status %d' % response5.status_code)
+                                                if response5.status_code == 200:
+                                                    soup4 = BeautifulSoup(response5.text, "lxml")
+                                                    release_time = parse_time(soup4)
+                                                    content = parse_html_text(soup4)
+                                                    author = parse_author(soup4)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                        await asyncio.sleep(random.randint(5, 20))
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+# Incremental update: fetch only articles whose titles are not yet stored
+async def upDate(h1):
+    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《红旗文稿》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '红旗文稿' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        # regex=False: titles may contain regex metacharacters such as '('
+                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
+                                            response4 = await client.get(link)
+                                            response4.encoding = response4.charset_encoding
+                                            print('Level-4 link status %d' % response4.status_code)
+                                            if response4.status_code == 200:
+                                                soup3 = BeautifulSoup(response4.text, "lxml")
+                                                if soup3.select('h1'):
+                                                    release_time = parse_time(soup3)
+                                                    content = parse_html_text(soup3)
+                                                    author = parse_author(soup3)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                                else:
+                                                    # JS stub page; follow the window.location.href redirect
+                                                    real_page = soup3.select('script')
+                                                    real_url = re.findall(r'window.location.href="(.*?)"',
+                                                                          str(real_page))
+                                                    response5 = await client.get(real_url[0])
+                                                    response5.encoding = response5.charset_encoding
+                                                    print('Level-5 link status %d' % response5.status_code)
+                                                    if response5.status_code == 200:
+                                                        soup4 = BeautifulSoup(response5.text, "lxml")
+                                                        release_time = parse_time(soup4)
+                                                        content = parse_html_text(soup4)
+                                                        author = parse_author(soup4)
+                                                        await collection.insert_one({'banmianhao': banmianhao,
+                                                                                     'banmianming': banmianming,
+                                                                                     'title': title,
+                                                                                     'subtitle': 'empty',
+                                                                                     'author': author,
+                                                                                     'keywordlist': 'empty',
+                                                                                     'detail_url': link,
+                                                                                     'release_time': release_time,
+                                                                                     'insert_timestamp': datetime.today(),
+                                                                                     'content': content})
+                                                        print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            # One polite pause per newly fetched article
+                                            await asyncio.sleep(random.randint(5, 20))
+                                        else:
+                                            print('%s already exists' % title)
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+    print("Crawl finished!")
diff --git a/红旗/CrawlQiushi.py b/红旗/CrawlQiushi.py
new file mode 100644
index 0000000..a5dfaf1
--- /dev/null
+++ b/红旗/CrawlQiushi.py
@@ -0,0 +1,244 @@
+# -*- coding: utf-8 -*-
+# @Time : 2021/12/2 20:34
+# @Author : Hongshuang Gu
+# @File : CrawlQiushi.py
+# @Software : PyCharm
+
+
+import asyncio
+import random
+from datetime import datetime
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+from motor.motor_asyncio import AsyncIOMotorClient
+
+# Connect to MongoDB
+client = AsyncIOMotorClient('mongodb://localhost:27017')
+db = client['zydm']
+collection = db['qiushi']
+# Default request headers sent with every crawl request
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
+
+
+async def main():
+    # If the collection already exists, update it incrementally; otherwise build it from scratch
+    collist = await db.list_collection_names()
+    if "qiushi" in collist:
+        print("Collection qiushi exists; updating the database")
+        searchRes = await collection.find({}).to_list(length=None)
+        Res = pd.DataFrame(searchRes)
+        # Deduplicated titles of the articles already stored
+        h1 = Res['title'].drop_duplicates().reset_index()
+        await upDate(h1)
+    else:
+        await getDate()
+
+
+# Extract the article body
+def parse_html_text(soup):
+    """
+    :param soup: parsed BeautifulSoup document
+    :return: body text as a single string
+    """
+    content = ''
+    for p in soup.select('.highlight p'):
+        para = p.text.strip()
+        if para:
+            content += para
+            content += '\n'
+    return content
+
+
+def parse_author(soup):
+    all_name = soup.select('.appellation')
+    if all_name:
+        name = all_name[-1].text
+    else:
+        name = ''
+    return name
+
+
+# Crawl the site and build the collection from scratch
+async def getDate():
+    url = "http://www.qstheory.cn/qs/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    # Resolve relative links against the site root
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《求是》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '求是' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        response4 = await client.get(link)
+                                        response4.encoding = response4.charset_encoding
+                                        print('Level-4 link status %d' % response4.status_code)
+                                        if response4.status_code == 200:
+                                            soup3 = BeautifulSoup(response4.text, "lxml")
+                                            if soup3.select('h1'):
+                                                author = parse_author(soup3)
+                                                if soup3.select('.pubtime'):
+                                                    str_time = soup3.select('.pubtime')[0].text.strip()
+                                                    release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
+                                                else:
+                                                    str_time = soup3.select('.headtitle span')[0].text.strip()
+                                                    release_time = datetime.strptime(str_time, '%Y年%m月%d日 %H:%M:%S')
+                                                content = parse_html_text(soup3)
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': author,
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': release_time,
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': content})
+                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            else:
+                                                # No <h1>: store a placeholder so the gap is visible
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': 'empty',
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': 'empty',
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': 'empty'})
+                                                print("%s has no content" % title)
+                                        await asyncio.sleep(random.randint(5, 20))
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+# Incremental update: fetch only articles whose titles are not yet stored
+async def upDate(h1):
+    url = "http://www.qstheory.cn/qs/mulu.htm"
+    # Start from the table of contents
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 link status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if not book_link.startswith("http"):
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 link status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《求是》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '求是' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 link status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        # regex=False: titles may contain regex metacharacters such as '('
+                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
+                                            response4 = await client.get(link)
+                                            response4.encoding = response4.charset_encoding
+                                            print('Level-4 link status %d' % response4.status_code)
+                                            if response4.status_code == 200:
+                                                soup3 = BeautifulSoup(response4.text, "lxml")
+                                                if soup3.select('h1'):
+                                                    author = parse_author(soup3)
+                                                    if soup3.select('.pubtime'):
+                                                        str_time = soup3.select('.pubtime')[0].text.strip()
+                                                        release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
+                                                    else:
+                                                        str_time = soup3.select('.headtitle span')[0].text.strip()
+                                                        release_time = datetime.strptime(str_time,
+                                                                                         '%Y年%m月%d日 %H:%M:%S')
+                                                    content = parse_html_text(soup3)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                                else:
+                                                    # No <h1>: store a placeholder so the gap is visible
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': 'empty',
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': 'empty',
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': 'empty'})
+                                                    print("%s has no content" % title)
+                                            await asyncio.sleep(random.randint(5, 20))
+                                        else:
+                                            print('%s already exists' % title)
+    except Exception as result:
+        # On failure, store a placeholder record so the broken run is visible
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+    print("Crawl finished!")
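
For reference, the incremental-update path in both files reduces to one membership test against the titles already stored, and the regex=False flag is the load-bearing detail: pandas' str.contains treats its argument as a regular expression by default, so a title containing '(' or '*' raises re.error, trips the outer except, and aborts the whole run. A minimal standalone sketch of that check follows; the sample titles and the is_new helper are illustrative, not part of the patch.

# Standalone sketch of the title-dedup check used by upDate(); the sample
# DataFrame and the is_new() helper are illustrative only.
import pandas as pd

stored = pd.DataFrame({'title': ['正确认识和把握共同富裕', '新发展阶段(笔谈)']})

def is_new(title, h1):
    # regex=False keeps titles with '(', '*', '?' from being parsed as regexes
    return h1[h1['title'].str.contains(title, regex=False)].empty

print(is_new('全过程人民民主', stored))    # True: not stored yet, would be crawled
print(is_new('新发展阶段(笔谈)', stored))  # False: already stored, skipped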
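
Both scripts expose the same main() entry point, which either builds its collection from scratch or updates it in place. A minimal sketch of driving them together, assuming the 红旗/ directory is importable and MongoDB is listening on localhost:27017; the run_all wrapper is hypothetical, not part of the patch.

# Hypothetical driver running both crawlers in sequence; assumes the two
# modules are on sys.path and a local MongoDB instance is reachable.
import asyncio

import CrawlHqwg
import CrawlQiushi

async def run_all():
    await CrawlHqwg.main()    # builds or updates db['zydm']['hqwg']
    await CrawlQiushi.main()  # builds or updates db['zydm']['qiushi']

asyncio.run(run_all())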