diff --git a/红旗/CrawlHqwg.py b/红旗/CrawlHqwg.py
new file mode 100644
index 0000000..4df04d1
--- /dev/null
+++ b/红旗/CrawlHqwg.py
@@ -0,0 +1,266 @@
+# -*- coding: utf-8 -*-
+# @Time : 2021/12/2 20:34
+# @Author : Hongshuang Gu
+# @File : CrawlHqwg.py
+# @Software : PyCharm
+import asyncio
+import random
+import re
+from datetime import datetime
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+from motor.motor_asyncio import AsyncIOMotorClient
+
+# Connect to MongoDB
+client = AsyncIOMotorClient('mongodb://localhost:27017')
+db = client['zydm']
+collection = db['hqwg']
+# Request headers for all crawls
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
+
+
+async def main():
+    # Check whether the collection already exists
+    collist = await db.list_collection_names()
+    if "hqwg" in collist:  # collection exists: update it incrementally
+        print("红旗文稿 collection exists; updating the database")
+        searchRes = await collection.find({}).to_list(length=None)
+        Res = pd.DataFrame(searchRes)
+        h1 = Res['title'].drop_duplicates().reset_index()
+        await upDate(h1)
+    else:
+        await getDate()
+
+
+# Parse the article body out of a page
+def parse_html_text(soup):
+    """
+    :param soup: parsed BeautifulSoup document
+    :return: article body as a string
+    """
+    content = ''  # accumulate paragraph text
+    for p in soup.select('.highlight p'):
+        para = p.text.strip()
+        if para:
+            content += para
+            content += '\n'
+    return content
+
+
+def parse_author(soup):
+    all_name = soup.select('.headtitle') or soup.select('.metadata')
+    if all_name:
+        found = re.findall(r'作者:(.*)', str(all_name))
+        name = found[0] if found else ''
+    else:
+        name = ''
+    return name
+
+
+def parse_time(soup):
+    if soup.select('.pubtime'):
+        str_time = soup.select('.pubtime')
+        release_time = datetime.strptime(str_time[0].text.strip(), '%Y-%m-%d %H:%M:%S')
+    else:
+        str_time = soup.select('.headtitle span') or soup.select('.metadata')
+        find_time = re.findall(r'([0-9]{4}年[0-9]{2}月[0-9]{2}日 [0-9]{2}:[0-9]{2}:[0-9]{2})', str(str_time))
+        release_time = datetime.strptime(find_time[0].strip(), '%Y年%m月%d日 %H:%M:%S')
+    return release_time
+
+
+# Crawl the site and build the collection from scratch
+async def getDate():
+    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
+    # Start from the table-of-contents page
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 connection status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if "http" not in book_link:
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 connection status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《红旗文稿》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '红旗文稿' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 connection status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        response4 = await client.get(link)
+                                        response4.encoding = response4.charset_encoding
+                                        print('Level-4 connection status %d' % response4.status_code)
+                                        if response4.status_code == 200:
+                                            soup3 = BeautifulSoup(response4.text, "lxml")
+                                            if soup3.select('h1'):
+                                                release_time = parse_time(soup3)
+                                                content = parse_html_text(soup3)
+                                                author = parse_author(soup3)
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': author,
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': release_time,
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': content})
+                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            else:
+                                                # JS redirect stub: extract the real article URL from the script tag
+                                                real_page = soup3.select('script')
+                                                real_url = re.findall(r'window\.location\.href="(.*?)"', str(real_page))
+                                                response5 = await client.get(real_url[0])
+                                                response5.encoding = response5.charset_encoding
+                                                print('Level-5 connection status %d' % response5.status_code)
+                                                if response5.status_code == 200:
+                                                    soup4 = BeautifulSoup(response5.text, "lxml")
+                                                    release_time = parse_time(soup4)
+                                                    content = parse_html_text(soup4)
+                                                    author = parse_author(soup4)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                        await asyncio.sleep(random.randint(5, 20))
+    except Exception as result:
+        # Crawl failed: insert a placeholder document
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+# Update the collection with articles not yet stored
+async def upDate(h1):
+    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
+    # Start from the table-of-contents page
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 connection status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if "http" not in book_link:
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 connection status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《红旗文稿》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '红旗文稿' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 connection status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        # Skip titles already in the collection (plain substring match)
+                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
+                                            response4 = await client.get(link)
+                                            response4.encoding = response4.charset_encoding
+                                            print('Level-4 connection status %d' % response4.status_code)
+                                            if response4.status_code == 200:
+                                                soup3 = BeautifulSoup(response4.text, "lxml")
+                                                if soup3.select('h1'):
+                                                    release_time = parse_time(soup3)
+                                                    content = parse_html_text(soup3)
+                                                    author = parse_author(soup3)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                                else:
+                                                    # JS redirect stub: extract the real article URL from the script tag
+                                                    real_url = re.findall(r'window\.location\.href="(.*?)"',
+                                                                          str(soup3.select('script')))
+                                                    response5 = await client.get(real_url[0])
+                                                    response5.encoding = response5.charset_encoding
+                                                    print('Level-5 connection status %d' % response5.status_code)
+                                                    if response5.status_code == 200:
+                                                        soup4 = BeautifulSoup(response5.text, "lxml")
+                                                        release_time = parse_time(soup4)
+                                                        content = parse_html_text(soup4)
+                                                        author = parse_author(soup4)
+                                                        await collection.insert_one({'banmianhao': banmianhao,
+                                                                                     'banmianming': banmianming,
+                                                                                     'title': title,
+                                                                                     'subtitle': 'empty',
+                                                                                     'author': author,
+                                                                                     'keywordlist': 'empty',
+                                                                                     'detail_url': link,
+                                                                                     'release_time': release_time,
+                                                                                     'insert_timestamp': datetime.today(),
+                                                                                     'content': content})
+                                                        print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            # One polite pause per newly fetched article
+                                            await asyncio.sleep(random.randint(5, 20))
+                                        else:
+                                            print('%s already exists' % title)
+    except Exception as result:
+        # Crawl failed: insert a placeholder document
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+if __name__ == "__main__":  # when run as a script
+    # invoke the crawler
+    asyncio.run(main())
+    print("Crawl finished!")
diff --git a/红旗/CrawlQiushi.py b/红旗/CrawlQiushi.py
new file mode 100644
index 0000000..a5dfaf1
--- /dev/null
+++ b/红旗/CrawlQiushi.py
@@ -0,0 +1,244 @@
+# -*- coding: utf-8 -*-
+# @Time : 2021/12/2 20:34
+# @Author : Hongshuang Gu
+# @File : CrawlQiushi.py
+# @Software : PyCharm
+
+
+import asyncio
+import random
+from datetime import datetime
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from httpx import AsyncClient
+from motor.motor_asyncio import AsyncIOMotorClient
+
+# Connect to MongoDB
+client = AsyncIOMotorClient('mongodb://localhost:27017')
+db = client['zydm']
+collection = db['qiushi']
+# Request headers for all crawls
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
+
+
+async def main():
+    # Check whether the collection already exists
+    collist = await db.list_collection_names()
+    if "qiushi" in collist:  # collection exists: update it incrementally
+        print("求是 collection exists; updating the database")
+        searchRes = await collection.find({}).to_list(length=None)
+        Res = pd.DataFrame(searchRes)
+        h1 = Res['title'].drop_duplicates().reset_index()
+        await upDate(h1)
+    else:
+        await getDate()
+
+
+# Parse the article body out of a page
+def parse_html_text(soup):
+    """
+    :param soup: parsed BeautifulSoup document
+    :return: article body as a string
+    """
+    content = ''  # accumulate paragraph text
+    for p in soup.select('.highlight p'):
+        para = p.text.strip()
+        if para:
+            content += para
+            content += '\n'
+    return content
+
+
+def parse_author(soup):
+    all_name = soup.select('.appellation')
+    if all_name:
+        name = all_name[-1].text
+    else:
+        name = ''
+    return name
+
+
+# Crawl the site and build the collection from scratch
+async def getDate():
+    url = "http://www.qstheory.cn/qs/mulu.htm"
+    # Start from the table-of-contents page
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 connection status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if "http" not in book_link:
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 connection status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《求是》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '求是' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 connection status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        response4 = await client.get(link)
+                                        response4.encoding = response4.charset_encoding
+                                        print('Level-4 connection status %d' % response4.status_code)
+                                        if response4.status_code == 200:
+                                            soup3 = BeautifulSoup(response4.text, "lxml")
+                                            if soup3.select('h1'):
+                                                author = parse_author(soup3)
+                                                if soup3.select('.pubtime'):
+                                                    str_time = soup3.select('.pubtime')[0].text.strip()
+                                                    release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
+                                                else:
+                                                    str_time = soup3.select('.headtitle span')[0].text.strip()
+                                                    release_time = datetime.strptime(str_time, '%Y年%m月%d日 %H:%M:%S')
+                                                content = parse_html_text(soup3)
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': author,
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': release_time,
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': content})
+                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                            else:
+                                                # No <h1>: record the article as contentless
+                                                await collection.insert_one({'banmianhao': banmianhao,
+                                                                             'banmianming': banmianming,
+                                                                             'title': title,
+                                                                             'subtitle': 'empty',
+                                                                             'author': 'empty',
+                                                                             'keywordlist': 'empty',
+                                                                             'detail_url': link,
+                                                                             'release_time': 'empty',
+                                                                             'insert_timestamp': datetime.today(),
+                                                                             'content': 'empty'})
+                                                print("%s has no content" % title)
+                                        await asyncio.sleep(random.randint(5, 20))
+    except Exception as result:
+        # Crawl failed: insert a placeholder document
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+# Update the collection with articles not yet stored
+async def upDate(h1):
+    url = "http://www.qstheory.cn/qs/mulu.htm"
+    # Start from the table-of-contents page
+    try:
+        async with AsyncClient(headers=headers) as client:
+            response = await client.get(url)
+            response.encoding = response.charset_encoding
+            print('Level-1 connection status %d' % response.status_code)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, "lxml")
+                for item in soup.select('.booktitle a'):
+                    book_link = item.get('href')
+                    if "http" not in book_link:
+                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
+                    year = item.text
+                    response2 = await client.get(book_link)
+                    response2.encoding = response2.charset_encoding
+                    print('Level-2 connection status %d' % response2.status_code)
+                    if response2.status_code == 200:
+                        soup1 = BeautifulSoup(response2.text, "lxml")
+                        for item1 in soup1.select('.highlight p a'):
+                            if '《求是》' in item1.text:
+                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
+                                banmianming = '求是' + year
+                                yaowen_link = item1.get('href')
+                                response3 = await client.get(yaowen_link)
+                                response3.encoding = response3.charset_encoding
+                                print('Level-3 connection status %d' % response3.status_code)
+                                if response3.status_code == 200:
+                                    soup2 = BeautifulSoup(response3.text, "lxml")
+                                    for item2 in soup2.select('.text p a'):
+                                        link = item2.get('href')
+                                        title = item2.text.strip()
+                                        # Skip titles already in the collection (plain substring match)
+                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
+                                            response4 = await client.get(link)
+                                            response4.encoding = response4.charset_encoding
+                                            print('Level-4 connection status %d' % response4.status_code)
+                                            if response4.status_code == 200:
+                                                soup3 = BeautifulSoup(response4.text, "lxml")
+                                                if soup3.select('h1'):
+                                                    author = parse_author(soup3)
+                                                    if soup3.select('.pubtime'):
+                                                        str_time = soup3.select('.pubtime')[0].text.strip()
+                                                        release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
+                                                    else:
+                                                        str_time = soup3.select('.headtitle span')[0].text.strip()
+                                                        release_time = datetime.strptime(str_time, '%Y年%m月%d日 %H:%M:%S')
+                                                    content = parse_html_text(soup3)
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': author,
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': release_time,
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': content})
+                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
+                                                else:
+                                                    # No <h1>: record the article as contentless
+                                                    await collection.insert_one({'banmianhao': banmianhao,
+                                                                                 'banmianming': banmianming,
+                                                                                 'title': title,
+                                                                                 'subtitle': 'empty',
+                                                                                 'author': 'empty',
+                                                                                 'keywordlist': 'empty',
+                                                                                 'detail_url': link,
+                                                                                 'release_time': 'empty',
+                                                                                 'insert_timestamp': datetime.today(),
+                                                                                 'content': 'empty'})
+                                                    print("%s has no content" % title)
+                                            # One polite pause per newly fetched article
+                                            await asyncio.sleep(random.randint(5, 20))
+                                        else:
+                                            print('%s already exists' % title)
+    except Exception as result:
+        # Crawl failed: insert a placeholder document
+        await collection.insert_one({'banmianhao': 'empty',
+                                     'banmianming': 'empty',
+                                     'title': 'empty',
+                                     'subtitle': 'empty',
+                                     'author': 'empty',
+                                     'keywordlist': 'empty',
+                                     'detail_url': url,
+                                     'release_time': 'empty',
+                                     'insert_timestamp': datetime.today(),
+                                     'content': 'empty'})
+        print(result)
+
+
+if __name__ == "__main__":  # when run as a script
+    # invoke the crawler
+    asyncio.run(main())
+    print("Crawl finished!")