# -*- coding: utf-8 -*-
# @Time     : 2021/12/2 20:34
# @Author   : Hongshuang Gu
# @File     : Crawlhqwg.py
# @Software : PyCharm
import asyncio
import random
import re
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient

# Connect to MongoDB
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['zydm']
collection = db['hqwg']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}


async def main():
    # If the collection already exists, only fetch what is missing;
    # otherwise crawl everything from scratch.
    collist = await db.list_collection_names()
    if "hqwg" in collist:
        print("Collection 'hqwg' exists; updating the database")
        searchRes = await collection.find({}).to_list(length=None)
        Res = pd.DataFrame(list(searchRes))
        h1 = Res['title'].drop_duplicates().reset_index()
        await upDate(h1)
    else:
        await getDate()


def parse_html_text(soup):
    """
    :param soup: parsed BeautifulSoup document of an article page
    :return: article body as a string, one paragraph per line
    """
    content = ''  # accumulate paragraphs into one string
    for p in soup.select('.highlight p'):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def parse_author(soup):
    all_name = soup.select('.headtitle') or soup.select('.metadata')
    # Guard against pages where the "作者:" label is absent
    found = re.findall(r'作者:(.*)', str(all_name)) if all_name else []
    return found[0] if found else ''


def parse_time(soup):
    # Newer pages carry a .pubtime element; older ones embed the date
    # in .headtitle span or .metadata in the Chinese date format.
    if soup.select('.pubtime'):
        str_time = soup.select('.pubtime')
        release_time = datetime.strptime(str_time[0].text.strip(), '%Y-%m-%d %H:%M:%S')
    else:
        str_time = soup.select('.headtitle span') or soup.select('.metadata')
        find_time = re.findall(
            r'([0-9]{4}年[0-9]{2}月[0-9]{2}日 [0-9]{2}:[0-9]{2}:[0-9]{2})', str(str_time))
        release_time = datetime.strptime(find_time[0].strip(), '%Y年%m月%d日 %H:%M:%S')
    return release_time
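# A minimal smoke test for the three parsers above. The HTML snippet is
# illustrative only (an assumption about the page layout, not markup copied
# from qstheory.cn), and nothing in the crawler calls this function.
def _parser_smoke_test():
    sample = BeautifulSoup(
        '<div class="pubtime">2021-12-02 20:34:00</div>'
        '<div class="headtitle"><span>作者:张三\n</span></div>'
        '<div class="highlight"><p>第一段</p><p>第二段</p></div>', 'lxml')
    assert parse_time(sample) == datetime(2021, 12, 2, 20, 34)
    assert parse_author(sample) == '张三'
    assert parse_html_text(sample) == '第一段\n第二段\n'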
# Crawl the whole catalogue and build the database from scratch
async def getDate():
    url = "http://www.qstheory.cn/hqwglist/mulu.htm"  # catalogue landing page
    try:
        async with AsyncClient(headers=headers) as client:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status: %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                # Level 2: one link per yearly volume
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status: %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        # Level 3: one link per issue of the year
                        for item1 in soup1.select('.highlight p a'):
                            if '《红旗文稿》' in item1.text:
                                # banmianhao: issue number; banmianming: issue name
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '红旗文稿' + year
                                yaowen_link = item1.get('href')
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status: %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    # Level 4: one link per article in the issue
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        response4 = await client.get(link)
                                        response4.encoding = response4.charset_encoding
                                        print('Level-4 request status: %d' % response4.status_code)
                                        if response4.status_code == 200:
                                            soup3 = BeautifulSoup(response4.text, "lxml")
                                            if soup3.select('h1'):
                                                release_time = parse_time(soup3)
                                                content = parse_html_text(soup3)
                                                author = parse_author(soup3)
                                                await collection.insert_one({
                                                    'banmianhao': banmianhao,
                                                    'banmianming': banmianming,
                                                    'title': title,
                                                    'subtitle': 'empty',
                                                    'author': author,
                                                    'keywordlist': 'empty',
                                                    'detail_url': link,
                                                    'release_time': release_time,
                                                    'insert_timestamp': datetime.today(),
                                                    'content': content})
                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
                                            else:
                                                # The page is a JS redirect stub; extract the real URL
                                                real_page = soup3.select('script')
                                                real_url = re.findall(r'window.location.href="(.*?)"', str(real_page))
                                                response5 = await client.get(real_url[0])
                                                response5.encoding = response5.charset_encoding
                                                print('Level-5 request status: %d' % response5.status_code)
                                                if response5.status_code == 200:
                                                    soup4 = BeautifulSoup(response5.text, "lxml")
                                                    release_time = parse_time(soup4)
                                                    content = parse_html_text(soup4)
                                                    author = parse_author(soup4)
                                                    await collection.insert_one({
                                                        'banmianhao': banmianhao,
                                                        'banmianming': banmianming,
                                                        'title': title,
                                                        'subtitle': 'empty',
                                                        'author': author,
                                                        'keywordlist': 'empty',
                                                        'detail_url': link,
                                                        'release_time': release_time,
                                                        'insert_timestamp': datetime.today(),
                                                        'content': content})
                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
                                        # Throttle requests between articles
                                        await asyncio.sleep(random.randint(5, 20))
    except Exception as result:
        # On failure, insert a placeholder record so the gap is visible
        await collection.insert_one({'banmianhao': 'empty', 'banmianming': 'empty',
                                     'title': 'empty', 'subtitle': 'empty',
                                     'author': 'empty', 'keywordlist': 'empty',
                                     'detail_url': url, 'release_time': 'empty',
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
        print(result)


# Incremental update: fetch only articles whose titles are not yet stored
async def upDate(h1):
    url = "http://www.qstheory.cn/hqwglist/mulu.htm"  # catalogue landing page
    try:
        async with AsyncClient(headers=headers) as client:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status: %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status: %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        for item1 in soup1.select('.highlight p a'):
                            if '《红旗文稿》' in item1.text:
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '红旗文稿' + year
                                yaowen_link = item1.get('href')
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status: %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        # regex=False: titles may contain regex metacharacters;
                                        # na=False guards against NaN titles in the DataFrame
                                        if h1[h1['title'].str.contains(title, regex=False, na=False)].empty:
                                            response4 = await client.get(link)
                                            response4.encoding = response4.charset_encoding
                                            print('Level-4 request status: %d' % response4.status_code)
                                            if response4.status_code == 200:
                                                soup3 = BeautifulSoup(response4.text, "lxml")
                                                if soup3.select('h1'):
                                                    release_time = parse_time(soup3)
                                                    content = parse_html_text(soup3)
                                                    author = parse_author(soup3)
                                                    await collection.insert_one({
                                                        'banmianhao': banmianhao,
                                                        'banmianming': banmianming,
                                                        'title': title,
                                                        'subtitle': 'empty',
                                                        'author': author,
                                                        'keywordlist': 'empty',
                                                        'detail_url': link,
                                                        'release_time': release_time,
                                                        'insert_timestamp': datetime.today(),
                                                        'content': content})
                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
                                                else:
                                                    # The page is a JS redirect stub; extract the real URL
                                                    real_page = soup3.select('script')
                                                    real_url = re.findall(r'window.location.href="(.*?)"', str(real_page))
                                                    response5 = await client.get(real_url[0])
                                                    response5.encoding = response5.charset_encoding
                                                    print('Level-5 request status: %d' % response5.status_code)
                                                    if response5.status_code == 200:
                                                        soup4 = BeautifulSoup(response5.text, "lxml")
                                                        release_time = parse_time(soup4)
                                                        content = parse_html_text(soup4)
                                                        author = parse_author(soup4)
                                                        await collection.insert_one({
                                                            'banmianhao': banmianhao,
                                                            'banmianming': banmianming,
                                                            'title': title,
                                                            'subtitle': 'empty',
                                                            'author': author,
                                                            'keywordlist': 'empty',
                                                            'detail_url': link,
                                                            'release_time': release_time,
                                                            'insert_timestamp': datetime.today(),
                                                            'content': content})
                                                        print("%s-%s-%s done" % (release_time, banmianhao, title))
                                            # Throttle requests between articles
                                            await asyncio.sleep(random.randint(5, 20))
                                        else:
                                            print('%s already exists' % title)
    except Exception as result:
        # On failure, insert a placeholder record so the gap is visible
        await collection.insert_one({'banmianhao': 'empty', 'banmianming': 'empty',
                                     'title': 'empty', 'subtitle': 'empty',
                                     'author': 'empty', 'keywordlist': 'empty',
                                     'detail_url': url, 'release_time': 'empty',
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
        print(result)
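# A possible hardening step, not part of the original flow and never called:
# a unique index on detail_url (assuming each article URL is unique) would let
# MongoDB reject duplicates server-side even if the pandas title check misses.
async def _ensure_unique_index():
    await collection.create_index('detail_url', unique=True)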
if __name__ == "__main__":
    # Entry point
    asyncio.run(main())
    print("Crawl finished!")