# -*- coding: utf-8 -*-
# @Time : 2021/12/2 20:34
# @Author : Hongshuang Gu
# @File : Crawlhqwg.py
# @Software : PyCharm
import asyncio
import random
import re
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient

# Connect to MongoDB
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['zydm']
collection = db['hqwg']

# Request headers sent with every crawl request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
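

# Optional hardening (a sketch, not part of the original flow): a unique index
# on detail_url would let MongoDB itself reject duplicate articles, instead of
# relying solely on the pandas title lookup in upDate(). The function name and
# the indexed field are assumptions; one would call this once from main().
async def ensure_indexes():
    await collection.create_index('detail_url', unique=True)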


async def main():
    # Check whether the collection already exists in the database
    collist = await db.list_collection_names()
    if "hqwg" in collist:  # the collection exists
        print("hqwg collection exists; updating the database")
        searchRes = await collection.find({}).to_list(length=None)
        Res = pd.DataFrame(list(searchRes))
        # de-duplicated list of titles already stored
        h1 = Res['title'].drop_duplicates().reset_index()
        # fetch only articles that are not yet in the database
        await upDate(h1)
    else:
        await getDate()
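

# A minimal sketch of the membership test upDate() performs on h1, with
# invented titles:
#   h1 = pd.DataFrame({'title': ['新时代', '伟大征程']})
#   h1[h1['title'].str.contains('某新标题', regex=False)].empty  -> True  (fetch)
#   h1[h1['title'].str.contains('新时代', regex=False)].empty    -> False (skip)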


# Parse the article body
def parse_html_text(soup):
    """
    :param soup: BeautifulSoup object for the article page
    :return: body text as a string
    """
    content = ''  # accumulate body paragraphs into one string
    for p in soup.select('.highlight p'):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content
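
# Usage sketch for parse_html_text (the HTML snippet is invented):
#   soup = BeautifulSoup('<div class="highlight"><p> A </p><p>B</p></div>', 'lxml')
#   parse_html_text(soup)  ->  'A\nB\n'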


def parse_author(soup):
    # the byline appears as "作者:..." in the page header, hence the
    # full-width colon in the pattern; '.' does not match a newline, so the
    # capture stops at the end of the byline's line
    all_name = soup.select('.headtitle') or soup.select('.metadata')
    if all_name:
        name = re.findall(r'作者:(.*)', str(all_name))[0]
    else:
        name = ''
    return name
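
# Example of the byline match above (input invented; '.' stops at the newline):
#   re.findall(r'作者:(.*)', '作者:张三\n</div>')  ->  ['张三']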


def parse_time(soup):
    if soup.select('.pubtime'):
        # newer pages carry a .pubtime node with an ISO-style timestamp
        str_time = soup.select('.pubtime')
        release_time = datetime.strptime(str_time[0].text.strip(), '%Y-%m-%d %H:%M:%S')
    else:
        # older pages embed a Chinese-format timestamp in the header block
        str_time = soup.select('.headtitle span') or soup.select('.metadata')
        find_time = re.findall(r'([0-9]{4}年[0-9]{2}月[0-9]{2}日 [0-9]{2}:[0-9]{2}:[0-9]{2})', str(str_time))
        release_time = datetime.strptime(find_time[0].strip(), '%Y年%m月%d日 %H:%M:%S')
    return release_time
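
# The two timestamp shapes handled above, with invented values; both parse to
# datetime(2021, 12, 2, 20, 34, 0):
#   '2021-12-02 20:34:00'        (.pubtime pages)
#   '2021年12月02日 20:34:00'    (.headtitle span / .metadata pages)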


# Crawl the site from scratch and build the database
async def getDate():
    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
    # fetch the table-of-contents page
    try:
        async with AsyncClient(headers=headers) as client:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status: %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                # one link per yearly volume
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status: %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        # one link per issue of the year
                        for item1 in soup1.select('.highlight p a'):
                            if '《红旗文稿》' in item1.text:
                                # issue number, e.g. "《红旗文稿》第01期" -> "01"
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '红旗文稿' + year
                                yaowen_link = item1.get('href')
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status: %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    # one link per article in the issue
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        response4 = await client.get(link)
                                        response4.encoding = response4.charset_encoding
                                        print('Level-4 request status: %d' % response4.status_code)
                                        if response4.status_code == 200:
                                            soup3 = BeautifulSoup(response4.text, "lxml")
                                            if soup3.select('h1'):
                                                release_time = parse_time(soup3)
                                                content = parse_html_text(soup3)
                                                author = parse_author(soup3)
                                                await collection.insert_one({'banmianhao': banmianhao,
                                                                             'banmianming': banmianming,
                                                                             'title': title,
                                                                             'subtitle': 'empty',
                                                                             'author': author,
                                                                             'keywordlist': 'empty',
                                                                             'detail_url': link,
                                                                             'release_time': release_time,
                                                                             'insert_timestamp': datetime.today(),
                                                                             'content': content})
                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
                                            else:
                                                # the page is a JS redirect stub; pull the real URL
                                                real_page = soup3.select('script')
                                                real_url = re.findall(r'window\.location\.href="(.*?)"', str(real_page))
                                                response5 = await client.get(real_url[0])
                                                response5.encoding = response5.charset_encoding
                                                print('Level-5 request status: %d' % response5.status_code)
                                                if response5.status_code == 200:
                                                    soup4 = BeautifulSoup(response5.text, "lxml")
                                                    release_time = parse_time(soup4)
                                                    content = parse_html_text(soup4)
                                                    author = parse_author(soup4)
                                                    await collection.insert_one({'banmianhao': banmianhao,
                                                                                 'banmianming': banmianming,
                                                                                 'title': title,
                                                                                 'subtitle': 'empty',
                                                                                 'author': author,
                                                                                 'keywordlist': 'empty',
                                                                                 'detail_url': link,
                                                                                 'release_time': release_time,
                                                                                 'insert_timestamp': datetime.today(),
                                                                                 'content': content})
                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
                                        # polite random delay between article requests
                                        await asyncio.sleep(random.randint(5, 20))
    except Exception as result:
        # on failure, store a placeholder record with empty fields
        await collection.insert_one({'banmianhao': 'empty',
                                     'banmianming': 'empty',
                                     'title': 'empty',
                                     'subtitle': 'empty',
                                     'author': 'empty',
                                     'keywordlist': 'empty',
                                     'detail_url': url,
                                     'release_time': 'empty',
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
        print(result)
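

# A possible refactoring (a sketch, not wired into the functions above): the
# fetch / re-encode / status-check sequence repeated at every level could be
# shared. fetch_soup is a hypothetical helper name.
async def fetch_soup(client, url, level):
    response = await client.get(url)
    response.encoding = response.charset_encoding
    print('Level-%d request status: %d' % (level, response.status_code))
    if response.status_code == 200:
        return BeautifulSoup(response.text, "lxml")
    return None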


# Incremental update: fetch only articles whose titles are not yet stored
async def upDate(h1):
    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
    # fetch the table-of-contents page
    try:
        async with AsyncClient(headers=headers) as client:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status: %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status: %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        for item1 in soup1.select('.highlight p a'):
                            if '《红旗文稿》' in item1.text:
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '红旗文稿' + year
                                yaowen_link = item1.get('href')
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status: %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        # skip articles whose title is already stored;
                                        # regex=False so titles containing regex
                                        # metacharacters are matched literally
                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
                                            response4 = await client.get(link)
                                            response4.encoding = response4.charset_encoding
                                            print('Level-4 request status: %d' % response4.status_code)
                                            if response4.status_code == 200:
                                                soup3 = BeautifulSoup(response4.text, "lxml")
                                                if soup3.select('h1'):
                                                    release_time = parse_time(soup3)
                                                    content = parse_html_text(soup3)
                                                    author = parse_author(soup3)
                                                    await collection.insert_one({'banmianhao': banmianhao,
                                                                                 'banmianming': banmianming,
                                                                                 'title': title,
                                                                                 'subtitle': 'empty',
                                                                                 'author': author,
                                                                                 'keywordlist': 'empty',
                                                                                 'detail_url': link,
                                                                                 'release_time': release_time,
                                                                                 'insert_timestamp': datetime.today(),
                                                                                 'content': content})
                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
                                                else:
                                                    # the page is a JS redirect stub; pull the real URL
                                                    real_page = soup3.select('script')
                                                    real_url = re.findall(r'window\.location\.href="(.*?)"',
                                                                          str(real_page))
                                                    response5 = await client.get(real_url[0])
                                                    response5.encoding = response5.charset_encoding
                                                    print('Level-5 request status: %d' % response5.status_code)
                                                    if response5.status_code == 200:
                                                        soup4 = BeautifulSoup(response5.text, "lxml")
                                                        release_time = parse_time(soup4)
                                                        content = parse_html_text(soup4)
                                                        author = parse_author(soup4)
                                                        await collection.insert_one({'banmianhao': banmianhao,
                                                                                     'banmianming': banmianming,
                                                                                     'title': title,
                                                                                     'subtitle': 'empty',
                                                                                     'author': author,
                                                                                     'keywordlist': 'empty',
                                                                                     'detail_url': link,
                                                                                     'release_time': release_time,
                                                                                     'insert_timestamp': datetime.today(),
                                                                                     'content': content})
                                                        print("%s-%s-%s done" % (release_time, banmianhao, title))
                                                        await asyncio.sleep(random.randint(5, 20))
                                            # polite random delay after each newly fetched article
                                            await asyncio.sleep(random.randint(5, 20))
                                        else:
                                            print('%s already exists' % title)
    except Exception as result:
        # on failure, store a placeholder record with empty fields
        await collection.insert_one({'banmianhao': 'empty',
                                     'banmianming': 'empty',
                                     'title': 'empty',
                                     'subtitle': 'empty',
                                     'author': 'empty',
                                     'keywordlist': 'empty',
                                     'detail_url': url,
                                     'release_time': 'empty',
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
        print(result)


if __name__ == "__main__":
    # run the crawler
    asyncio.run(main())
    print("Crawling finished!")