# -*- coding: utf-8 -*-
# @Time : 2021/12/2 20:34
# @Author : Hongshuang Gu
# @File : Crawlhqwg.py
# @Software : PyCharm
import asyncio
import random
import re
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient

# Connect to MongoDB
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['zydm']
collection = db['hqwg']

# Request headers sent with every crawl request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
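

# Optional hardening (a sketch, not part of the original flow): a unique index
# on detail_url would let MongoDB itself reject duplicate articles, instead of
# relying solely on the pandas title lookup in upDate(). The function name and
# the indexed field are assumptions; one would call this once from main().
async def ensure_indexes():
    await collection.create_index('detail_url', unique=True)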


async def main():
    # Check whether the collection already exists in the database
    collist = await db.list_collection_names()
    if "hqwg" in collist:  # the collection exists
        print("hqwg collection exists; updating the database")
        searchRes = await collection.find({}).to_list(length=None)
        Res = pd.DataFrame(list(searchRes))
        # de-duplicated list of titles already stored
        h1 = Res['title'].drop_duplicates().reset_index()
        # fetch only articles that are not yet in the database
        await upDate(h1)
    else:
        await getDate()
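

# A minimal sketch of the membership test upDate() performs on h1, with
# invented titles:
#   h1 = pd.DataFrame({'title': ['新时代', '伟大征程']})
#   h1[h1['title'].str.contains('某新标题', regex=False)].empty  -> True  (fetch)
#   h1[h1['title'].str.contains('新时代', regex=False)].empty    -> False (skip)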


# Parse the article body
def parse_html_text(soup):
    """
    :param soup: BeautifulSoup object for the article page
    :return: body text as a string
    """
    content = ''  # accumulate body paragraphs into one string
    for p in soup.select('.highlight p'):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content
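
# Usage sketch for parse_html_text (the HTML snippet is invented):
#   soup = BeautifulSoup('<div class="highlight"><p> A </p><p>B</p></div>', 'lxml')
#   parse_html_text(soup)  ->  'A\nB\n'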


def parse_author(soup):
    # the byline appears as "作者:..." in the page header, hence the
    # full-width colon in the pattern; '.' does not match a newline, so the
    # capture stops at the end of the byline's line
    all_name = soup.select('.headtitle') or soup.select('.metadata')
    if all_name:
        name = re.findall(r'作者:(.*)', str(all_name))[0]
    else:
        name = ''
    return name
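
# Example of the byline match above (input invented; '.' stops at the newline):
#   re.findall(r'作者:(.*)', '作者:张三\n</div>')  ->  ['张三']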


def parse_time(soup):
    if soup.select('.pubtime'):
        # newer pages carry a .pubtime node with an ISO-style timestamp
        str_time = soup.select('.pubtime')
        release_time = datetime.strptime(str_time[0].text.strip(), '%Y-%m-%d %H:%M:%S')
    else:
        # older pages embed a Chinese-format timestamp in the header block
        str_time = soup.select('.headtitle span') or soup.select('.metadata')
        find_time = re.findall(r'([0-9]{4}年[0-9]{2}月[0-9]{2}日 [0-9]{2}:[0-9]{2}:[0-9]{2})', str(str_time))
        release_time = datetime.strptime(find_time[0].strip(), '%Y年%m月%d日 %H:%M:%S')
    return release_time
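
# The two timestamp shapes handled above, with invented values; both parse to
# datetime(2021, 12, 2, 20, 34, 0):
#   '2021-12-02 20:34:00'        (.pubtime pages)
#   '2021年12月02日 20:34:00'    (.headtitle span / .metadata pages)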


# Crawl the site from scratch and build the database
async def getDate():
    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
    # fetch the table-of-contents page
    try:
        async with AsyncClient(headers=headers) as client:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status: %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                # one link per yearly volume
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status: %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        # one link per issue of the year
                        for item1 in soup1.select('.highlight p a'):
                            if '《红旗文稿》' in item1.text:
                                # issue number, e.g. "《红旗文稿》第01期" -> "01"
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '红旗文稿' + year
                                yaowen_link = item1.get('href')
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status: %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    # one link per article in the issue
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        response4 = await client.get(link)
                                        response4.encoding = response4.charset_encoding
                                        print('Level-4 request status: %d' % response4.status_code)
                                        if response4.status_code == 200:
                                            soup3 = BeautifulSoup(response4.text, "lxml")
                                            if soup3.select('h1'):
                                                release_time = parse_time(soup3)
                                                content = parse_html_text(soup3)
                                                author = parse_author(soup3)
                                                await collection.insert_one({'banmianhao': banmianhao,
                                                                             'banmianming': banmianming,
                                                                             'title': title,
                                                                             'subtitle': 'empty',
                                                                             'author': author,
                                                                             'keywordlist': 'empty',
                                                                             'detail_url': link,
                                                                             'release_time': release_time,
                                                                             'insert_timestamp': datetime.today(),
                                                                             'content': content})
                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
                                            else:
                                                # the page is a JS redirect stub; pull the real URL
                                                real_page = soup3.select('script')
                                                real_url = re.findall(r'window\.location\.href="(.*?)"', str(real_page))
                                                response5 = await client.get(real_url[0])
                                                response5.encoding = response5.charset_encoding
                                                print('Level-5 request status: %d' % response5.status_code)
                                                if response5.status_code == 200:
                                                    soup4 = BeautifulSoup(response5.text, "lxml")
                                                    release_time = parse_time(soup4)
                                                    content = parse_html_text(soup4)
                                                    author = parse_author(soup4)
                                                    await collection.insert_one({'banmianhao': banmianhao,
                                                                                 'banmianming': banmianming,
                                                                                 'title': title,
                                                                                 'subtitle': 'empty',
                                                                                 'author': author,
                                                                                 'keywordlist': 'empty',
                                                                                 'detail_url': link,
                                                                                 'release_time': release_time,
                                                                                 'insert_timestamp': datetime.today(),
                                                                                 'content': content})
                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
                                        # polite random delay between article requests
                                        await asyncio.sleep(random.randint(5, 20))
    except Exception as result:
        # on failure, store a placeholder record with empty fields
        await collection.insert_one({'banmianhao': 'empty',
                                     'banmianming': 'empty',
                                     'title': 'empty',
                                     'subtitle': 'empty',
                                     'author': 'empty',
                                     'keywordlist': 'empty',
                                     'detail_url': url,
                                     'release_time': 'empty',
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
        print(result)
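

# A possible refactoring (a sketch, not wired into the functions above): the
# fetch / re-encode / status-check sequence repeated at every level could be
# shared. fetch_soup is a hypothetical helper name.
async def fetch_soup(client, url, level):
    response = await client.get(url)
    response.encoding = response.charset_encoding
    print('Level-%d request status: %d' % (level, response.status_code))
    if response.status_code == 200:
        return BeautifulSoup(response.text, "lxml")
    return None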


# Incremental update: fetch only articles whose titles are not yet stored
async def upDate(h1):
    url = "http://www.qstheory.cn/hqwglist/mulu.htm"
    # fetch the table-of-contents page
    try:
        async with AsyncClient(headers=headers) as client:
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status: %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status: %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        for item1 in soup1.select('.highlight p a'):
                            if '《红旗文稿》' in item1.text:
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '红旗文稿' + year
                                yaowen_link = item1.get('href')
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status: %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        # skip articles whose title is already stored;
                                        # regex=False so titles containing regex
                                        # metacharacters are matched literally
                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
                                            response4 = await client.get(link)
                                            response4.encoding = response4.charset_encoding
                                            print('Level-4 request status: %d' % response4.status_code)
                                            if response4.status_code == 200:
                                                soup3 = BeautifulSoup(response4.text, "lxml")
                                                if soup3.select('h1'):
                                                    release_time = parse_time(soup3)
                                                    content = parse_html_text(soup3)
                                                    author = parse_author(soup3)
                                                    await collection.insert_one({'banmianhao': banmianhao,
                                                                                 'banmianming': banmianming,
                                                                                 'title': title,
                                                                                 'subtitle': 'empty',
                                                                                 'author': author,
                                                                                 'keywordlist': 'empty',
                                                                                 'detail_url': link,
                                                                                 'release_time': release_time,
                                                                                 'insert_timestamp': datetime.today(),
                                                                                 'content': content})
                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
                                                else:
                                                    # the page is a JS redirect stub; pull the real URL
                                                    real_page = soup3.select('script')
                                                    real_url = re.findall(r'window\.location\.href="(.*?)"',
                                                                          str(real_page))
                                                    response5 = await client.get(real_url[0])
                                                    response5.encoding = response5.charset_encoding
                                                    print('Level-5 request status: %d' % response5.status_code)
                                                    if response5.status_code == 200:
                                                        soup4 = BeautifulSoup(response5.text, "lxml")
                                                        release_time = parse_time(soup4)
                                                        content = parse_html_text(soup4)
                                                        author = parse_author(soup4)
                                                        await collection.insert_one({'banmianhao': banmianhao,
                                                                                     'banmianming': banmianming,
                                                                                     'title': title,
                                                                                     'subtitle': 'empty',
                                                                                     'author': author,
                                                                                     'keywordlist': 'empty',
                                                                                     'detail_url': link,
                                                                                     'release_time': release_time,
                                                                                     'insert_timestamp': datetime.today(),
                                                                                     'content': content})
                                                        print("%s-%s-%s done" % (release_time, banmianhao, title))
                                                        await asyncio.sleep(random.randint(5, 20))
                                            # polite random delay after each newly fetched article
                                            await asyncio.sleep(random.randint(5, 20))
                                        else:
                                            print('%s already exists' % title)
    except Exception as result:
        # on failure, store a placeholder record with empty fields
        await collection.insert_one({'banmianhao': 'empty',
                                     'banmianming': 'empty',
                                     'title': 'empty',
                                     'subtitle': 'empty',
                                     'author': 'empty',
                                     'keywordlist': 'empty',
                                     'detail_url': url,
                                     'release_time': 'empty',
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
        print(result)


if __name__ == "__main__":
    # run the crawler
    asyncio.run(main())
    print("Crawling finished!")