
# -*- coding: utf-8 -*-
# @Time : 2021/12/2 20:34
# @Author : Hongshuang Gu
# @File : Crawlqiushi.py
# @Software : PyCharm
import asyncio
import random
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from httpx import AsyncClient
from motor.motor_asyncio import AsyncIOMotorClient
# Connect to the local MongoDB instance
client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['zydm']
collection = db['qiushi']
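
# Optional hardening (an assumption, not part of the original script): a unique
# index on 'title' would let MongoDB reject duplicate articles directly, instead
# of the pandas-based check in upDate(). It could be created once inside main():
#     await collection.create_index('title', unique=True)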

# Request headers sent with every HTTP request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}

async def main():
    # Check whether the 'qiushi' collection already exists
    collist = await db.list_collection_names()
    if "qiushi" in collist:
        print("The 'qiushi' collection exists; updating it")
        searchRes = await collection.find({}).to_list(length=None)
        Res = pd.DataFrame(searchRes)
        # Deduplicated titles already stored, used to skip known articles
        h1 = Res['title'].drop_duplicates().reset_index()
        await upDate(h1)
    else:
        await getDate()

# Parse the article body out of a page
def parse_html_text(soup):
    """
    :param soup: BeautifulSoup object of an article page
    :return: body text as a single string
    """
    content = ''  # accumulate the paragraphs into one string
    for p in soup.select('.highlight p'):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content
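
# Illustration with hypothetical markup:
#     <div class="highlight"><p> First </p><p></p><p>Second</p></div>
# yields 'First\nSecond\n' -- blank paragraphs are skipped.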

def parse_author(soup):
    # The author line is the last element with class 'appellation', when present
    all_name = soup.select('.appellation')
    if all_name:
        name = all_name[-1].text
    else:
        name = ''
    return name
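
# Sketch of a possible refinement (not used by the original flow; the name is
# illustrative): retry a GET a few times so one transient network error does
# not abort the whole crawl through the outer try/except.
async def fetch_with_retries(client, url, retries=3):
    for attempt in range(retries):
        try:
            return await client.get(url)
        except Exception:
            if attempt == retries - 1:
                raise
            # back off 1 s, 2 s, 4 s, ... before the next attempt
            await asyncio.sleep(2 ** attempt)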

# Crawl the site and build the database from scratch
async def getDate():
    url = "http://www.qstheory.cn/qs/mulu.htm"
    try:
        async with AsyncClient(headers=headers) as client:
            # Level 1: the catalogue front page, one link per year
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    # Level 2: the issue list for one year
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        for item1 in soup1.select('.highlight p a'):
                            if '《求是》' in item1.text:
                                # Issue number from the link text, e.g. '《求是》2021年第23期' -> '23'
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '求是' + year
                                yaowen_link = item1.get('href')
                                # Level 3: the table of contents of one issue
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        # Level 4: one article page
                                        response4 = await client.get(link)
                                        response4.encoding = response4.charset_encoding
                                        print('Level-4 request status %d' % response4.status_code)
                                        if response4.status_code == 200:
                                            soup3 = BeautifulSoup(response4.text, "lxml")
                                            if soup3.select('h1'):
                                                author = parse_author(soup3)
                                                if soup3.select('.pubtime'):
                                                    str_time = soup3.select('.pubtime')[0].text.strip()
                                                    release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
                                                else:
                                                    str_time = soup3.select('.headtitle span')[0].text.strip()
                                                    release_time = datetime.strptime(str_time, '%Y年%m月%d日 %H:%M:%S')
                                                content = parse_html_text(soup3)
                                                await collection.insert_one({
                                                    'banmianhao': banmianhao,
                                                    'banmianming': banmianming,
                                                    'title': title,
                                                    'subtitle': 'empty',
                                                    'author': author,
                                                    'keywordlist': 'empty',
                                                    'detail_url': link,
                                                    'release_time': release_time,
                                                    'insert_timestamp': datetime.today(),
                                                    'content': content})
                                                print("%s-%s-%s done" % (release_time, banmianhao, title))
                                            else:
                                                # No <h1>: store a stub so the URL is still recorded
                                                await collection.insert_one({
                                                    'banmianhao': banmianhao,
                                                    'banmianming': banmianming,
                                                    'title': title,
                                                    'subtitle': 'empty',
                                                    'author': 'empty',
                                                    'keywordlist': 'empty',
                                                    'detail_url': link,
                                                    'release_time': 'empty',
                                                    'insert_timestamp': datetime.today(),
                                                    'content': 'empty'})
                                                print("%s has no content" % title)
                                        # Polite random pause between article requests
                                        await asyncio.sleep(random.randint(5, 20))
    except Exception as result:
        # On any failure, insert a placeholder record so the failed run is visible
        await collection.insert_one({
            'banmianhao': 'empty',
            'banmianming': 'empty',
            'title': 'empty',
            'subtitle': 'empty',
            'author': 'empty',
            'keywordlist': 'empty',
            'detail_url': url,
            'release_time': 'empty',
            'insert_timestamp': datetime.today(),
            'content': 'empty'})
        print(result)
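
# upDate() below mirrors getDate() step by step; the only difference is that
# each article title is first checked against the DataFrame of stored titles
# (h1), and already-stored articles are skipped instead of re-fetched.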

# Crawl the site and insert only the articles that are not stored yet
async def upDate(h1):
    url = "http://www.qstheory.cn/qs/mulu.htm"
    try:
        async with AsyncClient(headers=headers) as client:
            # Level 1: the catalogue front page, one link per year
            response = await client.get(url)
            response.encoding = response.charset_encoding
            print('Level-1 request status %d' % response.status_code)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select('.booktitle a'):
                    book_link = item.get('href')
                    if "http" not in book_link:
                        book_link = "http://www.qstheory.cn" + book_link.replace("..", "").strip()
                    year = item.text
                    # Level 2: the issue list for one year
                    response2 = await client.get(book_link)
                    response2.encoding = response2.charset_encoding
                    print('Level-2 request status %d' % response2.status_code)
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        for item1 in soup1.select('.highlight p a'):
                            if '《求是》' in item1.text:
                                # Issue number from the link text, e.g. '《求是》2021年第23期' -> '23'
                                banmianhao = item1.text.split("第")[-1].replace("期", "").strip()
                                banmianming = '求是' + year
                                yaowen_link = item1.get('href')
                                # Level 3: the table of contents of one issue
                                response3 = await client.get(yaowen_link)
                                response3.encoding = response3.charset_encoding
                                print('Level-3 request status %d' % response3.status_code)
                                if response3.status_code == 200:
                                    soup2 = BeautifulSoup(response3.text, "lxml")
                                    for item2 in soup2.select('.text p a'):
                                        link = item2.get('href')
                                        title = item2.text.strip()
                                        # Literal (non-regex) match against the stored titles
                                        if h1[h1['title'].str.contains(title, regex=False)].empty:
                                            # Level 4: one article page
                                            response4 = await client.get(link)
                                            response4.encoding = response4.charset_encoding
                                            print('Level-4 request status %d' % response4.status_code)
                                            if response4.status_code == 200:
                                                soup3 = BeautifulSoup(response4.text, "lxml")
                                                if soup3.select('h1'):
                                                    author = parse_author(soup3)
                                                    if soup3.select('.pubtime'):
                                                        str_time = soup3.select('.pubtime')[0].text.strip()
                                                        release_time = datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
                                                    else:
                                                        str_time = soup3.select('.headtitle span')[0].text.strip()
                                                        release_time = datetime.strptime(str_time, '%Y年%m月%d日 %H:%M:%S')
                                                    content = parse_html_text(soup3)
                                                    await collection.insert_one({
                                                        'banmianhao': banmianhao,
                                                        'banmianming': banmianming,
                                                        'title': title,
                                                        'subtitle': 'empty',
                                                        'author': author,
                                                        'keywordlist': 'empty',
                                                        'detail_url': link,
                                                        'release_time': release_time,
                                                        'insert_timestamp': datetime.today(),
                                                        'content': content})
                                                    print("%s-%s-%s done" % (release_time, banmianhao, title))
                                                else:
                                                    # No <h1>: store a stub so the URL is still recorded
                                                    await collection.insert_one({
                                                        'banmianhao': banmianhao,
                                                        'banmianming': banmianming,
                                                        'title': title,
                                                        'subtitle': 'empty',
                                                        'author': 'empty',
                                                        'keywordlist': 'empty',
                                                        'detail_url': link,
                                                        'release_time': 'empty',
                                                        'insert_timestamp': datetime.today(),
                                                        'content': 'empty'})
                                                    print("%s has no content" % title)
                                            # Polite random pause between article requests
                                            await asyncio.sleep(random.randint(5, 20))
                                        else:
                                            print('%s already exists' % title)
    except Exception as result:
        # On any failure, insert a placeholder record so the failed run is visible
        await collection.insert_one({
            'banmianhao': 'empty',
            'banmianming': 'empty',
            'title': 'empty',
            'subtitle': 'empty',
            'author': 'empty',
            'keywordlist': 'empty',
            'detail_url': url,
            'release_time': 'empty',
            'insert_timestamp': datetime.today(),
            'content': 'empty'})
        print(result)

if __name__ == "__main__":
    # Build the database on the first run, update it on later runs
    asyncio.run(main())
    print("Crawl finished!")