# -*- coding: utf-8 -*-
# @Time : 2022/12/29 13:48
# @Author : Haochen Zhong
# @File : CrawlNingxia.py
# @Software : PyCharm
# @Comment : This script crawls page-layout data from Ningxia Daily (宁夏日报)
import random
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup
# Start and end dates for the crawl
start_date = datetime.strptime('2022-01-31', '%Y-%m-%d')
"""Ningxia Daily has data from 2022-02-01 onward"""
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
# Connect to MongoDB and select the database/collection
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
ningxiaribao = mydb.ningxiaribao
# Random delay between requests (drawn once at startup and reused)
sleeptime = random.randint(2, 10)
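# Optional: an index on release_time speeds up the latest-record lookup in
# main(); create_index is idempotent, so repeated runs are harmless.
ningxiaribao.create_index([('release_time', pymongo.DESCENDING)])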


def main():
    # Does the database already hold this collection?
    collist = mydb.list_collection_names()
    if "ningxiaribao" in collist:  # the collection exists
        print("Ningxia collection exists, updating the database")
        # Release time of the newest record in the database
        db_time = ningxiaribao.find_one(sort=[('release_time', -1)])[
            'release_time']  # or: find().sort('_id', -1).limit(1)
        print('Database currently ends at %s' % db_time)
        # End time for the update
        input_time = datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('Database needs no update!')
    else:
        # Crawl the site and build the database from scratch
        print("Database does not exist, building it!")
        getData(start_date, end_date)
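

def fetch(url, max_retries=3):
    """Hypothetical helper (a sketch, not used by the original flow): the three
    requests.get calls in getData() share the same headers and timeout, so a
    wrapper like this could centralize them and add simple retry-with-backoff.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=(30, 45))
            response.encoding = response.apparent_encoding
            return response
        except requests.RequestException:
            # Wait a little longer after each failed attempt
            time.sleep(sleeptime * (attempt + 1))
    return None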


def get_content(soup3):
    """Extract the article body: concatenate all non-empty paragraphs
    under the #ozoom node, one paragraph per line."""
    content = ""
    for p in soup3.select("#ozoom p"):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content
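
# Illustration (hypothetical markup): for an article body such as
#   <div id="ozoom"><p>First paragraph.</p><p>Second paragraph.</p></div>
# get_content returns "First paragraph.\nSecond paragraph.\n".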


def getData(start_date, end_date):
    crawl_num = 0
    for i in range((end_date - start_date).days):  # length of the date range in days
        date_now = start_date + timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y%m/%d')
        base_url = "https://szb.nxrb.cn/nxrb/pc/layout/" + date_now_s + "/"
        url = base_url + "node_01.html"
        # e.g. https://szb.nxrb.cn/nxrb/pc/layout/202202/01/node_01.html
        try:
            response = requests.get(url=url, headers=headers, timeout=(30, 45))
            response.encoding = response.apparent_encoding
            print(f"Level-1 request status: {response.status_code}")
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select(".nav-list .btn-block"):
                    # The nav text is assumed to look like "第01版：要闻":
                    # page number and page name separated by a fullwidth colon
                    banmianhao = item.text.split("：")[0].strip()
                    banmianming = item.text.split("：")[-1].strip()
                    url1 = base_url + item.get("href")
                    response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
                    response2.encoding = response2.apparent_encoding
                    print(f"Level-2 request status: {response2.status_code}")
                    if response2.status_code == 200:
                        soup2 = BeautifulSoup(response2.text, "lxml")
                        for item2 in soup2.select(".news-list .resultList a"):
                            url_title = item2.text.strip()
                            url2 = "https://szb.nxrb.cn/nxrb/pc/" + item2.get("href")[9:]
                            print(url2)
                            response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
                            response3.encoding = response3.apparent_encoding
                            print(f"Level-3 request status: {response3.status_code}")
                            if response3.status_code == 200:
                                soup3 = BeautifulSoup(response3.text, "lxml")
                                try:
                                    pretitle = soup3.select("#PreTitle")[0].text.strip()
                                except IndexError:
                                    pretitle = ""
                                try:
                                    title = soup3.select("#Title")[0].text.strip()
                                except IndexError:
                                    title = url_title
                                try:
                                    subtitle = soup3.select("#SubTitle")[0].text.strip()
                                except IndexError:
                                    subtitle = ""
                                content = get_content(soup3)
                                ningxiaribao.insert_one({'banmianhao': banmianhao,
                                                         'banmianming': banmianming,
                                                         'title': title,
                                                         'subtitle': subtitle,
                                                         'h3title': pretitle,
                                                         'author': '',
                                                         'keywordlist': 'empty',
                                                         'detail_url': url2,
                                                         'release_time': date_now,
                                                         'insert_timestamp': datetime.today(),
                                                         'content': content})
                                crawl_num += 1
                                print(f"Ningxia Daily-{date_now_s}-{banmianhao}-{title}-done")
                                time.sleep(sleeptime)
                    print(f"Ningxia Daily-{date_now_s}-{banmianhao}-done")
                    time.sleep(sleeptime)
            print(f"Ningxia Daily-{date_now_s}-done")
            time.sleep(sleeptime)
        except Exception as result:
            # Insert a placeholder record so a failed date is visible in the
            # database instead of being silently skipped
            ningxiaribao.insert_one({'banmianhao': 'empty',
                                     'banmianming': 'empty',
                                     'title': 'empty',
                                     'subtitle': 'empty',
                                     'h3title': 'empty',
                                     'author': 'empty',
                                     'keywordlist': 'empty',
                                     'detail_url': url,
                                     'release_time': date_now,
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
            print(result)
    print(f"Ningxia Daily crawl finished: {crawl_num} records collected!")


if __name__ == '__main__':
    main()