# _*_ coding : UTF-8 _*_
# @Time : 2024-03-08 10:18:55
# @Author : haochen zhong
# @File : CrawlHenan.py
# @Software : PyCharm
# @Comment : Collect page data from the Henan Daily (河南日报) digital newspaper
import datetime
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
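
# Crawl structure (three levels, mirroring the requests made below):
#   1. the front page for a date: http://newpaper.dahe.cn/hnrb/html/<YYYY-MM/DD>/node_1.htm
#   2. each section ("版面") page linked from the front-page catalogue
#   3. each article page linked from a section; its body text is stored in MongoDB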
start_date = datetime.datetime.strptime('2007-10-13', '%Y-%m-%d')
"""Crawl start date"""
end_date = datetime.datetime.today()
"""Crawl end date"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
"""Custom request headers"""
# Connect to MongoDB
dbclient = pymongo.MongoClient('localhost', 27017)
"""Database connection"""
mydb = dbclient.dfdm_sjribao
henanribao = mydb.henanribao


def main():
    # Check whether the collection already exists
    collist = mydb.list_collection_names()
    if "henanribao" in collist:
        print("The henanribao collection exists; updating the database")
        # Timestamp of the most recent document in the database
        db_time = henanribao.find_one(sort=[('release_time', -1)])['release_time']
        print('Database is current up to %s' % db_time)
        # Update the database up to today
        input_time = datetime.datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('Database is already up to date')
    else:
        # Crawl the site and build the collection from scratch
        print("Collection does not exist; building the database!")
        getData(start_date, end_date)


def getContent(soup: BeautifulSoup):
    """Extract the article body text from an article page."""
    content = ''
    for p in soup.select('#articleContent p'):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def getData(start_date, end_date):
    crawl_num = 0
    for i in range((end_date - start_date).days):  # number of days between start and end
        date_now = start_date + datetime.timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y-%m/%d')
        base_url = "http://newpaper.dahe.cn/hnrb/html/" + date_now_s + '/'
        url = base_url + 'node_1.htm'
        # e.g. http://newpaper.dahe.cn/hnrb/html/2024-03/08/node_1.htm
        print(url)
        try:
            response = requests.get(url, headers=headers, timeout=60)
            response.encoding = response.apparent_encoding
            print(f"Front-page link status: {response.status_code}")
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select(".layout-catalogue-item>a:nth-child(1)"):
                    # The catalogue text looks like "第01版：要闻"; splitting on the
                    # full-width colon is an assumption (the original delimiter was lost)
                    banmianhao = item.text.split("：")[0]
                    banmianming = item.text.split("：")[-1]
                    url1 = base_url + item.get("href")
                    response2 = requests.get(url1, headers=headers)
                    response2.encoding = response2.apparent_encoding
                    print(f"Section link status: {response2.status_code}")
                    if response2.status_code == 200:
                        soup2 = BeautifulSoup(response2.text, "lxml")
                        for item2 in soup2.select(".news-item a"):
                            title = item2.get("title", "").strip()
                            url2 = base_url + item2.get("href")
                            response3 = requests.get(url2, headers=headers)
                            response3.encoding = response3.apparent_encoding
                            print(f"Article link status: {response3.status_code}")
                            if response3.status_code == 200:
                                soup3 = BeautifulSoup(response3.text, "lxml")
                                content = getContent(soup3)
                                try:
                                    preTitle = soup3.select(".headline")[0].text.strip()
                                except Exception:
                                    preTitle = ""
                                try:
                                    subtitle = soup3.select(".subtitle")[0].text.strip()
                                except Exception:
                                    subtitle = ""
                                henanribao.insert_one({'banmianhao': banmianhao,
                                                       'banmianming': banmianming,
                                                       'title': title,
                                                       'subtitle': subtitle,
                                                       'preTitle': preTitle,
                                                       'author': '',
                                                       'keywordlist': '',
                                                       'detail_url': url2,
                                                       'release_time': date_now,
                                                       'insert_timestamp': datetime.datetime.today(),
                                                       'content': content})
                                crawl_num += 1
                                print(f"Henan Daily-{date_now_s}-{banmianhao}-{title}---collected!")
                        time.sleep(random.randint(5, 10))
                        print(f"Henan Daily-{date_now_s}-{banmianhao}---section collected!")
                print(f"Henan Daily-{date_now_s}---date collected!")
        except Exception as result:
            # On any failure for this date, insert a placeholder record so the gap stays visible
            henanribao.insert_one({'banmianhao': 'empty',
                                   'banmianming': 'empty',
                                   'preTitle': 'empty',
                                   'title': 'empty',
                                   'subtitle': 'empty',
                                   'author': 'empty',
                                   'keywordlist': 'empty',
                                   'detail_url': url,
                                   'release_time': date_now,
                                   'insert_timestamp': datetime.datetime.today(),
                                   'content': 'empty'})
            print(result)
    print(f"Henan Daily crawl finished; {crawl_num} records collected in total!")


if __name__ == '__main__':
    main()
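
# Example (assumed usage): run the script directly with MongoDB listening on localhost:27017;
# the newest stored record can then be checked with
#   henanribao.find_one(sort=[('release_time', -1)])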