# -*- coding: UTF-8 -*-
# @Time     : 2024-03-08 10:18:55
# @Author   : haochen zhong
# @File     : CrawlHenan.py
# @Software : PyCharm
# @Comment  : Crawl page data from the Henan Daily (河南日报) digital edition

import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

start_date = datetime.datetime.strptime('2007-10-13', '%Y-%m-%d')
"""Crawl start date"""
end_date = datetime.datetime.today()
"""Crawl end date"""
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
"""Custom request headers"""

# Connect to MongoDB
dbclient = pymongo.MongoClient('localhost', 27017)
"""Database connection"""
mydb = dbclient.dfdm_sjribao
henanribao = mydb.henanribao


def main():
    # Check whether the collection already exists
    collist = mydb.list_collection_names()
    if "henanribao" in collist:
        print("The henanribao collection exists; updating the database")
        # Release time of the most recent record in the database
        db_time = henanribao.find_one(sort=[('release_time', -1)])['release_time']
        print('Database is current through %s' % db_time)
        # Update from the latest record up to today
        input_time = datetime.datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('Database is already up to date')
    else:
        # Crawl the site and build the collection from scratch
        print("Collection does not exist; building the database!")
        getData(start_date, end_date)


def getContent(soup: BeautifulSoup) -> str:
    """Extract the article body, one paragraph per line."""
    content = ''
    for p in soup.select('#articleContent p'):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def getData(start_date, end_date):
    crawl_num = 0
    for i in range((end_date - start_date).days):
        # Iterate day by day over the date range
        date_now = start_date + datetime.timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y-%m/%d')
        base_url = "http://newpaper.dahe.cn/hnrb/html/" + date_now_s + '/'
        url = base_url + 'node_1.htm'  # e.g. http://newpaper.dahe.cn/hnrb/html/2024-03/08/node_1.htm
        print(url)
        try:
            response = requests.get(url, headers=headers, timeout=60)
            response.encoding = response.apparent_encoding
            print(f"Front-page request status: {response.status_code}")
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                # Each catalogue item links to one page (banmian) of the day's issue
                for item in soup.select(".layout-catalogue-item>a:nth-child(1)"):
                    banmianhao = item.text.split(":")[0]
                    banmianming = item.text.split(":")[-1]
                    url1 = base_url + item.get("href")
                    response2 = requests.get(url1, headers=headers, timeout=60)
                    response2.encoding = response2.apparent_encoding
                    print(f"Page request status: {response2.status_code}")
                    if response2.status_code == 200:
                        soup2 = BeautifulSoup(response2.text, "lxml")
                        # Each news item links to one article on the page
                        for item2 in soup2.select(".news-item a"):
                            title = item2.get("title", "").strip()
                            url2 = base_url + item2.get("href")
                            response3 = requests.get(url2, headers=headers, timeout=60)
                            response3.encoding = response3.apparent_encoding
                            print(f"Article request status: {response3.status_code}")
                            if response3.status_code == 200:
                                soup3 = BeautifulSoup(response3.text, "lxml")
                                content = getContent(soup3)
                                try:
                                    preTitle = soup3.select(".headline")[0].text.strip()
                                except Exception:
                                    preTitle = ""
                                try:
                                    subtitle = soup3.select(".subtitle")[0].text.strip()
                                except Exception:
                                    subtitle = ""
                                henanribao.insert_one({'banmianhao': banmianhao,
                                                       'banmianming': banmianming,
                                                       'title': title,
                                                       'subtitle': subtitle,
                                                       'preTitle': preTitle,
                                                       'author': '',
                                                       'keywordlist': '',
                                                       'detail_url': url2,
                                                       'release_time': date_now,
                                                       'insert_timestamp': datetime.datetime.today(),
                                                       'content': content})
                                crawl_num += 1
                                print(f"Henan Daily-{date_now_s}-{banmianhao}-{title}---collected successfully!")
                        time.sleep(random.randint(5, 10))
                        print(f"Henan Daily-{date_now_s}-{banmianhao}---collected successfully!")
                print(f"Henan Daily-{date_now_s}---collected successfully!")
        except Exception as result:
            # Insert a placeholder record so the failed date is still represented in the collection
            henanribao.insert_one({'banmianhao': 'empty',
                                   'banmianming': 'empty',
                                   'preTitle': 'empty',
                                   'title': 'empty',
                                   'subtitle': 'empty',
                                   'author': 'empty',
                                   'keywordlist': 'empty',
                                   'detail_url': url,
                                   'release_time': date_now,
                                   'insert_timestamp': datetime.datetime.today(),
                                   'content': 'empty'})
            print(result)
    print(f"Henan Daily crawl finished; {crawl_num} records collected in total!")


if __name__ == '__main__':
    main()