# -*- coding: utf-8 -*-
# @Time : 2024/01/09 14:15
# @UpdateTime : 2024/01/09 16:30
# @Author : Haochen Zhong
# @File : CrawlSiChuan.py
# @Software : PyCharm
# @Comment : This program crawls page-layout data from the Sichuan Daily digital newspaper
import random
import time
from datetime import datetime, timedelta

import pymongo
import requests
from bs4 import BeautifulSoup
# Default start and end dates for the database
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# Connect to MongoDB
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
sichuanribao = mydb.sichuanribao
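# Hedged addition (not in the original script): main() finds the newest record by
# sorting on release_time, so an index on that field keeps the lookup fast as the
# collection grows. pymongo's create_index is idempotent, so re-running is harmless.
sichuanribao.create_index([('release_time', pymongo.DESCENDING)])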


def main():
    # Check whether the collection already exists
    collist = mydb.list_collection_names()
    if "sichuanribao" in collist:
        print("Sichuan Daily collection exists; updating the database")
        # Timestamp of the newest record already stored
        db_time = sichuanribao.find_one(sort=[('release_time', -1)])[
            'release_time']  # alternatively: find().sort('_id', -1).limit(1)
        print('Database currently extends to %s' % db_time)
        # The update window runs from the newest stored date up to today
        input_time = datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('Database is already up to date')
    else:
        # Crawl the site and build the collection from scratch
        print("Collection does not exist; building the database!")
        getData(start_date, end_date)
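

def fetch_with_retries(url, max_tries=3):
    # Hedged sketch of a hypothetical helper (not used by the original script):
    # the crawler calls requests.get directly, so a single transient network error
    # aborts a whole day's crawl via the except branch in getData. Wrapping the
    # three request sites with a small retry loop and timeout would make the run
    # more robust; shown here under those assumptions.
    for attempt in range(max_tries):
        try:
            return requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            if attempt == max_tries - 1:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff: 1s, 2s, ...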


def getContent(soup):
    # Collect the article body: every <p> inside the second <li> of the #main2 container
    content = ''
    for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(2) p'):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content


def getSubtitle(soup):
    # Collect the subtitle: the first <h2> inside the first <li>, if present
    subtitle = ''
    if soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
        for p in soup.select('#main2 > div.main2_r > ul > li:nth-child(1) > h2:nth-child(1)'):
            para = p.text.strip()
            if para:
                subtitle += para
                subtitle += '\n'
    return subtitle
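

def _demo_parsers():
    # Hedged, self-contained sketch (not part of the original script): exercises
    # getContent/getSubtitle on the minimal page layout their selectors assume --
    # an #main2 container whose first <li> holds the <h2> subtitle and whose
    # second <li> holds the body paragraphs. Call manually to sanity-check the
    # selectors without hitting the live site.
    sample_html = """
    <div id="main2"><div class="main2_r"><ul>
      <li><h2>Sample subtitle</h2></li>
      <li><p>First paragraph.</p><p>Second paragraph.</p></li>
    </ul></div></div>
    """
    demo_soup = BeautifulSoup(sample_html, "lxml")
    print(getContent(demo_soup))   # -> "First paragraph.\nSecond paragraph.\n"
    print(getSubtitle(demo_soup))  # -> "Sample subtitle\n"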


def getData(start_date, end_date):
    crawl_num = 0
    for i in range((end_date - start_date).days):
        date_now = start_date + timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y%m%d')
        base_url = "https://epaper.scdaily.cn/shtml/scrb/"
        url = base_url + date_now_s + '/index.shtml'
        try:
            # headers must be passed as a keyword argument; the original passed it
            # positionally, which requests treats as query params
            response = requests.get(url, headers=headers)
            print(f"Level-1 link status: {response.status_code}")
            if response.status_code == 200:
                response.encoding = response.apparent_encoding
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select("#main > div.main_r > ul:nth-child(2) > li:nth-child(2) a"):
                    # Link text looks like "第01版：要闻"; split it into page number
                    # (banmianhao) and page name (banmianming). The delimiter was
                    # garbled in the source; the fullwidth colon '：' is an assumption.
                    banmianhao = item.text.split("：")[0]
                    banmianming = item.text.split("：")[-1]
                    url1 = "https://epaper.scdaily.cn" + item.get("href")
                    response2 = requests.get(url1, headers=headers)
                    print(f"Level-2 link status: {response2.status_code}")
                    if response2.status_code == 200:
                        response2.encoding = response2.apparent_encoding
                        soup2 = BeautifulSoup(response2.text, "lxml")
                        for item2 in soup2.select("#main > div.main_r > ul:nth-child(3) > li:nth-child(2) a"):
                            url2 = "https://epaper.scdaily.cn" + item2.get("href")
                            title = item2.get("title")
                            response3 = requests.get(url2, headers=headers)
                            print(f"Level-3 link status: {response3.status_code}")
                            if response3.status_code == 200:
                                response3.encoding = response3.apparent_encoding
                                soup3 = BeautifulSoup(response3.text, "lxml")
                                content = getContent(soup3)
                                subtitle = getSubtitle(soup3)
                                sichuanribao.insert_one({'banmianhao': banmianhao,
                                                         'banmianming': banmianming,
                                                         'title': title,
                                                         'subtitle': subtitle,
                                                         'h3title': '',
                                                         'author': '',
                                                         'keywordlist': '',
                                                         'detail_url': url2,
                                                         'release_time': date_now,
                                                         'insert_timestamp': datetime.today(),
                                                         'content': content})
                                print(f"Sichuan Daily--{date_now_s}-{banmianhao}-{title}----done")
                                crawl_num += 1
                                time.sleep(random.randint(3, 10))
                    print(f"Sichuan Daily--{date_now_s}-{banmianhao}----done")
                    time.sleep(random.randint(3, 10))
            print(f"Sichuan Daily--{date_now_s}-----done")
            time.sleep(random.randint(3, 10))
        except Exception as result:
            # On any failure, store a placeholder record so the gap stays visible
            sichuanribao.insert_one({'banmianhao': 'empty',
                                     'banmianming': 'empty',
                                     'preTitle': 'empty',
                                     'title': 'empty',
                                     'subtitle': 'empty',
                                     'author': 'empty',
                                     'keywordlist': 'empty',
                                     'detail_url': url,
                                     'release_time': date_now,
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
            print(result)
    print(f"Sichuan Daily crawl complete; collected {crawl_num} records in total!")


if __name__ == '__main__':
    main()
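    # Post-run sanity checks (hedged examples, not part of the original script):
    #   sichuanribao.count_documents({})                    # total records stored
    #   sichuanribao.find_one(sort=[('release_time', -1)])  # newest article record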