# -*- coding: utf-8 -*-
# @Time : 2022/12/27 14:15
# @UpdateTime : 2023/11/08 16:30
# @Author : Haochen Zhong
# @File : CrawlGuizhou.py
# @Software : PyCharm
# @Comment : Crawls page and article data from the Guizhou Daily digital edition
import random
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup
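# Dependencies (assumption: any recent versions work): pymongo, requests,
# beautifulsoup4, and lxml (used below as the BeautifulSoup parser); a local
# MongoDB instance must be listening on the default port.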
# Start and end dates for the crawl
start_date = datetime.strptime('2021-12-31', '%Y-%m-%d')
"""The Guizhou Daily digital edition has records starting from 2022-01-01"""
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
# Connect to MongoDB and select the collection
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
guizhouribao = mydb.guizhouribao
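# Optional (assumption, not part of the original script): an index on
# release_time speeds up the "newest record" lookup in main() once the
# collection grows large.
# guizhouribao.create_index([('release_time', pymongo.DESCENDING)])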
# Random sleep interval between requests (seconds)
sleeptime = random.randint(2, 15)
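# Note: sleeptime is sampled once at startup, so every pause in a run uses the
# same delay; calling random.randint(2, 15) inside the loops would randomize
# each pause instead (assumption: either behaviour is acceptable here).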


def main():
    # Check whether the collection already exists
    collist = mydb.list_collection_names()
    if "guizhouribao" in collist:  # the collection exists
        print("Guizhou collection exists; updating the database")
        # Timestamp of the newest record in the collection
        db_time = guizhouribao.find_one(sort=[('release_time', -1)])[
            'release_time']  # alternatively: find().sort('_id', -1).limit(1)
        print('Database is current up to %s' % db_time)
        # Update the database up to today
        input_time = datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('Database needs no update')
    else:
        # Crawl the site and build the database from scratch
        print("Database does not exist; creating it!")
        getData(start_date, end_date)


def get_content(soup3):
    """Concatenate all article paragraphs found under the #ozoom container."""
    content = ""
    for p in soup3.select("#ozoom p"):
        para = p.text.strip()
        if para:
            content += para
            content += '\n'
    return content
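
# A minimal usage sketch for get_content (hypothetical markup, not taken from
# the live site):
#   soup = BeautifulSoup('<div id="ozoom"><p>First</p><p>Second</p></div>', "lxml")
#   get_content(soup)  # -> "First\nSecond\n"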


def getData(start_date, end_date):
    crawl_num = 0
    for i in range((end_date - start_date).days):
        date_now = start_date + timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y%m/%d')
        base_url = "http://szb.gzrbs.com.cn/pc/layout/" + date_now_s + "/"
        url = base_url + "node_01.html"
        # e.g. http://szb.gzrbs.com.cn/pc/layout/202201/01/node_01.html
        try:
            response = requests.get(url=url, headers=headers, timeout=(30, 45))
            response.encoding = response.apparent_encoding
            print(f"Level-1 request status: {response.status_code}")
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select(".btn-block"):
                    # Layout links read like "01版：要闻"; split on the full-width
                    # colon (the original separator was an ambiguous-Unicode
                    # character hidden by the file viewer; "：" is assumed)
                    banmianming = item.text.split("：")[-1]
                    banmianhao = item.text.split("：")[0]
                    url1 = base_url + item.get("href")
                    response2 = requests.get(url=url1, headers=headers, timeout=(30, 45))
                    response2.encoding = response2.apparent_encoding
                    print(f"Level-2 request status: {response2.status_code}")
                    if response2.status_code == 200:
                        soup2 = BeautifulSoup(response2.text, "lxml")
                        for item2 in soup2.select(".resultList a"):
                            title = item2.text.strip()
                            url2 = "http://szb.gzrbs.com.cn/pc/" + item2.get("href")[9:]
                            # e.g. http://szb.gzrbs.com.cn/pc/cont/202201/02/content_42202.html
                            response3 = requests.get(url=url2, headers=headers, timeout=(30, 45))
                            response3.encoding = response3.apparent_encoding
                            print(f"Level-3 request status: {response3.status_code}")
                            if response3.status_code == 200:
                                soup3 = BeautifulSoup(response3.text, "lxml")
                                # Prefer the metadata fields on the article page;
                                # fall back gracefully when a field is missing
                                try:
                                    title = soup3.select("#Title")[0].text.strip()
                                except IndexError:
                                    pass  # keep the title taken from the section listing
                                try:
                                    subtitle = soup3.select("#SubTitle")[0].text.strip()
                                except IndexError:
                                    subtitle = ""
                                try:
                                    preTitle = soup3.select("#PreTitle")[0].text.strip()
                                except IndexError:
                                    preTitle = ""
                                content = get_content(soup3)
                                guizhouribao.insert_one({'banmianhao': banmianhao,
                                                         'banmianming': banmianming,
                                                         'preTitle': preTitle,
                                                         'title': title,
                                                         'subtitle': subtitle,
                                                         'author': '',
                                                         'keywordlist': 'empty',
                                                         'detail_url': url2,
                                                         'release_time': date_now,
                                                         'insert_timestamp': datetime.today(),
                                                         'content': content})
                                crawl_num += 1
                                print(f"Guizhou Daily-{date_now_s}-{banmianming}-{title}-done")
                                time.sleep(sleeptime)
                        print(f"Guizhou Daily-{date_now_s}-{banmianming}-done")
                        time.sleep(sleeptime)
                print(f"Guizhou Daily-{date_now_s}-done")
        except Exception as result:
            # Record a placeholder document so the failed date is still visible
            guizhouribao.insert_one({'banmianhao': 'empty',
                                     'banmianming': 'empty',
                                     'preTitle': 'empty',
                                     'title': 'empty',
                                     'subtitle': 'empty',
                                     'author': 'empty',
                                     'keywordlist': 'empty',
                                     'detail_url': url,
                                     'release_time': date_now,
                                     'insert_timestamp': datetime.today(),
                                     'content': 'empty'})
            print(result)
    print(f"Guizhou Daily crawl finished; {crawl_num} records collected in total!")


if __name__ == '__main__':
    main()
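
# Usage (assumption): start MongoDB locally, then run  python CrawlGuizhou.py
# The first run builds the collection from 2022-01-01 onward; subsequent runs
# only fetch days newer than the latest release_time already stored.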