# -*- coding: utf-8 -*-
# @Time : 2024-01-17 14:24:59
# @Author : haochen zhong
# @File : CrawlHainan.py
# @Software : PyCharm
# @Comment : Crawl the Hainan Daily (news.hndaily.cn) e-paper and store articles in MongoDB
import random
import re
import time
from datetime import timedelta, datetime
import pymongo
import requests
from bs4 import BeautifulSoup
# Start and end dates for the database
start_date = datetime.strptime('2008-02-29', '%Y-%m-%d')
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'}
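
# Optional hardening (a sketch, not part of the original script): route requests
# through a Session with automatic retries so transient 5xx errors do not abort
# a day's crawl. The Retry parameters below are illustrative assumptions.
def make_session():
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.headers.update(headers)
    return session
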
# Connect to MongoDB
client = pymongo.MongoClient('localhost', 27017)
mydb = client.dfdm_sjribao
hainanribao = mydb.hainanribao
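# Optional (not in the original): an index on release_time speeds up the
# descending-sort lookup that main() uses to find the newest record;
# create_index is idempotent, so re-running the script is safe.
hainanribao.create_index([('release_time', pymongo.DESCENDING)])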

def main():
    # Check whether the collection already exists
    collist = mydb.list_collection_names()
    if "hainanribao" in collist:
        print("Hainan Daily collection exists; updating the database")
        # Timestamp of the newest record in the database
        db_time = hainanribao.find_one(sort=[('release_time', -1)])['release_time']
        print(f'Database is current through {db_time}')
        # Update from the last stored date through today
        input_time = datetime.today()
        if db_time < input_time:
            getData(db_time, input_time)
        else:
            print('Database is already up to date')
    else:
        print("Collection does not exist; building the database from scratch")
        # Crawl the full date range and build the database
        getData(start_date, end_date)

# Parse the article body out of a page
def parse_html_text(soup):
    """
    :param soup: BeautifulSoup object of the article page
    :return: body text as a string
    """
    content = ''
    if soup.select('#ozoom'):
        content = soup.select('#ozoom')[0].text.strip()
    return content
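
# #ozoom is assumed to be the container that holds the article body on
# news.hndaily.cn layout pages; if the site changes its template, this
# selector (and the .font01/.font02 selectors used below) will need updating.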

def parse_subtitle(soup):
    item = soup.select('.font02')
    if re.findall(r'article-subtitle>-->(.*?)<!--', str(item)):
        subtitle = re.findall(r'article-subtitle>-->(.*?)<!--', str(item))[0]
    else:
        subtitle = ''
    return subtitle


def parse_h3title(soup):
    item = soup.select('.font02')
    if re.findall(r'article-pretitle>-->(.*?)<!--', str(item)):
        h3title = re.findall(r'article-pretitle>-->(.*?)<!--', str(item))[0]
    else:
        h3title = ''
    return h3title


def parse_author(soup):
    item = soup.select('.font02')
    # The original reused the article-subtitle marker here, which would return
    # the subtitle instead of the author; article-author (assumed from the
    # marker naming convention above) is the likely intended tag.
    if re.findall(r'article-author>-->(.*?)<!--', str(item)):
        author = re.findall(r'article-author>-->(.*?)<!--', str(item))[0]
    else:
        author = ''
    return author
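
# parse_subtitle, parse_h3title and parse_author all read values that the page
# embeds inside HTML comments within the .font02 element, roughly of the form
# (assumed, reconstructed from the regexes above):
#     <!--article-pretitle>-->kicker text<!--...-->
#     <!--article-subtitle>-->subtitle text<!--...-->
#     <!--article-author>-->author name<!--...-->
# str(item) keeps these comments verbatim, so plain regexes can match them.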

# Crawl the pages and build the database
def getData(start_date, end_date):
    crawl_num = 0
    for i in range((end_date - start_date).days):
        date_now = start_date + timedelta(days=i + 1)
        date_now_s = date_now.strftime('%Y-%m/%d')
        base_url = "http://news.hndaily.cn/html/" + date_now_s + '/'
        url = base_url + 'node_1.htm'
        # Level 1: the layout index page for this date
        try:
            response = requests.get(url, headers=headers)
            response.encoding = response.apparent_encoding
            print(f'Level-1 request status: {response.status_code}')
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "lxml")
                for item in soup.select('#pageLink'):
                    # The link text is "<page number>：<page name>". The original
                    # separator was an ambiguous Unicode character that did not
                    # survive copying; a fullwidth colon is assumed here.
                    banmianhao = item.text.split("：")[0].strip()
                    banmianming = item.text.split("：")[-1].strip()
                    if banmianming == "广告":  # skip advertisement pages
                        continue
                    # Level 2: the article list for one page
                    url1 = base_url + item.get('href')
                    response2 = requests.get(url1, headers=headers)
                    response2.encoding = response2.apparent_encoding
                    print(f'Level-2 request status: {response2.status_code}')
                    if response2.status_code == 200:
                        soup1 = BeautifulSoup(response2.text, "lxml")
                        for item1 in soup1.select('#main-ed-articlenav-list tr td div a'):
                            # Level 3: one article page
                            detail_url = base_url + item1.get('href')
                            print(detail_url)
                            title = item1.text.strip()
                            response3 = requests.get(detail_url, headers=headers)
                            response3.encoding = response3.apparent_encoding
                            print(f'Level-3 request status: {response3.status_code}')
                            if response3.status_code == 200:
                                soup2 = BeautifulSoup(response3.text, "lxml")
                                try:
                                    title = soup2.select('.font01')[0].text.strip()
                                except IndexError:
                                    pass
                                subtitle = parse_subtitle(soup2)
                                h3title = parse_h3title(soup2)
                                author = parse_author(soup2)
                                content = parse_html_text(soup2)
                                hainanribao.insert_one({'banmianhao': banmianhao,
                                                        'banmianming': banmianming,
                                                        'title': title,
                                                        'subtitle': subtitle,
                                                        'h3title': h3title,
                                                        'author': author,
                                                        'keywordlist': '',
                                                        'detail_url': detail_url,
                                                        'release_time': date_now,
                                                        'insert_timestamp': datetime.today(),
                                                        'content': content})
                                print(f"Hainan Daily-{date_now_s}-{banmianhao}-{banmianming}-{title} done")
                                crawl_num += 1
                            time.sleep(random.randint(3, 10))  # polite pause between articles
                        print(f"Hainan Daily-{date_now_s}-{banmianhao}-{banmianming} done")
                        time.sleep(random.randint(3, 10))  # polite pause between pages
            print(f"Hainan Daily-{date_now_s} done")
        except Exception as result:
            # On any failure, store a placeholder record for this date so the
            # gap stays visible in the database
            hainanribao.insert_one({'banmianhao': 'empty',
                                    'banmianming': 'empty',
                                    'title': 'empty',
                                    'subtitle': 'empty',
                                    'h3title': 'empty',
                                    'author': 'empty',
                                    'keywordlist': 'empty',
                                    'detail_url': url,
                                    'release_time': date_now,
                                    'insert_timestamp': datetime.today(),
                                    'content': 'empty'})
            print(result)
    print(f"Hainan Daily crawl finished; {crawl_num} articles collected this run!")

if __name__ == "__main__":
    main()