import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup
# Mimic a regular browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/51.0.2704.63 Safari/537.36',
    'Connection': 'close',
}
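
# A minimal sketch (an assumption, not part of the original script): a Session
# with retry/backoff would make the two requests.get calls below more resilient
# to transient network errors. make_session is a hypothetical helper; to use it,
# swap session.get in where requests.get is called.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session() -> requests.Session:
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.headers.update(headers)
    return session
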
# Connect to MongoDB: database sjzf_zcwj, collection shanghaizcwj
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shanghaizcwj = mydb.shanghaizcwj
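# Optional (an assumption, not in the original script): a unique index on "url"
# turns the find_one() dedupe check below into an index lookup instead of a
# collection scan, and also rejects duplicate inserts at the database level.
shanghaizcwj.create_index("url", unique=True)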
base_url = "https://www.shanghai.gov.cn"


def getContent(soup: BeautifulSoup) -> str:
    """
    Extract the article body text.

    :param soup: parsed detail page
    :return: paragraphs joined with newlines (empty if #ivs_content is missing)
    """
    content: str = ""
    for p in soup.select('#ivs_content p'):
        para: str = p.text.strip()
        if para:
            content += para + '\n'
    return content


def getData():
    pages = 28  # number of index pages on the site
    for i in range(1, pages + 1):
        # The first index page has a different filename from the rest
        if i == 1:
            url = "https://www.shanghai.gov.cn/xxzfgzwj/index.html"
        else:
            url = f"https://www.shanghai.gov.cn/xxzfgzwj/index_{i}.html"
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml")
            trList = soup.select(".trout-region-list tbody tr")
            for item in trList:
                data = item.select("a")[0]
                title = data.get("title", "")
                detail_url = base_url + data.get("href", "")
                print(detail_url)
                # Skip documents that were already collected in a previous run
                if shanghaizcwj.find_one({"url": detail_url}):
                    continue
                # .text-color may be absent on some rows, so guard against None
                subtitle_el = data.select_one(".text-color")
                subtitle = subtitle_el.text.strip() if subtitle_el else ""
                response2 = requests.get(url=detail_url, headers=headers)
                response2.encoding = response2.apparent_encoding
                print(response2.status_code)
                if response2.status_code == 200:
                    soup2 = BeautifulSoup(response2.text, "lxml")
                    content: str = getContent(soup=soup2)
                    shanghaizcwj.insert_one({
                        "title": title,
                        "subtitle": subtitle,
                        "content": content,
                        "url": detail_url,
                    })
                    # Pause a few seconds between articles to be polite to the server
                    time.sleep(random.randint(3, 5))
                    print(title, "scraped")


if __name__ == "__main__":
    getData()
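
# Usage note (a sketch, not from the original script): once the crawl finishes,
# the stored documents can be read back with a normal pymongo query, e.g.:
#
#   for doc in shanghaizcwj.find({}, {"title": 1, "url": 1}).limit(5):
#       print(doc["title"], doc["url"])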