73 lines
2.3 KiB
Python
73 lines
2.3 KiB
Python
import datetime
|
|
import random
|
|
import time
|
|
|
|
import pymongo
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Request headers sent with every HTTP call so the crawler presents itself
# as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/51.0.2704.63 Safari/537.36'
    ),
    'Connection': 'close',
}

# MongoDB handles: local server -> database "sjzf_zcwj" -> collection
# "shanghaizcwj", which receives the scraped documents.
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client['sjzf_zcwj']
shanghaizcwj = mydb['shanghaizcwj']

# Site root used to build absolute document URLs from listing-page hrefs.
base_url = "https://www.shanghai.gov.cn"
|
|
|
|
|
|
def getContent(soup: BeautifulSoup) -> str:
    """
    Extract the article body text from a parsed page.

    :param soup: parsed document; paragraphs are read from ``#ivs_content p``
    :return: every non-empty paragraph, whitespace-stripped and followed by a
        newline; an empty string when no paragraph has text
    """
    stripped = (node.text.strip() for node in soup.select('#ivs_content p'))
    # Paragraphs that are empty after stripping contribute nothing at all.
    return "".join(text + '\n' for text in stripped if text)
|
|
|
|
|
|
def _fetch_soup(url: str, timeout: float = 30.0) -> "BeautifulSoup | None":
    """Download *url* and parse it with lxml; return None on any failure.

    A failure is either a network-level error raised by ``requests`` or a
    response whose status code is not 200. Both are reported on stdout so a
    single bad page does not abort the whole crawl.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(url, exc)
        return None
    if response.status_code != 200:
        print(url, response.status_code)
        return None
    # Let requests guess the charset from the body before decoding the text.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, "lxml")


def getData(pages: int = 28, timeout: float = 30.0) -> None:
    """Crawl the listing pages and store every new document in MongoDB.

    For each listing page, every table row under ``.trout-region-list`` is
    inspected. The row's link supplies the title and the document URL; the
    document page is downloaded, its body extracted with ``getContent`` and
    the result inserted into the ``shanghaizcwj`` collection. URLs that are
    already stored are skipped without being downloaded again.

    :param pages: number of listing pages to walk, starting at page 1
    :param timeout: seconds to wait for each HTTP request before giving up
    :return: None; results are written to the database and progress is printed
    """
    from urllib.parse import urljoin

    for page in range(1, pages + 1):
        # The first listing page has no numeric suffix.
        if page == 1:
            list_url = "https://www.shanghai.gov.cn/xxzfgzwj/index.html"
        else:
            list_url = f"https://www.shanghai.gov.cn/xxzfgzwj/index_{page}.html"

        list_soup = _fetch_soup(list_url, timeout)
        if list_soup is None:
            continue

        for row in list_soup.select(".trout-region-list tbody tr"):
            link = row.select_one("a")
            # Rows without a usable link cannot be followed.
            if link is None or not link.get("href"):
                continue

            title = link.get("title", "")
            # urljoin copes with absolute hrefs and hrefs lacking a leading slash.
            url = urljoin(base_url, link["href"])
            print(url)
            if shanghaizcwj.find_one({"url": url}):
                continue

            subtitle_tag = link.select_one(".text-color")
            subtitle = subtitle_tag.text.strip() if subtitle_tag is not None else ""

            detail_soup = _fetch_soup(url, timeout)
            if detail_soup is not None:
                shanghaizcwj.insert_one({
                    "title": title,
                    "subtitle": subtitle,
                    "content": getContent(soup=detail_soup),
                    "url": url,
                })
                print(title, "采集完成")

            # Pause after every detail request, successful or not, to stay
            # polite to the server.
            time.sleep(random.randint(3, 5))
|
|
|
|
|
|
# Start the crawl; this call sits at module level, so it runs whenever the file is executed.
getData()
|