import datetime
import random
import time

import pymongo
import requests
from bs4 import BeautifulSoup

# Browser-like headers so the site serves normal pages to the scraper;
# 'Connection: close' asks the server not to keep connections open.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/51.0.2704.63 Safari/537.36',
           'Connection': 'close'}

# MongoDB collection that persists the scraped policy documents.
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shanghaizcwj = mydb.shanghaizcwj

base_url = "https://www.shanghai.gov.cn"

# Seconds to wait for any HTTP response; without a timeout requests can hang forever.
REQUEST_TIMEOUT = 30


def getContent(soup: BeautifulSoup) -> str:
    """
    Extract the article body text from a detail-page soup.

    :param soup: parsed HTML of a policy-document detail page
    :return: the non-empty paragraphs under ``#ivs_content``, one per line
    """
    paragraphs = (p.text.strip() for p in soup.select('#ivs_content p'))
    # ''.join is linear; the original repeated ``content +=`` which is
    # potentially quadratic for long articles.
    return ''.join(para + '\n' for para in paragraphs if para)


def getData():
    """Crawl every list page and insert each not-yet-seen document into MongoDB."""
    pages = 28
    for i in range(1, pages + 1):
        # The first list page has no numeric suffix.
        if i == 1:
            url = "https://www.shanghai.gov.cn/xxzfgzwj/index.html"
        else:
            url = f"https://www.shanghai.gov.cn/xxzfgzwj/index_{i}.html"
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.encoding = response.apparent_encoding
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.text, "lxml")
        trList = soup.select(".trout-region-list tbody tr")
        for item in trList:
            data = item.select("a")[0]
            title = data.get("title", "")
            # Distinct name: the original reassigned ``url`` here, shadowing
            # the list-page URL of the enclosing loop.
            detail_url = base_url + data.get("href", "")
            print(detail_url)
            # Skip documents already collected in a previous run.
            if shanghaizcwj.find_one({"url": detail_url}):
                continue
            subtitle_node = data.select_one(".text-color")
            # Guard against rows without a subtitle span: select_one returns
            # None and ``.text`` would raise AttributeError.
            subtitle = subtitle_node.text.strip() if subtitle_node else ""
            response2 = requests.get(url=detail_url, headers=headers,
                                     timeout=REQUEST_TIMEOUT)
            response2.encoding = response2.apparent_encoding
            print(response2.status_code)
            if response2.status_code == 200:
                soup2 = BeautifulSoup(response2.text, "lxml")
                content: str = getContent(soup=soup2)
                shanghaizcwj.insert_one({
                    "title": title,
                    "subtitle": subtitle,
                    "content": content,
                    "url": detail_url,
                })
            # Randomized delay so the crawler does not hammer the server.
            time.sleep(random.randint(3, 5))
            print(title, "采集完成")


if __name__ == "__main__":
    # Guard keeps the crawl from firing as a side effect of an import.
    getData()