"""Crawl policy-document metadata from www.sz.gov.cn into MongoDB.

The script walks three levels of the site's ``postmeta`` JSON endpoints:
root nodes (``yearList``) -> their ``children`` -> each child's ``articles``,
then fetches the per-article detail JSON, merges it into the listing entry
and stores the result in the ``sjzf_zcwj.shenzhenzcwj`` collection.
"""
import asyncio
import datetime
import random
import time

import pymongo
import requests
from httpx import AsyncClient

# Send a browser-like User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/51.0.2704.63 Safari/537.36',
}

# MongoDB connection, database and target collection.
client = pymongo.MongoClient('localhost', 27017)
mydb = client.sjzf_zcwj
shenzhenzcwj = mydb.shenzhenzcwj

# Ids of the root nodes to crawl (each is requested as /postmeta/i/<id>.json).
# NOTE(review): the name suggests one node per year -- confirm against the site.
yearList = [
    '158104', '148604', '141910', '125615', '103604',
    '101620', '101621', '101622', '101623', '101624',
    '101625', '101626', '101627', '101628', '101629',
    '101630', '101631', '101632', '101633', '101634',
    '101635', '101636', '101637', '101638',
    '146351', '146338', '146325', '146311', '146298',
    '146285', '146272', '146205', '146190',
    '145973', '145972', '145970',
]


def update_json_data(original_data, new_data):
    """Overlay the non-empty values of ``new_data`` onto ``original_data``.

    Values that are ``None`` or the empty string are ignored, so they never
    overwrite an existing value.

    Args:
        original_data: Mapping to update. It is modified in place.
        new_data: Mapping whose non-empty values take precedence.

    Returns:
        The same ``original_data`` object, after the update.
    """
    original_data.update(
        (key, value)
        for key, value in new_data.items()
        if value is not None and value != ""
    )
    return original_data


async def _fetch_json(http_client, url, *, show_url=True):
    """GET ``url`` and return its decoded JSON body, or ``None`` if not HTTP 200.

    Args:
        http_client: An open ``httpx.AsyncClient``.
        url: Address to request.
        show_url: When true, print the URL before requesting it.

    Returns:
        The parsed JSON value on a 200 response, otherwise ``None``.
    """
    if show_url:
        print(url)
    response = await http_client.get(url=url)
    # Decode the body with the charset announced in the Content-Type header.
    response.encoding = response.charset_encoding
    print(response.status_code)
    if response.status_code != 200:
        return None
    return response.json()


async def getData():
    """Crawl every node in ``yearList`` and store new articles in MongoDB.

    Articles whose ``id`` is already present in the collection are skipped,
    so the crawl can be re-run to pick up where it left off. Nodes whose
    response is not HTTP 200, or that carry no ``children`` / ``articles``
    entry, are skipped instead of aborting the crawl.
    """
    async with AsyncClient(headers=headers, timeout=60, verify=False) as http_client:
        for root_id in yearList:
            root = await _fetch_json(
                http_client, f"http://www.sz.gov.cn/postmeta/i/{root_id}.json"
            )
            if not root:
                continue

            for child in root.get("children") or []:
                listing = await _fetch_json(
                    http_client,
                    f"http://www.sz.gov.cn/postmeta/i/{child['id']}.json",
                )
                if not listing:
                    continue

                # NOTE(review): the first article entry is skipped on purpose
                # by the original code; the reason is not visible here.
                for article in (listing.get("articles") or [])[1:]:
                    article_id = article["id"]
                    if shenzhenzcwj.find_one({"id": article_id}):
                        continue  # already collected on a previous run

                    # Detail files are sharded by id // 1_000_000 and id // 1_000.
                    detail_url = (
                        "http://www.sz.gov.cn/postmeta/p/"
                        f"{article_id // 1000000}/{article_id // 1000}/{article_id}.json"
                    )
                    detail = await _fetch_json(http_client, detail_url, show_url=False)
                    if detail is not None:
                        record = update_json_data(article, detail)
                        shenzhenzcwj.insert_one(record)
                        print(record["title"], "采集完成")

                    # Pause 2-3 s after every detail request (successful or
                    # not) so the server is never hit in a tight loop.
                    await asyncio.sleep(random.randint(2, 3))


asyncio.run(getData())