# Repository web-view header (not part of the program): 62 lines, 2.8 KiB, Python
# Timestamp shown by the viewer: 2024-11-09 17:00:30 +08:00
import asyncio
import random
import pymongo
from httpx import AsyncClient
# Browser-like User-Agent header sent with every HTTP request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.63 Safari/537.36"
    ),
}

# MongoDB handles: local server -> database "sjzf_zcwj" -> collection "shenzhenzcwj".
client = pymongo.MongoClient(host="localhost", port=27017)
mydb = client["sjzf_zcwj"]
shenzhenzcwj = mydb["shenzhenzcwj"]

# Ids that getData substitutes into the ".../postmeta/i/{id}.json" index URL.
yearList = [
    "158104", "148604", "141910", "125615", "103604", "101620",
    "101621", "101622", "101623", "101624", "101625", "101626",
    "101627", "101628", "101629", "101630", "101631", "101632",
    "101633", "101634", "101635", "101636", "101637", "101638",
    "146351", "146338", "146325", "146311", "146298", "146285",
    "146272", "146205", "146190", "145973", "145972", "145970",
]
def update_json_data(original_data, new_data):
    """Merge the meaningful entries of *new_data* into *original_data*.

    Every key of ``new_data`` whose value is neither ``None`` nor the empty
    string overwrites (or is added to) ``original_data``. Other falsy values
    such as ``0`` or ``False`` are copied like any other value.

    Args:
        original_data: Mapping that is updated in place.
        new_data: Mapping providing replacement values; ``None`` or an empty
            mapping leaves ``original_data`` untouched.

    Returns:
        The same ``original_data`` object, after the in-place update.
    """
    if not new_data:
        return original_data
    for key, value in new_data.items():
        # Blank values must not wipe out data that is already present.
        if value is not None and value != "":
            original_data[key] = value
    return original_data
async def _fetch_json(http, url):
    """Fetch *url* and return its decoded JSON body, or ``None`` on failure.

    The HTTP status code of every received response is printed. ``None`` is
    returned when the request raises an httpx error, when the status is not
    200, or when the body is not valid JSON, so that a single bad endpoint
    does not abort the whole crawl.
    """
    import httpx  # local import: the file only imports AsyncClient by name

    try:
        response = await http.get(url=url)
    except httpx.HTTPError as exc:
        print(url, "request failed:", repr(exc))
        return None
    # Only pin the encoding when the server actually declared a charset.
    if response.charset_encoding is not None:
        response.encoding = response.charset_encoding
    print(response.status_code)
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(url, "invalid JSON:", repr(exc))
        return None


async def getData():
    """Crawl the sz.gov.cn postmeta JSON endpoints and store new articles.

    For every id in ``yearList`` the index JSON is fetched, then the JSON of
    each entry under its ``children`` key, then the detail JSON of every
    entry of that child's ``articles`` list except the first one. Articles
    whose ``id`` is already present in the ``shenzhenzcwj`` collection are
    skipped; the others are merged with their detail JSON through
    ``update_json_data`` and inserted into the collection.

    Responses are assumed to be JSON objects (dicts) - TODO confirm.
    """
    # The HTTP client is named `http` so it does not shadow the module-level
    # MongoDB `client`. verify=False disables TLS certificate checks; the
    # URLs requested here are plain http.
    async with AsyncClient(headers=headers, timeout=60, verify=False) as http:
        for node_id in yearList:
            index_url = f"http://www.sz.gov.cn/postmeta/i/{node_id}.json"
            print(index_url)
            index = await _fetch_json(http, index_url)
            if index is None:
                continue

            for child in index.get("children") or []:
                child_url = f"http://www.sz.gov.cn/postmeta/i/{child['id']}.json"
                print(child_url)
                listing = await _fetch_json(http, child_url)
                if listing is None:
                    continue

                # The first entry of "articles" is deliberately skipped, as in
                # the original code.
                for article in (listing.get("articles") or [])[1:]:
                    article_id = article["id"]
                    # Already stored: nothing to download.
                    if shenzhenzcwj.find_one({"id": article_id}):
                        continue

                    detail_url = (
                        "http://www.sz.gov.cn/postmeta/p/"
                        f"{article_id // 1000000}/{article_id // 1000}/{article_id}.json"
                    )
                    detail = await _fetch_json(http, detail_url)
                    if detail is not None:
                        record = update_json_data(article, detail)
                        shenzhenzcwj.insert_one(record)
                        print(record.get("title"), "采集完成")

                    # Pause 2-3 s after each detail request.
                    # NOTE(review): the original indentation was lost, so it
                    # is unclear whether the pause ran only after a successful
                    # insert - confirm the intended placement.
                    await asyncio.sleep(random.randint(2, 3))
# Start the crawl only when this file is executed as a script, so that
# importing the module does not trigger network and database activity.
if __name__ == "__main__":
    asyncio.run(getData())