63 lines
2.8 KiB
Python
63 lines
2.8 KiB
Python
|
|
import asyncio
|
||
|
|
import datetime
|
||
|
|
import random
|
||
|
|
import time
|
||
|
|
|
||
|
|
import pymongo
|
||
|
|
import requests
|
||
|
|
from httpx import AsyncClient
|
||
|
|
|
||
|
|
# Request headers sent with every HTTP call so the crawler presents itself
# as an ordinary desktop Chrome browser.
headers = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; WOW64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/51.0.2704.63",
            "Safari/537.36",
        )
    ),
}
|
||
|
|
# Connect to the MongoDB server on localhost:27017 and select the
# `shenzhenzcwj` collection inside the `sjzf_zcwj` database.
client = pymongo.MongoClient(host="localhost", port=27017)
mydb = client["sjzf_zcwj"]
shenzhenzcwj = mydb["shenzhenzcwj"]
|
||
|
|
|
||
|
|
# Node IDs that getData() substitutes into the postmeta/i/{id}.json URL.
# NOTE(review): the name suggests one node per year -- confirm against the site.
yearList = [
    "158104", "148604", "141910", "125615", "103604",
    # 101620 .. 101638 form one unbroken run of consecutive IDs.
    *(str(node_id) for node_id in range(101620, 101639)),
    "146351", "146338", "146325", "146311", "146298", "146285",
    "146272", "146205", "146190", "145973", "145972", "145970",
]
|
||
|
|
|
||
|
|
|
||
|
|
def update_json_data(original_data, new_data):
    """Overlay the non-empty entries of ``new_data`` onto ``original_data``.

    Entries of ``new_data`` whose value is ``None`` or the empty string are
    ignored, so they never overwrite a value already in ``original_data``.
    Every other value (including ``0`` and ``False``) replaces or adds its
    key.

    Args:
        original_data: Mapping to update; it is modified in place.
        new_data: Mapping supplying the replacement values.

    Returns:
        The same ``original_data`` object, after the update.
    """
    usable = {
        field: content
        for field, content in new_data.items()
        if content is not None and content != ""
    }
    original_data.update(usable)
    return original_data
|
||
|
|
|
||
|
|
|
||
|
|
async def _fetch_json(http, url):
    """GET ``url`` and return its decoded JSON body, or ``None`` on failure.

    A transport error, a non-200 status or a body that is not valid JSON is
    reported on stdout and turned into ``None`` so that one bad request does
    not abort the whole crawl.
    """
    # Local import: the file only imports AsyncClient from httpx at the top.
    from httpx import HTTPError

    try:
        response = await http.get(url=url)
    except HTTPError as exc:
        print(url, "request failed:", exc)
        return None

    print(response.status_code)
    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(url, "returned invalid JSON:", exc)
        return None


async def _collect_article(http, summary):
    """Fetch the detail record for one article, merge it and store it."""
    article_id = summary["id"]
    # The detail path is sharded by the integer ID: <id // 10^6>/<id // 10^3>/<id>.
    url3 = f"http://www.sz.gov.cn/postmeta/p/{article_id // 1000000}/{article_id // 1000}/{article_id}.json"
    detail = await _fetch_json(http, url3)
    if detail is None:
        return

    # Non-empty detail fields override the listing fields (see update_json_data).
    record = update_json_data(summary, detail)
    shenzhenzcwj.insert_one(record)
    # The message suffix means "collection finished".
    print(record.get("title"), "采集完成")


async def getData():
    """Crawl the sz.gov.cn postmeta JSON API into the ``shenzhenzcwj`` collection.

    For every ID in ``yearList`` the node JSON is fetched; for each entry of
    its ``children`` the child node JSON is fetched; for each entry of that
    node's ``articles`` (the first entry is skipped) the detail JSON is
    fetched, merged into the listing entry and inserted into MongoDB.
    Articles whose ``id`` is already stored are skipped, so an interrupted
    run can simply be restarted.

    Failed requests and nodes without ``children`` / ``articles`` are
    skipped instead of raising.

    Returns:
        None.
    """
    # NOTE(review): verify=False disables TLS certificate checks; every URL
    # built here is plain http, so it only matters if the server redirects
    # to https. Kept as in the original.
    # NOTE(review): the pymongo calls below are synchronous and block the
    # event loop; harmless while this coroutine is the only task running.
    async with AsyncClient(headers=headers, timeout=60, verify=False) as http:
        for node_id in yearList:
            url = f"http://www.sz.gov.cn/postmeta/i/{node_id}.json"
            print(url)
            node = await _fetch_json(http, url)
            if node is None:
                continue

            for child in node.get("children") or ():
                url2 = f"http://www.sz.gov.cn/postmeta/i/{child['id']}.json"
                print(url2)
                listing = await _fetch_json(http, url2)
                if listing is None:
                    continue

                # The first article entry is skipped, as in the original.
                # NOTE(review): presumably a placeholder/header row -- confirm.
                for summary in (listing.get("articles") or [])[1:]:
                    if shenzhenzcwj.find_one({"id": summary["id"]}):
                        continue  # already collected on an earlier run

                    await _collect_article(http, summary)
                    # Throttle: pause 2-3 s after every detail request,
                    # whether or not it succeeded.
                    await asyncio.sleep(random.randint(2, 3))
|
||
|
|
# Script entry point: run the getData() coroutine to completion.
# NOTE(review): this executes whenever the module is loaded, including on import.
asyncio.run(getData())
|