```python
# -*- coding: utf-8 -*-
# @Time : 2021/12/2 20:34
# @Author : Hongshuang Gu
# @File : collector for the Shanghai Jiefang Daily (解放日报) data source
# @Software : PyCharm
import asyncio
import random
from datetime import timedelta, datetime

import httpx
from motor.motor_asyncio import AsyncIOMotorClient

# Start and end dates covered by the database
start_date = datetime.strptime('2020-01-01', '%Y-%m-%d')
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}

# Connect to MongoDB
client = AsyncIOMotorClient('mongodb://localhost:27017')
mydb = client['dfdm_zxsribao']
shjfribao = mydb['shjfribao']


async def main():
    # Check whether the collection already exists
    collist = await mydb.list_collection_names()
    if "shjfribao" in collist:
        print("Shanghai Jiefang Daily collection exists; updating the database")
        # Timestamp of the newest document in the database
        # (alternatively: find().sort('_id', -1).limit(1))
        latest = await shjfribao.find_one(sort=[('release_time', -1)])
        db_time = latest['release_time']
        print('Database is current up to %s' % db_time)
        # Target time up to which to update
        input_time = datetime.today()
        if db_time < input_time:
            await get_data(db_time, input_time)
        else:
            print('Database needs no update')
    else:
        # Scrape the site and build the database from scratch
        await get_data(start_date, end_date)


async def get_data(start_date, end_date):
    base_url = 'https://www.jfdaily.com/staticsg/data/journal/'
    async with httpx.AsyncClient(headers=headers) as client:
        for i in range((end_date - start_date).days):  # number of days to cover
            date_now = start_date + timedelta(days=i + 1)
            date_now_s = date_now.strftime('%Y-%m-%d')
            # Navigation JSON listing the pages of that day's issue
            url = base_url + date_now_s + '/navi.json?'
            try:
                response = await client.get(url)
                json_obj = response.json()
                print(json_obj)
                print('Issue index returned HTTP %d' % response.status_code)
                for page in json_obj['pages']:
                    banmianming = page['pname']    # page name
                    banmianhao = page['pnumber']   # page number
                    for article in page['articleList']:
                        title = article['title']
                        subtitle = article['subtitle']
                        h3title = article['introtitle']
                        article_id = article['id']  # avoid shadowing builtin id
                        jdate = article['jdate']
                        # Per-article JSON with author and full text
                        json_url = '%s%s/%s/article/%s.json?' % (
                            base_url, jdate, banmianhao, article_id)
                        response2 = await client.get(json_url)
                        json_obj2 = response2.json()
                        author = json_obj2['article']['author']
                        content = json_obj2['article']['content'].strip()
                        url2 = ('https://www.jfdaily.com/staticsg/res/html/journal/'
                                'detail.html?date=%s&id=%s&page=01'
                                % (jdate, article_id))
                        print(url2)
                        await shjfribao.insert_one({
                            'banmianhao': banmianhao,
                            'banmianming': banmianming,
                            'title': title,
                            'subtitle': subtitle,
                            'h3title': h3title,
                            'author': author,
                            'keywordlist': 'empty',
                            'detail_url': url2,
                            'release_time': date_now,
                            'insert_timestamp': datetime.today(),
                            'content': content
                        })
                        print("%s-%s-%s done" % (date_now_s, banmianhao, title))
                # Random pause between days to avoid hammering the site
                await asyncio.sleep(random.randint(5, 20))
                print("%s finished" % date_now_s)
            except Exception as exc:
                # Record a placeholder document so the failed date stays visible
                await shjfribao.insert_one({
                    'banmianhao': 'empty',
                    'banmianming': 'empty',
                    'title': 'empty',
                    'subtitle': 'empty',
                    'h3title': 'empty',
                    'author': 'empty',
                    'keywordlist': 'empty',
                    'detail_url': url,
                    'release_time': date_now,
                    'insert_timestamp': datetime.today(),
                    'content': 'empty'
                })
                print(exc)


if __name__ == "__main__":
    # Entry point: run the scraper
    asyncio.run(main())
    print("Scraping complete!")
```
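As a quick way to verify what the scraper wrote, a minimal sketch like the one below queries the newest documents in the `shjfribao` collection. It assumes the same local MongoDB instance and document schema used above; the function name `show_latest` and the five-document limit are arbitrary choices for illustration, not part of the original script.

```python
# -*- coding: utf-8 -*-
# Verification sketch (hypothetical helper): assumes the local MongoDB
# at mongodb://localhost:27017 and the dfdm_zxsribao/shjfribao collection
# populated by the scraper above.
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient


async def show_latest(limit=5):
    client = AsyncIOMotorClient('mongodb://localhost:27017')
    coll = client['dfdm_zxsribao']['shjfribao']
    # Newest articles first, mirroring the release_time sort used in main()
    cursor = coll.find().sort('release_time', -1).limit(limit)
    async for doc in cursor:
        print(doc['release_time'], doc['banmianhao'], doc['title'])


if __name__ == "__main__":
    asyncio.run(show_latest())
```

Sorting on `release_time` matches how `main()` decides whether an update is needed, so this also doubles as a sanity check that the incremental-update logic will pick up from the right date.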