# -*- coding: utf-8 -*-
# @Time : 2021/12/2 20:34
# @Author : Hongshuang Gu
# @File : Crawler for the Jiefang Daily (Shanghai) e-paper edition
# @Software : PyCharm

import asyncio
import random
from datetime import timedelta, datetime

import httpx
from motor.motor_asyncio import AsyncIOMotorClient

# Crawl window for a first full build: getData() walks it day by day,
# one navi.json index page per issue date.
start_date = datetime.strptime('2020-01-01', '%Y-%m-%d')
end_date = datetime.today()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
# MongoDB handles: database dfdm_zxsribao, collection shjfribao
# (one document per crawled article, plus placeholder docs for failed days).
client = AsyncIOMotorClient('mongodb://localhost:27017')
mydb = client['dfdm_zxsribao']
shjfribao = mydb['shjfribao']


async def main():
    """Entry point.

    If the collection already exists, crawl incrementally from the newest
    stored ``release_time`` up to today; otherwise build it from scratch
    over the full [start_date, end_date] window.
    """
    collist = await mydb.list_collection_names()
    if "shjfribao" in collist:  # collection exists -> incremental update
        print("上海解放集合存在,更新数据库")
        # Newest stored article: sort by release_time descending, take one.
        # (Equivalent to find().sort('release_time', -1).limit(1).)
        newest = await shjfribao.find_one(sort=[('release_time', -1)])
        db_time = newest['release_time']
        print('数据库截止时间%s' % db_time)
        input_time = datetime.today()
        if db_time < input_time:
            # Resume from the day after db_time (getData starts at +1 day).
            await getData(db_time, input_time)
        else:
            print('数据库无需更新')
    else:
        # First run: crawl the whole window and create the collection.
        await getData(start_date, end_date)


async def getData(start_date, end_date):
    """Crawl every issue from the day after *start_date* through *end_date*.

    For each date, fetch the issue's navi.json index, then every article's
    JSON, and insert one MongoDB document per article. Any per-day failure
    (missing issue, network error, malformed JSON) inserts a placeholder
    document with 'empty' fields so the gap remains visible, then the loop
    continues with the next date.
    """
    # Loop-invariant; hoisted out of the date loop.
    base_url = 'https://www.jfdaily.com/staticsg/data/journal/'
    # NOTE: named `http` so it does not shadow the module-level Motor `client`.
    async with httpx.AsyncClient(headers=headers) as http:
        for offset in range((end_date - start_date).days):  # one iteration per day
            date_now = start_date + timedelta(days=offset + 1)
            date_now_s = date_now.strftime('%Y-%m-%d')
            url = base_url + date_now_s + '/navi.json?'
            # Fetch the day's index page.
            try:
                response = await http.get(url)
                json_obj = response.json()
                print(json_obj)
                print('一级链接%d' % response.status_code)
                for page in json_obj['pages']:
                    banmianming = page['pname']    # page (banmian) name
                    banmianhao = page['pnumber']   # page (banmian) number
                    for art in page['articleList']:
                        title = art['title']
                        subtitle = art['subtitle']
                        h3title = art['introtitle']
                        article_id = art['id']     # renamed: don't shadow builtin id()
                        jdate = art['jdate']
                        json_url = (base_url + str(jdate) + '/' + banmianhao
                                    + '/article/' + str(article_id) + '.json?')
                        response2 = await http.get(json_url)
                        json_obj2 = response2.json()
                        author = json_obj2['article']['author']
                        content = json_obj2['article']['content'].strip()
                        # Human-readable detail page stored for reference.
                        url2 = ('https://www.jfdaily.com/staticsg/res/html/journal/detail.html?date='
                                + str(jdate) + '&id=' + str(article_id) + '&page=01')
                        print(url2)
                        await shjfribao.insert_one({
                            'banmianhao': banmianhao,
                            'banmianming': banmianming,
                            'title': title,
                            'subtitle': subtitle,
                            'h3title': h3title,
                            'author': author,
                            'keywordlist': 'empty',
                            'detail_url': url2,
                            'release_time': date_now,
                            'insert_timestamp': datetime.today(),
                            'content': content
                        })
                        print("%s-%s-%s已完成" % (date_now_s, banmianhao, title))
                # Random pause between days to avoid hammering the server.
                await asyncio.sleep(random.randint(5, 20))
                print("%s已经完成" % date_now_s)
            except Exception as err:
                # Best-effort: record the failed day with placeholder fields so
                # reruns can spot the gap, then move on to the next date.
                await shjfribao.insert_one({
                    'banmianhao': 'empty',
                    'banmianming': 'empty',
                    'title': 'empty',
                    'subtitle': 'empty',
                    'h3title': 'empty',
                    'author': 'empty',
                    'keywordlist': 'empty',
                    'detail_url': url,
                    'release_time': date_now,
                    'insert_timestamp': datetime.today(),
                    'content': 'empty'
                })
                print(err)


if __name__ == "__main__":
    asyncio.run(main())
    print("爬取完毕!")