commit 3bfb57b662817411fd47f8c529c1fb184cbba7c3 Author: 皓月归尘 Date: Sat Nov 9 17:00:30 2024 +0800 初始化仓库 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ba76c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.pyc +*.pyo +*.pyd + +config.json +# Virtual environment +venv/ +env/ +.venv/ +.venv3/ +.Python +*.sqlite3 + +# IDE-specific files +.idea/ +.vscode/ + +# Compiled source +*.com +*.class +*.dll +*.exe +*.o +*.so + +# Logs and databases +*.log +*.sql +*.sqlite + +# Output files +dist/ +build/ +*.egg-info/ +*.egg + +# OS-specific files +.DS_Store +Thumbs.db + +# Miscellaneous +*.bak +*.swp +*.tmp +*.tmp.* +*.~* + + +# Jupyter Notebook +.ipynb_checkpoints/ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..48a29f8 Binary files /dev/null and b/requirements.txt differ diff --git a/国内党媒/CrawlZhongguogaigebao.py b/国内党媒/CrawlZhongguogaigebao.py new file mode 100644 index 0000000..503f105 --- /dev/null +++ b/国内党媒/CrawlZhongguogaigebao.py @@ -0,0 +1,186 @@ +# _*_ coding : UTF-8 _*_ +# @Time : 2024/11/06 21:35 +# @UpdateTime : 2024/11/06 21:35 +# @Author : haochen zhong +# @File : CrawlZhongguogaigebao.py +# @Software : PyCharm +# @Comment : 本程序采集中国改革报版面数据 +import asyncio +import random +from datetime import datetime, timedelta + +from bs4 import BeautifulSoup +from httpx import AsyncClient +from motor.motor_asyncio import AsyncIOMotorClient + +start_date = datetime.strptime('2017-09', '%Y-%m') +"""中国改革报2017年9月份开始有数据""" +end_date = datetime.today() +"""截止到今天""" +headers = { + 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} + +# 链接数据库 +client = AsyncIOMotorClient('mongodb://localhost:27017') +db = client['buweijiguanbao'] +collection = db['zhongguogaigebao'] + + +async def main(): + collection_names = await db.list_collection_names() + # 判断数据表是否存在 + if "zhongguogaigebao" not in collection_names: + # 如果不存在,则从2017年9月开始爬取 + print("中国改革报数据表不存在,开始采集!") + await getData(start_date, end_date) + else: + # 如果存在,则从数据库中获取最后一条记录的日期 + last_record = await collection.find_one({}, sort=[('release_time', -1)]) + last_date_str = last_record['release_time'] + print("数据库截止时间:",last_date_str) + await getData(last_date_str, end_date) + + +async def getContent(soup: BeautifulSoup) -> str: + """ + :param soup: BeautifulSoup对象 + :return: 文章内容 + """ + content = "" + for p in soup.select("#ozoom p"): + para = p.text.strip() + if para: + content += para + content += '\n' + return content + + +async def getData(start_date: datetime, end_date: datetime): + """ + :param start_date: 开始日期 + :param end_date: 结束日期 + :return: None + """ + crawl_num = 0 + # 创建一个列表保存月份 + months = [] + # 从开始日期到结束日期,每个月份都添加到列表中 + current_date = start_date + while current_date <= end_date: + months.append(current_date) + # 增加一个月 + if current_date.month == 12: + current_date = current_date.replace(year=current_date.year + 1, month=1) + else: + current_date = current_date.replace(month=current_date.month + 1) + # 遍历月份列表 + for month in months: + # 构造URL + url = f'http://www.cfgw.net.cn/epaper/{month.strftime("%Y%m")}/period.xml' + """http://www.cfgw.net.cn/epaper/201709/period.xml""" + print(url) + async with AsyncClient(headers=headers, timeout=60) as client: + # 发送GET请求 + response = await client.get(url) + response.encoding = response.charset_encoding + print(f"一级连接状态:{response.status_code}") + if response.status_code == 200: + # 解析XML 
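+ # Structure note (inferred from the requests below, not from any site documentation):
+ # period.xml appears to list one <period id="NN"> element per published issue, where the
+ # id is the day of the month. The same id is reused both to build the front-page URL
+ # .../{YYYYMM}/{id}/node_01.htm and to derive release_time as month + timedelta(days=int(id) - 1),
+ # e.g. id "05" under 201709 -> 2017-09-05. BeautifulSoup's 'xml' parser used here requires lxml.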
+ soup = BeautifulSoup(response.text, 'xml') + for period in soup.find_all("period"): + try: + period_id = period.get("id") + url1 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/node_01.htm" + """http://www.cfgw.net.cn/epaper/201709/05/node_01.htm""" + print(url1) + response2 = await client.get(url1) + response2.encoding = response2.charset_encoding + print(f"二级连接状态:{response2.status_code}") + if response2.status_code == 200: + soup2 = BeautifulSoup(response2.text, 'lxml') + for item in soup2.select(".posRelative>a"): + url2 = f"http://www.cfgw.net.cn/epaper/{month.strftime('%Y%m')}/{period_id}/" + item.get( + "href") + """http://www.cfgw.net.cn/epaper/201709/05/node_01/node_01.htm""" + banmianming = item.text.split(":")[-1] + banmianhao = item.text.split(":")[0] + print(url2) + response3 = await client.get(url2) + response3.encoding = response3.charset_encoding + print(f"三级连接状态:{response3.status_code}") + if response3.status_code == 200: + soup3 = BeautifulSoup(response3.text, 'lxml') + for item2 in soup3.select("#articlelist > .clearfix > a"): + url3 = f"http://www.cfgw.net.cn/epaper/" + item2.get("href")[6:] + if await collection.find_one({"detail_url": url3}, {"_id": False}): + continue + title = item2.text.strip() + print(url3) + response4 = await client.get(url3) + response4.encoding = response4.charset_encoding + print(f"四级连接状态:{response4.status_code}") + if response4.status_code == 200: + soup4 = BeautifulSoup(response4.text, 'lxml') + try: + title = soup4.select("#Title")[0].text.strip() + except: + title = title + try: + subtitle = soup4.select("#SubTitle")[0].text.strip() + except: + subtitle = "" + try: + preTitle = soup4.select("#PreTitle")[0].text.strip() + except: + preTitle = "" + try: + author = soup4.find("author").text.strip() + except: + author = "" + try: + keyword = soup4.find("keyword").text.strip() + except: + keyword = "" + content = await getContent(soup4) + await collection.insert_one({ + "title": title, + "subtitle": subtitle, + "preTitle": preTitle, + "author": author, + "banmianming": banmianming, + "banmianhao": banmianhao, + 'keywordlist': keyword, + 'detail_url': url3, + 'release_time': month + timedelta(days=int(period_id)-1), + 'insert_timestamp': datetime.today(), + 'content': content + }) + crawl_num += 1 + print( + f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print( + f"中国改革报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print( + f"中国改革报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + except Exception as e: + await collection.insert_one( + {'banmianhao': 'empty', + 'banmianming': 'empty', + 'preTitle': 'empty', + 'title': 'empty', + 'subtitle': 'empty', + 'author': 'empty', + 'keywordlist': 'empty', + 'detail_url': url, + 'release_time': month + timedelta(days=int(period_id)), + 'insert_timestamp': datetime.today(), + 'content': 'empty'} + ) + print(e) + print(f"中国改革报采集完毕,共采集{crawl_num}条数据!") + + +asyncio.run(main()) diff --git a/国内党媒/CrawlZhongguojiaoyubao.py b/国内党媒/CrawlZhongguojiaoyubao.py new file mode 100644 index 0000000..a25500d --- /dev/null +++ b/国内党媒/CrawlZhongguojiaoyubao.py @@ -0,0 +1,282 @@ +# _*_ coding : UTF-8 _*_ +# @Time : 2024/11/08 21:42 +# @UpdateTime : 2024/11/08 21:42 +# @Author : haochen zhong +# @File : CrawlZhongguojiaoyubao.py +# @Software : PyCharm +# @Comment : 
本程序采集中国教育报数据 + +import asyncio +import random +from datetime import datetime, timedelta + +from bs4 import BeautifulSoup +from httpx import AsyncClient +from motor.motor_asyncio import AsyncIOMotorClient + +start_date = datetime.strptime('2022-01', '%Y-%m') +"""中国教育报2022年1月份开始有数据""" +end_date = datetime.today() +"""截止到今天""" +headers = { + 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} + +# 链接数据库 +client = AsyncIOMotorClient('mongodb://localhost:27017') +db = client['buweijiguanbao'] +collection = db['zhongguojiaoyubao'] + + +async def main(): + collection_names = await db.list_collection_names() + # 判断数据表是否存在 + if "zhongguojiaoyubao" not in collection_names: + # 如果不存在,则从2017年9月开始爬取 + print("中国教育报数据表不存在,开始采集!") + await getData(start_date, end_date) + else: + # 如果存在,则从数据库中获取最后一条记录的日期 + last_record = await collection.find_one({}, sort=[('release_time', -1)]) + last_date_str = last_record['release_time'] + print("数据库截止时间:", last_date_str) + await getData(last_date_str, end_date) + + +async def getContent(soup: BeautifulSoup) -> str: + """ + :param soup: BeautifulSoup对象 + :return: 文章内容 + """ + content = "" + for p in soup.select(".content_tt p"): + para = p.text.strip() + if para: + content += para + content += '\n' + return content + + +async def getData(start_date: datetime, end_date: datetime): + """ + :param start_date: 开始日期 + :param end_date: 结束日期 + :return: None + """ + crawl_num = 0 + # 创建一个列表保存月份 + months = [] + # 从开始日期到结束日期,每个月份都添加到列表中 + current_date = start_date + while current_date <= end_date: + months.append(current_date) + # 增加一个月 + if current_date.month == 12: + current_date = current_date.replace(year=current_date.year + 1, month=1) + else: + current_date = current_date.replace(month=current_date.month + 1) + # 遍历月份列表 + for month in months: + # 构造URL + url = f'http://paper.jyb.cn/zgjyb/html/{month.strftime("%Y-%m")}/period.xml' + """http://paper.jyb.cn/zgjyb/html/2023-01/period.xml""" + print(url) + async with AsyncClient(headers=headers, timeout=60) as client: + response = await client.get(url) + response.encoding = response.charset_encoding + print(f"一级连接状态:{response.status_code}") + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'xml') + for period in soup.select("period"): + period_name = datetime.strptime(period.find("period_name").text.strip(), "%Y-%m-%d") + front_page = period.find("front_page").text.strip() + try: + url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}" + print(url1) + response2 = await client.get(url1) + response2.encoding = response2.charset_encoding + print(f"二级连接状态:{response2.status_code}") + if response2.status_code == 200: + soup2 = BeautifulSoup(response2.text, 'lxml') + for item in soup2.select(".right_title-name a"): + banmianming = item.text.split(":")[-1] + banmianhao = item.text.split(":")[0] + url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get( + "href").replace("./","").strip() + print(url2) + response3 = await client.get(url2) + response3.encoding = response3.charset_encoding + print(f"三级连接状态:{response3.status_code}") + if response3.status_code == 200: + soup3 = BeautifulSoup(response3.text, 'lxml') + for item2 in soup3.select("#titleList1 a"): + url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get( + "href") + if await collection.find_one({"detail_url": url3}, {"_id": False}): + continue + title = 
item2.text.strip() + print(url3) + response4 = await client.get(url3) + response4.encoding = response4.charset_encoding + print(f"四级连接状态:{response4.status_code}") + if response4.status_code == 200: + soup4 = BeautifulSoup(response4.text, 'lxml') + try: + title = soup4.select_one(".title1").text.strip() + except: + title = title + try: + subTitle = soup4.select(".title2")[0].text.strip() + except: + subTitle = "" + try: + author = soup4.select_one(".title3").text.strip() + except: + author = "" + try: + perTitle = soup4.select(".title2")[-1].text.strip() + except: + perTitle = "" + try: + keywordlist = soup4.find("founder-keyword").text.strip() + except: + keywordlist = "" + content = await getContent(soup4) + await collection.insert_one({ + "title": title, + "subtitle": subTitle, + "preTitle": perTitle, + "author": author, + "banmianming": banmianming, + "banmianhao": banmianhao, + 'keywordlist': keywordlist, + 'detail_url': url3, + 'release_time': period_name, + 'insert_timestamp': datetime.today(), + 'content': content + }) + crawl_num += 1 + print( + f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print( + f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + except Exception as e: + print(e) + await collection.insert_one( + {'banmianhao': 'empty', + 'banmianming': 'empty', + 'preTitle': 'empty', + 'title': 'empty', + 'subtitle': 'empty', + 'author': 'empty', + 'keywordlist': 'empty', + 'detail_url': url, + 'release_time': period_name, + 'insert_timestamp': datetime.today(), + 'content': 'empty'} + ) + + else: + url = f"http://paper.jyb.cn/zgjyb/html/{month.strftime('%Y-%m')}/navi.xml" + response = await client.get(url) + response.encoding = response.charset_encoding + print(f"一级连接状态:{response.status_code}") + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'xml') + for period in soup.select("calendar"): + period_name = datetime.strptime(period.find("date").text.strip(), "%Y-%m-%d") + front_page = period.find("url").text.strip()[6:] + try: + url1 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/{front_page}" + print(url1) + response2 = await client.get(url1) + response2.encoding = response2.charset_encoding + print(f"二级连接状态:{response2.status_code}") + if response2.status_code == 200: + soup2 = BeautifulSoup(response2.text, 'lxml') + for item in soup2.select(".right_title-name a"): + banmianming = item.text.split(":")[-1] + banmianhao = item.text.split(":")[0] + url2 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item.get( + "href").replace("./","").strip() + print(url2) + response3 = await client.get(url2) + response3.encoding = response3.charset_encoding + print(f"三级连接状态:{response3.status_code}") + if response3.status_code == 200: + soup3 = BeautifulSoup(response3.text, 'lxml') + for item2 in soup3.select("#titleList1 a"): + url3 = f"http://paper.jyb.cn/zgjyb/html/{period_name.strftime('%Y-%m/%d')}/" + item2.get( + "href") + if await collection.find_one({"detail_url": url3}, {"_id": False}): + continue + title = item2.text.strip() + print(url3) + response4 = await client.get(url3) + response4.encoding = response4.charset_encoding + print(f"四级连接状态:{response4.status_code}") + if response4.status_code == 200: + soup4 = 
BeautifulSoup(response4.text, 'lxml') + try: + title = soup4.select(".article-title")[0].text.strip() + except: + title = title + try: + subTitle = soup4.select(".article-subtitle")[0].text.strip() + except: + subTitle = "" + try: + author = soup4.select(".article-author")[0].text.strip() + except: + author = "" + try: + perTitle = soup4.select(".article-pretitle")[0].text.strip() + except: + perTitle = "" + try: + keywordlist = soup4.find("founder-keyword").text.strip() + except: + keywordlist = "" + content = await getContent(soup4) + await collection.insert_one({ + "title": title, + "subtitle": subTitle, + "preTitle": perTitle, + "author": author, + "banmianming": banmianming, + "banmianhao": banmianhao, + 'keywordlist': keywordlist, + 'detail_url': url3, + 'release_time': period_name, + 'insert_timestamp': datetime.today(), + 'content': content + }) + crawl_num += 1 + print( + f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}---{title}---采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print( + f"中国教育报---{period_name.strftime('%Y-%m-%d')}----{banmianming}---{banmianhao}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print(f"中国教育报---{period_name.strftime('%Y-%m-%d')}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + except Exception as e: + await collection.insert_one( + {'banmianhao': 'empty', + 'banmianming': 'empty', + 'preTitle': 'empty', + 'title': 'empty', + 'subtitle': 'empty', + 'author': 'empty', + 'keywordlist': 'empty', + 'detail_url': url, + 'release_time':period_name , + 'insert_timestamp': datetime.today(), + 'content': 'empty'} + ) + print(f"中国教育报采集完毕,共采集{crawl_num}条数据!") + +asyncio.run(main()) diff --git a/国内党媒/CrawlZhongguojingjidaobao.py b/国内党媒/CrawlZhongguojingjidaobao.py new file mode 100644 index 0000000..39c9f2e --- /dev/null +++ b/国内党媒/CrawlZhongguojingjidaobao.py @@ -0,0 +1,185 @@ +# _*_ coding : UTF-8 _*_ +# @Time : 2024/11/08 00:07 +# @UpdateTime : 2024/11/08 00:07 +# @Author : haochen zhong +# @File : CrawlZhongguojingjidaobao.py +# @Software : PyCharm +# @Comment : 本程序采集中国经济导报数据 +import asyncio +import random +from datetime import datetime, timedelta + +from bs4 import BeautifulSoup +from httpx import AsyncClient +from motor.motor_asyncio import AsyncIOMotorClient + +start_date = datetime.strptime('2012-09', '%Y-%m') +"""中国经济导报2012年9月份开始有数据""" +end_date = datetime.today() +"""截止到今天""" +headers = { + 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} + +# 链接数据库 +client = AsyncIOMotorClient('mongodb://localhost:27017') +db = client['buweijiguanbao'] +collection = db['zhongguojingjidaobao'] + + +async def main(): + collection_names = await db.list_collection_names() + # 判断数据表是否存在 + if "zhongguojingjidaobao" not in collection_names: + # 如果不存在,则从2017年9月开始爬取 + print("中国经济导报数据表不存在,开始采集!") + await getData(start_date, end_date) + else: + # 如果存在,则从数据库中获取最后一条记录的日期 + last_record = await collection.find_one({}, sort=[('release_time', -1)]) + last_date_str = last_record['release_time'] + print("数据库截止时间:", last_date_str) + await getData(last_date_str, end_date) + + +async def getContent(soup: BeautifulSoup) -> str: + """ + :param soup: BeautifulSoup对象 + :return: 文章内容 + """ + content = "" + for p in soup.select("#pgcontent"): + para = p.text.strip() + if para: + content += para + content += '\n' + return content + + +async def getData(start_date: datetime, end_date: datetime): + """ + :param start_date: 开始日期 + 
:param end_date: 结束日期 + :return: None + """ + crawl_num = 0 + # 创建一个列表保存月份 + months = [] + # 从开始日期到结束日期,每个月份都添加到列表中 + current_date = start_date + while current_date <= end_date: + months.append(current_date) + # 增加一个月 + if current_date.month == 12: + current_date = current_date.replace(year=current_date.year + 1, month=1) + else: + current_date = current_date.replace(month=current_date.month + 1) + # 遍历月份列表 + for month in months: + # 构造URL + url = f'http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime("%Y/%m")}/date.txt' + """http://www.ceh.com.cn/epaper/uniflows/html/2012/09/date.txt""" + print(url) + async with AsyncClient(headers=headers, timeout=60) as client: + # 发送GET请求 + response = await client.get(url) + response.encoding = "gb2312" + print(f"一级连接状态:{response.status_code}") + if response.status_code == 200: + # 解析XML + soup = response.text.split("|") + for period in soup: + period_id, element = period.split(",") + if len(element) < 5: + continue + try: + url1 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/boardurl.htm" + """http://www.ceh.com.cn/epaper/uniflows/html/2012/09/01/boardurl.htm""" + print(url1) + response2 = await client.get(url1) + response2.encoding = "gb2312" + print(f"二级连接状态:{response2.status_code}") + if response2.status_code == 200: + soup2 = BeautifulSoup(response2.text, 'lxml') + for item in soup2.select(".board_link td>a"): + url2 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + item.get( + "href") + """http://www.ceh.com.cn/epaper/uniflows/html/2024/11/07/01/default.htm""" + banmianming = item.text.split(":")[-1].strip() + banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip() + print(url2) + response3 = await client.get(url2) + response3.encoding = "gb2312" + print(f"三级连接状态:{response3.status_code}") + if response3.status_code == 200: + soup3 = BeautifulSoup(response3.text, 'lxml') + for item2 in soup3.select("#mp_32"): + url3 = f"http://www.ceh.com.cn/epaper/uniflows/html/{month.strftime('%Y/%m')}/{period_id}/" + \ + item.get("href").split("/")[0] + "/" + item2.get("href") + if await collection.find_one({"detail_url": url3}, {"_id": False}): + continue + title = item2.text.strip() + print(url3) + response4 = await client.get(url3) + response4.encoding = "gb2312" + print(f"四级连接状态:{response4.status_code}") + if response4.status_code == 200: + soup4 = BeautifulSoup(response4.text, 'lxml') + try: + title = soup4.select(".content_title")[0].text.strip() + except: + title = title + try: + subtitle = soup4.select(".subtitle")[0].text.strip() + except: + subtitle = "" + try: + preTitle = soup4.select(".yinti_title")[0].text.strip() + except: + preTitle = "" + try: + author = soup4.select(".others")[0].text.strip() + except: + author = "" + content = await getContent(soup4) + await collection.insert_one({ + "title": title, + "subtitle": subtitle, + "preTitle": preTitle, + "author": author, + "banmianming": banmianming, + "banmianhao": banmianhao, + 'keywordlist': 'empty', + 'detail_url': url3, + 'release_time': month + timedelta(days=int(period_id) - 1), + 'insert_timestamp': datetime.today(), + 'content': content + }) + crawl_num += 1 + print( + f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}---{title}---采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print( + f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}---{banmianming}---{banmianhao}----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print( + 
f"中国经济导报---{month.strftime('%Y-%m')}-{period_id}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + except Exception as e: + await collection.insert_one( + {'banmianhao': 'empty', + 'banmianming': 'empty', + 'preTitle': 'empty', + 'title': 'empty', + 'subtitle': 'empty', + 'author': 'empty', + 'keywordlist': 'empty', + 'detail_url': url, + 'release_time': month + timedelta(days=int(period_id)), + 'insert_timestamp': datetime.today(), + 'content': 'empty'} + ) + print(e) + print(f"中国经济导报采集完毕,共采集{crawl_num}条数据!") + + +asyncio.run(main()) diff --git a/国内党媒/v1.0各部委的机关报纸和专业报纸名录信息20241104.xlsx b/国内党媒/v1.0各部委的机关报纸和专业报纸名录信息20241104.xlsx new file mode 100644 index 0000000..6e76f91 Binary files /dev/null and b/国内党媒/v1.0各部委的机关报纸和专业报纸名录信息20241104.xlsx differ diff --git a/地方政策/报刊/CrawlAnhui.py b/地方政策/报刊/CrawlAnhui.py new file mode 100644 index 0000000..e89f760 --- /dev/null +++ b/地方政策/报刊/CrawlAnhui.py @@ -0,0 +1,145 @@ +# _*_ coding : UTF-8 _*_ +# @Time : 2024/11/08 20:29 +# @UpdateTime : 2024/11/08 20:29 +# @Author : haochen zhong +# @File : CrawlAnhui.py +# @Software : PyCharm +# @Comment : 本程序采集安徽日报数字报数据 + + +import asyncio +import random +from datetime import datetime, timedelta + +from bs4 import BeautifulSoup +from httpx import AsyncClient +from motor.motor_asyncio import AsyncIOMotorClient + +start_date = datetime.strptime('2017-09-29', '%Y-%m-%d') +"""安徽日报报2018年09月29日开始有数据""" +end_date = datetime.today() +"""截止到今天""" +headers = { + 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} + +# 链接数据库 +client = AsyncIOMotorClient('mongodb://localhost:27017') +db = client['dfdm_sjribao'] +collection = db['anhuiribao'] + + +async def main(): + collection_names = await db.list_collection_names() + # 判断数据表是否存在 + if "anhuiribao" not in collection_names: + # 如果不存在,则从2017年9月开始爬取 + print("安徽日报报数据表不存在,开始采集!") + await getData(start_date, end_date) + else: + # 如果存在,则从数据库中获取最后一条记录的日期 + last_record = await collection.find_one({}, sort=[('release_time', -1)]) + last_date_str = last_record['release_time'] + print("数据库截止时间:", last_date_str) + await getData(last_date_str, end_date) + + +async def getContent(soup: BeautifulSoup) -> str: + """ + :param soup: BeautifulSoup对象 + :return: 文章内容 + """ + content = "" + for p in soup.select(".content p"): + para = p.text.strip() + if para: + content += para + content += '\n' + return content + + +async def getData(start_date: datetime, end_date: datetime): + crawl_num = 0 + for i in range((end_date - start_date).days): + date_now = start_date + timedelta(days=i + 1) + date_now_s = date_now.strftime('%Y%m/%d') + base_url = "https://szb.ahnews.com.cn/ahrb/layout/" + date_now_s + '/' + url = base_url + 'node_01.html' + """https://szb.ahnews.com.cn/ahrb/layout/201811/01/node_01.html""" + try: + async with AsyncClient(headers=headers, timeout=60) as client: + print(url) + response = await client.get(url) + response.encoding = response.charset_encoding + print(f"一级连接状态:{response.status_code}") + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'lxml') + for item in soup.select(".Chunkiconlist p > a:nth-child(1)"): + banmianming = item.text.split(":")[-1].strip() + banmianhao = item.text.split(":")[0].replace(" ", "").replace(" ", "").strip() + url1 = base_url + item.get("href") + print(url1) + response2= await client.get(url1) + response2.encoding = response2.charset_encoding + print(f"二级连接状态:{response2.status_code}") + if response2.status_code == 
200: + soup2 = BeautifulSoup(response2.text, 'lxml') + for item2 in soup2.select(".newslist a"): + url2 = "https://szb.ahnews.com.cn/ahrb/" + item2.get("href")[9:] + """https://szb.ahnews.com.cn/ahrb/content/201709/29/c17310.html""" + if await collection.find_one({"detail_url": url2}, {"_id": False}): + continue + title = item2.text.strip() + print(url2) + response3 = await client.get(url2) + response3.encoding = response3.charset_encoding + print(f"三级连接状态:{response3.status_code}") + if response3.status_code == 200: + soup3 = BeautifulSoup(response3.text, 'lxml') + content = await getContent(soup3) + try: + title = soup3.select(".newsdetatit h3")[0].text.strip() + except: + title = title + try: + subTitle= soup3.select(".newsdetatext p")[0].text.strip() + except: + subTitle = "" + await collection.insert_one({ + "title": title, + "subtitle": subTitle, + "preTitle": "", + "author": "", + "banmianming": banmianming, + "banmianhao": banmianhao, + 'keywordlist': 'empty', + 'detail_url': url2, + 'release_time': date_now, + 'insert_timestamp': datetime.today(), + 'content': content + }) + crawl_num += 1 + print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}---{title}---采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print(f"安徽日报---{date_now_s}---{banmianming}---{banmianhao}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + print(f"安徽日报---{date_now_s}-----采集完成!") + await asyncio.sleep(random.randint(5, 15)) + except Exception as e: + print(e) + await collection.insert_one( + {'banmianhao': 'empty', + 'banmianming': 'empty', + 'preTitle': 'empty', + 'title': 'empty', + 'subtitle': 'empty', + 'author': 'empty', + 'keywordlist': 'empty', + 'detail_url': url, + 'release_time': date_now, + 'insert_timestamp': datetime.today(), + 'content': 'empty'} + ) + print(f"安徽日报采集完毕,共采集{crawl_num}条数据!") + +asyncio.run(main()) + diff --git a/地方政策/报刊/CrawlGuizhou.py b/地方政策/报刊/CrawlGuizhou.py new file mode 100644 index 0000000..ce44478 --- /dev/null +++ b/地方政策/报刊/CrawlGuizhou.py @@ -0,0 +1,140 @@ +# _*_ coding : UTF-8 _*_ +# @Time : 2022/12/27 14:15 +# @UpdateTime : 2023/11/08 16:30 +# @Author : Haochen Zhong +# @File : CrawlGuizhou.py +# @Software : PyCharm +# @Comment : 本程序采集贵州日报数字报板面数据 +import random +import time +from datetime import timedelta, datetime + +import pymongo +import requests +from bs4 import BeautifulSoup + +# 数据库起止时间 +start_date = datetime.strptime('2021-12-31', '%Y-%m-%d') +"""贵州日报数字报2022-01-01开始有数据纪录""" +end_date = datetime.today() +headers = { + 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} +# 创建数据库 +client = pymongo.MongoClient('localhost', 27017) +mydb = client.dfdm_sjribao +guizhouribao = mydb.guizhouribao +# 设置随机时间 +sleeptime = random.randint(2, 15) + + +def main(): + # 判断数据库是否存在 + collist = mydb.list_collection_names() + if "guizhouribao" in collist: # 检测集合是否存在 + print("贵州集合存在,更新数据库") + # 数据库最新一条内容的时间 + db_time = guizhouribao.find_one(sort=[('release_time', -1)])[ + 'release_time'] # 或者find().sort('_id', -1).limit(1) + print('数据库截止时间%s' % db_time) + # 输入更新数据库时间 + input_time = datetime.today() + if db_time < input_time: + getData(db_time, input_time) + else: + print('数据库无需更新') + else: + # 爬取网页并建立数据库 + print("数据库不存在,建立数据库!") + getData(start_date, end_date) + + +def get_content(soup3): + content = "" + for p in soup3.select("#ozoom p"): + para = p.text.strip() + if para: + content += para + content += '\n' + return content + + +def getData(start_date, 
end_date): + crawl_num = 0 + for i in range((end_date - start_date).days): + date_now = start_date + timedelta(days=i + 1) + date_now_s = date_now.strftime('%Y%m/%d') + base_url = "http://szb.gzrbs.com.cn/pc/layout/" + date_now_s + "/" + url = base_url + "node_01.html" + # http://szb.gzrbs.com.cn/pc/layout/202201/01/node_01.html + try: + response = requests.get(url=url, headers=headers, timeout=(30, 45)) + response.encoding = response.apparent_encoding + print(f"一级连接状态:{response.status_code}") + if response.status_code == 200: + soup = BeautifulSoup(response.text, "lxml") + for item in soup.select(".btn-block"): + banmianming = item.text.split(":")[-1] + banmianhao = item.text.split(":")[0] + url1 = base_url + item.get("href") + response2 = requests.get(url=url1, headers=headers, timeout=(30, 45)) + response2.encoding = response2.apparent_encoding + print(f"二级连接状态:{response2.status_code}") + if response2.status_code == 200: + soup2 = BeautifulSoup(response2.text, "lxml") + for item2 in soup2.select(".resultList a"): + title = item2.text.strip() + url2 = "http://szb.gzrbs.com.cn/pc/" + item2.get("href")[9:] + # http://szb.gzrbs.com.cn/pc/cont/202201/02/content_42202.html + response3 = requests.get(url=url2, headers=headers, timeout=(30, 45)) + response3.encoding = response3.apparent_encoding + print(f"三级连接状态:{response3.status_code}") + if response3.status_code == 200: + soup3 = BeautifulSoup(response3.text, "lxml") + try: + title = soup3.select("#Title")[0].text.strip() + except: + title = title + try: + subtitle = soup3.select("#SubTitle")[0].text.strip() + except: + subtitle = "" + try: + preTitle = soup3.select("#PreTitle")[0].text.strip() + except: + preTitle = "" + content = get_content(soup3) + guizhouribao.insert_one({'banmianhao': banmianhao, + 'banmianming': banmianming, + 'preTitle': preTitle, + 'title': title, + 'subtitle': subtitle, + 'author': '', + 'keywordlist': 'empty', + 'detail_url': url2, + 'release_time': date_now, + 'insert_timestamp': datetime.today(), + 'content': content}) + crawl_num += 1 + print(f"贵州日报-{date_now_s}-{banmianming}-{title}-已完成") + time.sleep(sleeptime) + print(f"贵州日报-{date_now_s}-{banmianming}-已完成") + time.sleep(sleeptime) + print(f"贵州日报-{date_now_s}-已完成") + except Exception as result: + guizhouribao.insert_one({'banmianhao': 'empty', + 'banmianming': 'empty', + 'preTitle': 'empty', + 'title': 'empty', + 'subtitle': 'empty', + 'author': 'empty', + 'keywordlist': 'empty', + 'detail_url': url, + 'release_time': date_now, + 'insert_timestamp': datetime.today(), + 'content': 'empty'}) + print(result) + print(f"贵州日报采集完毕,共采集{crawl_num}条数据!") + + +if __name__ == '__main__': + main() diff --git a/地方政策/报刊/CrawlHainan.py b/地方政策/报刊/CrawlHainan.py new file mode 100644 index 0000000..ba50485 --- /dev/null +++ b/地方政策/报刊/CrawlHainan.py @@ -0,0 +1,162 @@ +# _*_ coding : UTF-8 _*_ +# @Time : 2024-01-17 14:24:59 +# @Author : haochen zhong +# @File : CrawlHainan.py +# @Software : PyCharm +# @Comment : +import re +from bs4 import BeautifulSoup +import requests +from datetime import timedelta, datetime +import time +import pymongo +import random + +# 数据库起止时间 +start_date = datetime.strptime('2008-02-29', '%Y-%m-%d') +end_date = datetime.today() +headers = { + 'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.42'} +# 创建数据库 +client = pymongo.MongoClient('localhost', 27017) +mydb = client.dfdm_sjribao +hainanribao = mydb.hainanribao + + +def main(): + # 判断数据库是否存在 + collist 
= mydb.list_collection_names() + if "hainanribao" in collist: # 检测集合是否存在 + print("海南日报集合存在,更新数据库") + # 数据库最新一条内容的时间 + db_time = hainanribao.find_one(sort=[('release_time', -1)])['release_time'] + print(f'数据库截止时间{db_time}') + # 输入更新数据库时间 + input_time = datetime.today() + if db_time < input_time: + getData(db_time, input_time) + else: + print('数据库无需更新') + else: + print("数据库不存在,建立数据库") + # 爬取网页并建立数据库 + getData(start_date, end_date) + + +# 解析网页正文 +def parse_html_text(soup): + """ + :param html: html字符串 + :return: 正文 string + """ + content = '' + if soup.select('#ozoom'): + content = soup.select('#ozoom')[0].text.strip() + return content + + +def parse_subtitle(soup): + item = soup.select('.font02') + if re.findall(r'article-subtitle>-->(.*?)(.*?)(.*?)(.*?)(.*?)(.*?)