# _*_ coding : UTF-8 _*_
# @Time : 2023/8/28 0:50
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : Exports the regulations of the People's Government of the
#            Xinjiang Uygur Autonomous Region from MongoDB to .docx files
#            plus one summary CSV.
import datetime
import os
import time

import pandas as pd
import pymongo
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm

# Client for the MongoDB server on localhost:27017.
client = pymongo.MongoClient('localhost', 27017)
# Database that holds the policy documents.
mydb = client.sjzf_zcwj
# Collection that holds the documents exported by this script.
xinjiangzcwj = mydb.xinjiangzcwj
# Export root directory; when empty, saveFile() asks for it on stdin.
savePath = ""

# Characters forbidden in Windows/Linux file names, mapped to HTML entities.
# NOTE(review): the entity spellings are a reconstruction (named where a
# classic HTML name exists, numeric otherwise) -- confirm they match the
# names of any files exported earlier.
_INVALID_CHAR_ENTITIES = {
    '<': '&lt;',
    '>': '&gt;',
    ':': '&#58;',
    '"': '&quot;',
    '/': '&#47;',
    '\\': '&#92;',
    '|': '&#124;',
    '?': '&#63;',
    '*': '&#42;',
}
_INVALID_CHAR_TABLE = str.maketrans(_INVALID_CHAR_ENTITIES)

# Header row of the summary CSV, applied positionally to the DataFrame
# built from the MongoDB documents.
_CSV_HEADERS = [
    "数据库ID", "文章归类", "文种(自治区)", "公文种类", "文章标题",
    "文章副标题", "主题分类", "索引号", "文号", "发文机关(自治区)",
    "成文日期时间戳", "发布日期时间戳", "有效性", "作者", "年份",
    "附件链接", "文章链接", "来源", "正文内容",
]


def replace_invalid_chars(text: str) -> str:
    """Replace characters that are illegal in file names with HTML entities.

    Args:
        text: Candidate file name (without directory part).

    Returns:
        ``text`` with each of ``< > : " / \\ | ? *`` replaced by its HTML
        entity, so the result is usable as a file name on Windows and Linux.
    """
    # One pass over the string; entities contain none of the forbidden
    # characters, so nothing is replaced twice.
    return text.translate(_INVALID_CHAR_TABLE)


def analysisTime(timestamp: int) -> str:
    """Format a Unix timestamp (seconds) as ``YYYY-MM-DD``.

    Args:
        timestamp: Seconds relative to 1970-01-01; may be negative.

    Returns:
        ``"未知"`` when ``timestamp`` is 0, otherwise the formatted date.
    """
    if timestamp == 0:
        return "未知"
    if timestamp < 0:
        # Dates before 1970 are computed by hand as an offset from the epoch
        # instead of going through fromtimestamp().
        # NOTE(review): presumably because fromtimestamp() rejects negative
        # values on some platforms -- confirm.
        delta = datetime.timedelta(seconds=abs(timestamp))
        date = datetime.datetime(1970, 1, 1) - delta
    else:
        # NOTE(review): fromtimestamp() converts to local time, whereas the
        # branch above counts from a naive 1970-01-01; the two can disagree
        # by the local UTC offset. Left unchanged -- confirm which is wanted.
        date = datetime.datetime.fromtimestamp(timestamp)
    return date.strftime('%Y-%m-%d')


def _add_styled_paragraph(doc, text, size, east_asia_font, bold=None):
    """Append a one-run paragraph to ``doc`` and return the paragraph.

    The run uses Times New Roman for Western text, ``east_asia_font`` for
    East Asian text, ``size`` points, and ``bold`` when it is not None.
    """
    paragraph = doc.add_paragraph()
    run = paragraph.add_run(text)
    if bold is not None:
        run.bold = bold
    run.font.size = Pt(size)
    run.font.name = 'Times New Roman'
    # font.name only sets the Western font; the East Asian font has to be
    # written to the run properties directly.
    run.element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)
    return paragraph


def _build_document(data):
    """Build the .docx document for one MongoDB record and return it."""
    doc = Document()

    # Metadata header lines (12 pt, KaiTi).
    pubtime = analysisTime(data['pubtime'])
    ptime = analysisTime(data['ptime'])
    header_lines = (
        f"索引号:{data['index']}\t\t有效性:{data['effectiveness']}",
        f"发文机关:{data['puborg']}\t\t发文字号:{data['pcode']}",
        f"标题:{data['title']}",
        f"成文日期:{ptime}\t\t发布日期:{pubtime}",
        f"文章链接:{data['url']}",
    )
    for line in header_lines:
        _add_styled_paragraph(doc, line, 12, "楷体")

    # Centered bold main title.
    title = _add_styled_paragraph(doc, data["title"], 22, "华文中宋", bold=True)
    title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # Body: one paragraph per line of content, indented with a leading tab.
    # NOTE(review): the previous code also assigned
    # ``run.first_line_indent = Cm(0.74)``; that is not a Run property in
    # python-docx and had no effect, so it was dropped. If a real first-line
    # indent is wanted, set ``paragraph.paragraph_format.first_line_indent``
    # and remove the tab.
    for section in data["content"].split("\n"):
        _add_styled_paragraph(doc, "\t" + section, 16, "仿宋")

    # Optional attachment link.
    if data["manuscriptRelatedRes"]:
        _add_styled_paragraph(
            doc, f"附件链接:{data['manuscriptRelatedRes']}", 12, "楷体")
    return doc


def _export_file_name(title):
    """Return the .docx file name for ``title``.

    Titles longer than 45 characters are cut down to their last 30
    characters before the forbidden characters are replaced.
    """
    if len(title) > 45:
        title = title[-30:]
    return f"{replace_invalid_chars(title)}.docx"


def saveFile() -> None:
    """Export every matching record to a .docx file and write a summary CSV.

    Reads the records from the ``xinjiangzcwj`` collection, writes one
    document per record to ``<savePath>/<typeOneName>/<typeSecondName>/``,
    and writes all records to ``<savePath>/数据统计表.csv``. When the
    module-level ``savePath`` is empty it is read from stdin and stored back
    into the global. A record that fails to export is reported and skipped.
    """
    global savePath
    num = 0
    startTime = time.time()
    # Filter on article fields; empty values are dropped below, so by
    # default nothing is filtered.
    query = {
        'typeOneName': "",
        'typeSecondName': "",
        'articleType': "",
        "title": "",
        "subTitle": "",
        "childtype": "",
        "index": "",
        "pcode": "",
        "puborg": "",
        "ptime": "",
        "pubtime": "",
        "effectiveness": "",
        "author": "",
        "year": "",
        "manuscriptRelatedRes": "",
        "url": "",
        "source": "",
        "content": ""
    }
    query = {k: v for k, v in query.items() if v}
    dataList = list(xinjiangzcwj.find(query))
    if not savePath:
        savePath = input("请输入数据存放路径:")
    if savePath:
        # Make sure the CSV can be written even when no document is exported.
        os.makedirs(savePath, exist_ok=True)
    totalPath = os.path.join(savePath, "数据统计表.csv")

    for data in dataList:
        try:
            typeOneName = data["typeOneName"]  # first-level category
            typeSecondName = data["typeSecondName"]  # second-level category
            output_directory = os.path.join(savePath, typeOneName, typeSecondName)
            os.makedirs(output_directory, exist_ok=True)
            doc = _build_document(data)
            filePath = os.path.join(output_directory, _export_file_name(data["title"]))
            doc.save(filePath)
            num += 1
            print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功!")
        except Exception as e:
            # Best effort: one bad record must not stop the export, but say
            # which record failed so it can be fixed.
            print(f"导出失败(_id={data.get('_id')}):{e!r}")
            continue

    csvData = pd.DataFrame(dataList)
    if len(csvData.columns) == len(_CSV_HEADERS):
        csvData.columns = list(_CSV_HEADERS)
    else:
        # Empty result set or unexpected schema: keep the raw field names
        # instead of crashing after all documents have been exported.
        print(f"警告:数据字段数({len(csvData.columns)})与表头数({len(_CSV_HEADERS)})不一致,统计表保留原始字段名")
    csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
    print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")


if __name__ == '__main__':
    saveFile()