guoneimeitishujucaiji/地方政策/政策/新疆/export/Exportxjzfgz.py

# _*_ coding : UTF-8 _*_
# @Time : 2023/8/28 0:50
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序用于导出新疆维吾尔自治区人民政府规章

import datetime
import os
import time

import pandas as pd
import pymongo
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm

# Connection to the MongoDB server on localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Top-level database object under which the policy documents are stored.
mydb = client["sjzf_zcwj"]
# Collection object holding the policy documents that saveFile() exports.
xinjiangzcwj = mydb["xinjiangzcwj"]

# Directory the exported files are written to; saveFile() prompts for it
# while this is still empty.
savePath = ""


def replace_invalid_chars(text):
    """
    Return *text* with path-unsafe characters turned into HTML entities.

    The nine characters treated here as forbidden in Windows / Linux file
    paths are each swapped for their HTML entity; every other character is
    passed through untouched.
    """
    # Forbidden character -> HTML entity that stands in for it.
    entity_for = {
        '<': '&lt;',
        '>': '&gt;',
        ':': '&#58;',
        '"': '&quot;',
        '/': '&#47;',
        '\\': '&#92;',
        '|': '&#124;',
        '?': '&#63;',
        '*': '&#42;',
    }
    # None of the entities contains a forbidden character, so one translate
    # pass gives the same result as replacing the characters one by one.
    return text.translate(str.maketrans(entity_for))


def analysisTime(timestamp: int) -> str:
    """
    Format a Unix timestamp (seconds) as a ``YYYY-MM-DD`` date string.

    A timestamp of exactly 0 is reported as "未知" (unknown). Negative
    timestamps (before 1970-01-01) are resolved by stepping back from the
    epoch with a timedelta instead of calling ``fromtimestamp``.

    NOTE(review): the negative branch counts from a naive 1970-01-01 00:00,
    while the non-negative branch converts to local time via
    ``fromtimestamp`` -- confirm this difference is intended.
    """
    if timestamp == 0:
        return "未知"
    epoch = datetime.datetime(1970, 1, 1)
    moment = (
        epoch - datetime.timedelta(seconds=-timestamp)
        if timestamp < 0
        else datetime.datetime.fromtimestamp(timestamp)
    )
    return f"{moment:%Y-%m-%d}"


# Stored field name -> column header used in the summary CSV.
# The previous code assigned these headers purely by position; the pairing
# below assumes documents store their fields in the same order as the filter
# template in saveFile(), preceded by "_id".
# NOTE(review): confirm this pairing against the collection's real schema.
_CSV_COLUMN_LABELS = {
    "_id": "数据库ID",
    "typeOneName": "文章归类",
    "typeSecondName": "文种（自治区）",
    "articleType": "公文种类",
    "title": "文章标题",
    "subTitle": "文章副标题",
    "childtype": "主题分类",
    "index": "索引号",
    "pcode": "文号",
    "puborg": "发文机关（自治区）",
    "ptime": "成文日期时间戳",
    "pubtime": "发布日期时间戳",
    "effectiveness": "有效性",
    "author": "作者",
    "year": "年份",
    "manuscriptRelatedRes": "附件链接",
    "url": "文章链接",
    "source": "来源",
    "content": "正文内容",
}


def _add_styled_paragraph(doc, text, *, size, east_asia_font, bold=False, centered=False):
    """Append a one-run paragraph to *doc* with the given size and fonts; return it."""
    paragraph = doc.add_paragraph()
    run = paragraph.add_run(text)
    if bold:
        run.bold = True
    run.font.size = Pt(size)
    run.font.name = 'Times New Roman'  # Western (Latin) font
    # The East Asian font has to be written on the rFonts element directly.
    run.element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)
    if centered:
        paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    return paragraph


def _build_document(data):
    """Build the Word document (metadata lines, title, body, attachments) for one record."""
    doc = Document()

    # Metadata header: 12 pt KaiTi lines.
    def add_meta(text):
        _add_styled_paragraph(doc, text, size=12, east_asia_font="楷体")

    add_meta(f"索引号：{data['index']}\t\t有效性：{data['effectiveness']}")
    add_meta(f"发文机关：{data['puborg']}\t\t发文字号：{data['pcode']}")
    add_meta(f"标题：{data['title']}")
    pubtime = analysisTime(data['pubtime'])
    ptime = analysisTime(data['ptime'])
    add_meta(f"成文日期：{ptime}\t\t发布日期：{pubtime}")
    add_meta(f"文章链接：{data['url']}")

    # Centered bold headline.
    _add_styled_paragraph(doc, data["title"], size=22, east_asia_font="华文中宋",
                          bold=True, centered=True)

    # Body text: one paragraph per source line, indented by a leading tab.
    # The previous code also assigned `first_line_indent` on the run, but that
    # property belongs to the paragraph format, not the run, so the statement
    # had no effect on the document and has been dropped.
    for section in data["content"].split("\n"):
        _add_styled_paragraph(doc, "\t" + section, size=16, east_asia_font="仿宋")

    if data["manuscriptRelatedRes"]:
        add_meta(f"附件链接：{data['manuscriptRelatedRes']}")
    return doc


def _export_file_name(title):
    """Return the .docx file name for *title*: titles over 45 chars keep only their last 30."""
    shortened = title[-30:] if len(title) > 45 else title
    return f"{replace_invalid_chars(shortened)}.docx"


def saveFile():
    """
    Export every matching policy document to a .docx file plus one summary CSV.

    Documents are read from the ``xinjiangzcwj`` collection, optionally
    narrowed by the filter template below (empty values are ignored, so the
    default exports everything). Each record is written to
    ``<savePath>/<typeOneName>/<typeSecondName>/<title>.docx``; a record that
    fails is reported and skipped so that one bad record does not abort the
    run. Finally all fetched records are written to
    ``<savePath>/数据统计表.csv``.

    If the module-level ``savePath`` is empty the user is prompted for it and
    the answer is stored back into the global.
    """
    global savePath
    # Filter template: fill in a value to restrict the export to matching
    # documents. Entries left empty are dropped from the query.
    query = {
        'typeOneName': "",
        'typeSecondName': "",
        'articleType': "",
        "title": "",
        "subTitle": "",
        "childtype": "",
        "index": "",
        "pcode": "",
        "puborg": "",
        "ptime": "",
        "pubtime": "",
        "effectiveness": "",
        "author": "",
        "year": "",
        "manuscriptRelatedRes": "",
        "url": "",
        "source": "",
        "content": ""
    }
    query = {key: value for key, value in query.items() if value}

    if not savePath:
        savePath = input("请输入数据存放路径：")
    # Start timing only after the interactive prompt so the reported duration
    # measures the export itself, not the time spent typing the path.
    startTime = time.perf_counter()
    # Make sure the target directory exists even when no record creates a
    # sub-directory (empty result set, or every record failing); otherwise
    # writing the CSV would fail. An empty path means the current directory.
    if savePath:
        os.makedirs(savePath, exist_ok=True)
    totalPath = os.path.join(savePath, "数据统计表.csv")

    dataList = list(xinjiangzcwj.find(query))
    num = 0
    for data in dataList:
        try:
            typeOneName = data["typeOneName"]  # first-level category directory
            typeSecondName = data["typeSecondName"]  # second-level category directory
            output_directory = os.path.join(savePath, typeOneName, typeSecondName)
            os.makedirs(output_directory, exist_ok=True)
            doc = _build_document(data)
            doc.save(os.path.join(output_directory, _export_file_name(data["title"])))
            num += 1
            print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功！")
        except Exception as e:
            # Deliberate best-effort: report which record failed and move on.
            print(f"{data.get('title', data.get('_id'))}--导出失败：{e!r}")
            continue

    # Select and label columns by field name rather than by position, so an
    # empty result set or a document with missing/extra fields neither raises
    # a length-mismatch error nor shifts headers onto the wrong columns.
    csvData = pd.DataFrame(dataList, columns=list(_CSV_COLUMN_LABELS))
    csvData = csvData.rename(columns=_CSV_COLUMN_LABELS)
    csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
    print(f"耗时：{time.perf_counter() - startTime} 秒，一共导出{num}份文件,详情数据请看数据统计表.csv")


if __name__ == '__main__':
    saveFile()