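# NOTE: the comment lines below appear to be page residue from the web code viewer
# this file was copied from; they are not part of the program.
# 187 lines
# 7.4 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.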

# _*_ coding : UTF-8 _*_
# @Time : 2023/8/28 0:50
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : Exports the government regulations of the Xinjiang Uygur Autonomous Region People's Government
import datetime
import os
import time
import pandas as pd
import pymongo
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm
# Connection to the MongoDB server on localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)

# Database (top-level container) in which the policy documents are stored.
mydb = client["sjzf_zcwj"]

# Collection holding the policy documents that saveFile() exports.
xinjiangzcwj = mydb["xinjiangzcwj"]

# Directory the exported files are written to; saveFile() asks for it
# interactively when this is left empty.
savePath = ""
def replace_invalid_chars(text):
    """
    Return ``text`` with path-forbidden characters replaced by HTML entities.

    The characters ``< > : " / \\ | ? *`` (the set this script treats as
    forbidden in Windows/Linux file paths) are each swapped for the matching
    HTML entity; every other character is returned unchanged.
    """
    # Forbidden character -> HTML entity written in its place.
    entity_for = {
        '<': '&lt;',
        '>': '&gt;',
        ':': '&#58;',
        '"': '&quot;',
        '/': '&#47;',
        '\\': '&#92;',
        '|': '&#124;',
        '?': '&#63;',
        '*': '&#42;',
    }
    # None of the entities contains a forbidden character, so one translation
    # pass gives the same result as replacing the characters one after another.
    return text.translate(str.maketrans(entity_for))
def analysisTime(timestamp: int) -> str:
    """
    Convert a Unix timestamp in seconds to a ``YYYY-MM-DD`` date string.

    Args:
        timestamp: Seconds relative to the Unix epoch; may be negative for
            dates before 1970-01-01.

    Returns:
        ``"未知"`` when ``timestamp`` is exactly 0 (treated as "no date"),
        otherwise the local-time calendar date formatted as ``%Y-%m-%d``.
    """
    if timestamp == 0:
        return "未知"
    if timestamp < 0:
        # datetime.fromtimestamp() may reject negative values on some
        # platforms, so pre-1970 dates are computed as an offset instead.
        # The offset is taken from the *local* epoch so that this branch
        # agrees with the local-time conversion used for positive values;
        # offsetting from a naive UTC 1970-01-01 could land on the wrong
        # calendar day in non-UTC time zones.
        local_epoch = datetime.datetime.fromtimestamp(0)
        date = local_epoch + datetime.timedelta(seconds=timestamp)
    else:
        date = datetime.datetime.fromtimestamp(timestamp)
    return date.strftime('%Y-%m-%d')
# Database field name -> column header used in the summary CSV.
# NOTE(review): this pairing assumes the stored documents list their fields in
# the same order as the filter template in saveFile(), which is how the headers
# were previously assigned by position. Confirm against the collection schema.
_CSV_HEADERS = {
    "_id": "数据库ID",
    "typeOneName": "文章归类",
    "typeSecondName": "文种(自治区)",
    "articleType": "公文种类",
    "title": "文章标题",
    "subTitle": "文章副标题",
    "childtype": "主题分类",
    "index": "索引号",
    "pcode": "文号",
    "puborg": "发文机关(自治区)",
    "ptime": "成文日期时间戳",
    "pubtime": "发布日期时间戳",
    "effectiveness": "有效性",
    "author": "作者",
    "year": "年份",
    "manuscriptRelatedRes": "附件链接",
    "url": "文章链接",
    "source": "来源",
    "content": "正文内容",
}


def _add_styled_run(paragraph, text, size, east_asia_font, bold=False):
    """Append ``text`` to ``paragraph`` as one run with the given size and fonts."""
    run = paragraph.add_run(text)
    if bold:
        run.bold = True
    run.font.size = Pt(size)
    run.font.name = 'Times New Roman'  # Western (Latin) font
    # The East Asian font has to be written to the run's rFonts element directly.
    run.element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)
    return run


def _add_meta_line(doc, text):
    """Add one 12 pt metadata line (East Asian font 楷体) to ``doc``."""
    _add_styled_run(doc.add_paragraph(), text, 12, "楷体")


def _build_document(data):
    """Render one policy record (a dict read from the collection) as a Document."""
    doc = Document()
    _add_meta_line(doc, f"索引号:{data['index']}\t\t有效性:{data['effectiveness']}")
    _add_meta_line(doc, f"发文机关:{data['puborg']}\t\t发文字号:{data['pcode']}")
    _add_meta_line(doc, f"标题:{data['title']}")
    pubtime = analysisTime(data['pubtime'])
    ptime = analysisTime(data['ptime'])
    _add_meta_line(doc, f"成文日期:{ptime}\t\t发布日期:{pubtime}")
    _add_meta_line(doc, f"文章链接:{data['url']}")

    # Centered, bold main title.
    heading = doc.add_paragraph()
    _add_styled_run(heading, data["title"], 22, "华文中宋", bold=True)
    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # One paragraph per line of the body text.
    for section in data["content"].split("\n"):
        paragraph = doc.add_paragraph()
        # The first-line indent is a paragraph-format property; assigning it
        # on the run (as was done before) does not affect the document.
        paragraph.paragraph_format.first_line_indent = Cm(0.74)
        # The leading tab is kept so the exported text itself is unchanged.
        _add_styled_run(paragraph, "\t" + section, 16, "仿宋")

    # Attachment links, only when the record has any.
    if data["manuscriptRelatedRes"]:
        _add_meta_line(doc, f"附件链接:{data['manuscriptRelatedRes']}")
    return doc


def saveFile():
    """
    Export every matching policy document to .docx and write a summary CSV.

    Reads the records from the ``xinjiangzcwj`` collection (optionally
    narrowed by the filter template below), writes one Word file per record
    to ``<savePath>/<typeOneName>/<typeSecondName>/<title>.docx`` and finally
    writes all records to ``<savePath>/数据统计表.csv``. When the module-level
    ``savePath`` is empty the user is prompted for it. Progress, per-record
    errors and a final summary are printed to stdout.

    A record that fails to export is reported and skipped; it does not stop
    the remaining records from being processed.
    """
    global savePath
    num = 0  # number of documents exported successfully
    startTime = time.time()

    # Filter template: fill in a value to restrict the export to records whose
    # field equals it. Empty entries are dropped, so by default nothing is
    # filtered out.
    query = {
        'typeOneName': "",
        'typeSecondName': "",
        'articleType': "",
        "title": "",
        "subTitle": "",
        "childtype": "",
        "index": "",
        "pcode": "",
        "puborg": "",
        "ptime": "",
        "pubtime": "",
        "effectiveness": "",
        "author": "",
        "year": "",
        "manuscriptRelatedRes": "",
        "url": "",
        "source": "",
        "content": ""
    }
    query = {k: v for k, v in query.items() if v}
    dataList = list(xinjiangzcwj.find(query))

    if not savePath:
        savePath = input("请输入数据存放路径:")
    if savePath:
        # Make sure the CSV can be written even if no document gets exported.
        os.makedirs(savePath, exist_ok=True)
    totalPath = os.path.join(savePath, "数据统计表.csv")

    for data in dataList:
        try:
            typeOneName = data["typeOneName"]  # first-level category
            typeSecondName = data["typeSecondName"]  # second-level category
            output_directory = os.path.join(savePath, typeOneName, typeSecondName)
            os.makedirs(output_directory, exist_ok=True)

            doc = _build_document(data)

            # Titles longer than 45 characters keep only their last 30
            # characters in the file name.
            title_ = data["title"]
            if len(title_) > 45:
                title_ = title_[-30:]
            fileName = f"{replace_invalid_chars(title_)}.docx"
            doc.save(os.path.join(output_directory, fileName))
            num += 1
            print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功!")
        except Exception as e:
            # Best effort: report the failure and carry on with the next record.
            print(e)
            continue

    # Rename columns by field name instead of assigning headers by position:
    # positional assignment raises when the result set is empty or when the
    # records do not all have exactly the expected fields.
    csvData = pd.DataFrame(dataList).rename(columns=_CSV_HEADERS)
    csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
    print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    saveFile()