# Listing metadata from the web view: 187 lines, 7.4 KiB, Python; last change 2024-11-09 17:00:30 +08:00
# _*_ coding : UTF-8 _*_
# @Time : 2023/8/28 0:50
# @Author : Haochen Zhong
# @File : Exportxjzfgz.py
# @Software : PyCharm
# @Comment : 本程序用于导出新疆维吾尔自治区人民政府规章
import datetime
import os
import time
import pandas as pd
import pymongo
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Cm
# Client handle for the MongoDB server on localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Database object under which the policy documents are stored.
mydb = client["sjzf_zcwj"]
# Collection object holding the policy documents themselves.
xinjiangzcwj = mydb["xinjiangzcwj"]
# Directory the export is written to; saveFile() asks for it on stdin when it is empty.
savePath = ""
def replace_invalid_chars(text):
    """Return *text* with path-forbidden characters turned into HTML entities.

    The nine characters that Windows/Linux do not allow in file names are each
    replaced by a fixed HTML entity so the result can be used as a file name.
    All other characters are returned unchanged.
    """
    entity_by_char = {
        '<': '&lt;',
        '>': '&gt;',
        ':': '&#58;',
        '"': '&quot;',
        '/': '&#47;',
        '\\': '&#92;',
        '|': '&#124;',
        '?': '&#63;',
        '*': '&#42;',
    }
    # None of the entities contains a forbidden character, so a single
    # translation pass yields the same result as replacing them one by one.
    return text.translate(str.maketrans(entity_by_char))
def analysisTime(timestamp: int) -> str:
    """Format a Unix timestamp given in seconds as a ``YYYY-MM-DD`` string.

    Args:
        timestamp: Seconds relative to 1970-01-01. May be negative for dates
            before 1970. ``0`` and ``None`` both mean "no date available".

    Returns:
        The formatted date, or ``"未知"`` ("unknown") when *timestamp* is
        ``0`` or ``None``.
    """
    # A missing value is reported like the explicit 0 marker instead of
    # raising TypeError on the comparison below (which made the caller skip
    # the whole document).
    if timestamp is None or timestamp == 0:
        return "未知"
    if timestamp < 0:
        # Dates before 1970 are computed by stepping back from the epoch
        # rather than through fromtimestamp() — presumably because
        # fromtimestamp() can fail for negative values on some platforms.
        # NOTE(review): this branch counts from a naive 1970-01-01 00:00,
        # while the branch below converts to local time; confirm which
        # convention the stored timestamps follow.
        date = datetime.datetime(1970, 1, 1) - datetime.timedelta(seconds=abs(timestamp))
    else:
        date = datetime.datetime.fromtimestamp(timestamp)
    return date.strftime('%Y-%m-%d')
# Stored document key -> column header used in the statistics CSV.
# NOTE(review): the pairing is inferred from the order of the filter keys in
# saveFile() and from the labels used in the exported document header;
# confirm against the stored schema.
_CSV_COLUMNS = {
    "_id": "数据库ID",
    "typeOneName": "文章归类",
    "typeSecondName": "文种(自治区)",
    "articleType": "公文种类",
    "title": "文章标题",
    "subTitle": "文章副标题",
    "childtype": "主题分类",
    "index": "索引号",
    "pcode": "文号",
    "puborg": "发文机关(自治区)",
    "ptime": "成文日期时间戳",
    "pubtime": "发布日期时间戳",
    "effectiveness": "有效性",
    "author": "作者",
    "year": "年份",
    "manuscriptRelatedRes": "附件链接",
    "url": "文章链接",
    "source": "来源",
    "content": "正文内容",
}


def _add_styled_run(paragraph, text, size, east_asia_font, bold=False):
    """Append *text* to *paragraph* as a run with the given size and fonts."""
    run = paragraph.add_run(text)
    if bold:
        run.bold = True
    run.font.size = Pt(size)
    run.font.name = 'Times New Roman'  # font for Latin text
    # The East Asian font has to be set on the underlying XML element.
    run.element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)
    return run


def _add_meta_line(doc, text):
    """Add one 12 pt metadata line (East Asian font 楷体) to *doc*."""
    _add_styled_run(doc.add_paragraph(), text, 12, "楷体")


def _build_document(data):
    """Build the Word document for one stored record and return it."""
    doc = Document()
    _add_meta_line(doc, f"索引号:{data['index']}\t\t有效性:{data['effectiveness']}")
    _add_meta_line(doc, f"发文机关:{data['puborg']}\t\t发文字号:{data['pcode']}")
    _add_meta_line(doc, f"标题:{data['title']}")
    pubtime = analysisTime(data['pubtime'])
    ptime = analysisTime(data['ptime'])
    _add_meta_line(doc, f"成文日期:{ptime}\t\t发布日期:{pubtime}")
    _add_meta_line(doc, f"文章链接:{data['url']}")

    # Centered, bold main title.
    title = doc.add_paragraph()
    _add_styled_run(title, data["title"], 22, "华文中宋", bold=True)
    title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # One paragraph per line of the body text, each starting with a tab.
    for section in data["content"].split("\n"):
        run = _add_styled_run(doc.add_paragraph(), "\t" + section, 16, "仿宋")
        # NOTE(review): this assigns an attribute on the Run object;
        # python-docx documents first_line_indent on
        # paragraph.paragraph_format, so this line likely has no effect.
        # Kept unchanged so the generated layout stays identical.
        run.first_line_indent = Cm(0.74)

    if data["manuscriptRelatedRes"]:
        _add_meta_line(doc, f"附件链接:{data['manuscriptRelatedRes']}")
    return doc


def _export_file_name(title):
    """Return the .docx file name for *title* (last 30 chars if longer than 45)."""
    if len(title) > 45:
        title = title[len(title) - 30:]
    return f"{replace_invalid_chars(title)}.docx"


def saveFile():
    """Export every matching record as a .docx file plus one summary CSV.

    Records are read from the ``xinjiangzcwj`` collection. Each record is
    written to ``<savePath>/<typeOneName>/<typeSecondName>/<title>.docx``;
    a record that raises during export is reported on stdout and skipped.
    Afterwards all fetched records are written to ``数据统计表.csv`` in
    ``savePath``. If the module-level ``savePath`` is empty, it is read from
    stdin and stored back into the global.
    """
    global savePath
    num = 0
    startTime = time.time()
    # Filter template: fill in a value to restrict the export to records with
    # that field value. Empty entries are dropped, so by default nothing is
    # filtered.
    query = {
        'typeOneName': "",
        'typeSecondName': "",
        'articleType': "",
        "title": "",
        "subTitle": "",
        "childtype": "",
        "index": "",
        "pcode": "",
        "puborg": "",
        "ptime": "",
        "pubtime": "",
        "effectiveness": "",
        "author": "",
        "year": "",
        "manuscriptRelatedRes": "",
        "url": "",
        "source": "",
        "content": ""
    }
    query = {k: v for k, v in query.items() if v}
    dataList = list(xinjiangzcwj.find(query))
    if not savePath:
        savePath = input("请输入数据存放路径:")
    totalPath = os.path.join(savePath, "数据统计表.csv")
    for data in dataList:
        try:
            typeOneName = data["typeOneName"]  # first-level category directory
            typeSecondName = data["typeSecondName"]  # second-level category directory
            output_directory = os.path.join(savePath, typeOneName, typeSecondName)
            os.makedirs(output_directory, exist_ok=True)
            doc = _build_document(data)
            doc.save(os.path.join(output_directory, _export_file_name(data["title"])))
            num += 1
            print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功!")
        except Exception as e:
            # Best effort: report the failure and continue with the next record.
            print(e)
            continue
    # Make sure the target directory exists even when no document was written.
    if savePath:
        os.makedirs(savePath, exist_ok=True)
    # Select and label columns by key rather than by position, so an empty
    # result set or a record with a different key order cannot raise or
    # mislabel the CSV headers.
    csvData = pd.DataFrame(dataList, columns=list(_CSV_COLUMNS)).rename(columns=_CSV_COLUMNS)
    csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
    print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")
# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    saveFile()