186 lines
7.4 KiB
Python
186 lines
7.4 KiB
Python
|
|
# _*_ coding : UTF-8 _*_
|
|||
|
|
# @Time : 2023/8/28 0:50
|
|||
|
|
# @Author : Haochen Zhong
|
|||
|
|
# @File : Exportxjzfgz.py
|
|||
|
|
# @Software : PyCharm
|
|||
|
|
# @Comment : This program exports the regulations of the People's Government of the Xinjiang Uygur Autonomous Region
|
|||
|
|
|
|||
|
|
import datetime
|
|||
|
|
import os
|
|||
|
|
import time
|
|||
|
|
|
|||
|
|
import pandas as pd
|
|||
|
|
import pymongo
|
|||
|
|
from docx import Document
|
|||
|
|
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
|||
|
|
from docx.oxml.ns import qn
|
|||
|
|
from docx.shared import Pt, Cm
|
|||
|
|
|
|||
|
|
# Connection to the MongoDB server running on this machine.
client = pymongo.MongoClient(host='localhost', port=27017)

# Database (top-level container) in which the policy documents are stored.
mydb = client["sjzf_zcwj"]

# Collection that holds the policy documents themselves.
xinjiangzcwj = mydb["xinjiangzcwj"]

# Directory the exported files are written to; when left empty, saveFile()
# asks for it interactively.
savePath = ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
def replace_invalid_chars(text):
    """
    Make ``text`` safe to use as a file name on Windows and Linux.

    Every character that is forbidden in a file name on those systems
    (``< > : " / \\ | ? *``) is replaced by its numeric HTML entity, i.e.
    ``&#NN;`` where ``NN`` is the decimal code point of the character.
    All other characters are returned unchanged.

    :param text: candidate file name (a single path component).
    :return: ``text`` with every forbidden character replaced.
    """
    # Characters that may not appear in a Windows/Linux file name.
    invalid_chars = '<>:"/\\|?*'
    # The entities are computed from the code points instead of being spelled
    # out as literals: a literal table is silently turned into an identity
    # mapping (and a broken '\' literal) by any tool that decodes HTML
    # entities in the source text.
    entity_table = {ord(char): f"&#{ord(char)};" for char in invalid_chars}
    # str.translate performs all substitutions in a single pass, so text
    # produced by one replacement is never re-examined by a later one.
    return text.translate(entity_table)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def analysisTime(timestamp: int) -> str:
    """
    Convert a Unix timestamp in seconds to a ``YYYY-MM-DD`` date string.

    The date is expressed in the local time zone for both positive and
    negative timestamps, so dates before 1970-01-01 are converted with the
    same rules as later ones.

    :param timestamp: seconds since 1970-01-01 00:00:00 UTC; may be negative.
        ``0``, ``None`` and other empty values mean "no date recorded".
    :return: the formatted date, or ``"未知"`` when no date is recorded.
    """
    if not timestamp:
        return "未知"
    if timestamp < 0:
        # fromtimestamp() may reject negative values on some platforms, so
        # step back from the epoch instead. The epoch is taken in local time
        # to stay consistent with the branch below; using a naive UTC epoch
        # here would shift pre-1970 dates by the local UTC offset.
        local_epoch = datetime.datetime.fromtimestamp(0)
        date = local_epoch + datetime.timedelta(seconds=timestamp)
    else:
        date = datetime.datetime.fromtimestamp(timestamp)
    # Format as a plain calendar date.
    return date.strftime('%Y-%m-%d')
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _add_styled_line(doc, text, size, east_asia_font, bold=False):
    """Append a paragraph holding one run of ``text`` and return the paragraph.

    The run uses 'Times New Roman' for Western text and ``east_asia_font``
    for East Asian text, at ``size`` points.
    """
    paragraph = doc.add_paragraph()
    run = paragraph.add_run(text)
    if bold:
        run.bold = True
    run.font.size = Pt(size)
    run.font.name = 'Times New Roman'  # Western font
    # python-docx only sets the Western font through font.name; the East
    # Asian font has to be written on the rFonts element directly.
    run.element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)
    return paragraph


def _build_document(data):
    """Build the Word document (metadata header, title, body) for one record."""
    doc = Document()
    pubtime = analysisTime(data['pubtime'])
    ptime = analysisTime(data['ptime'])

    # Metadata header: index/validity, issuer/document number, title,
    # dates and the source link, one paragraph each.
    header_lines = (
        f"索引号:{data['index']}\t\t有效性:{data['effectiveness']}",
        f"发文机关:{data['puborg']}\t\t发文字号:{data['pcode']}",
        f"标题:{data['title']}",
        f"成文日期:{ptime}\t\t发布日期:{pubtime}",
        f"文章链接:{data['url']}",
    )
    for line in header_lines:
        _add_styled_line(doc, line, 12, "楷体")

    # Centered, bold main title.
    title = _add_styled_line(doc, data["title"], 22, "华文中宋", bold=True)
    title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    # One paragraph per line of the body text. The leading tab provides the
    # first-line indent. NOTE: the previous code also assigned
    # `first_line_indent` on the run, which python-docx ignores (it is a
    # paragraph-format property), so dropping it leaves the output unchanged.
    for section in data["content"].split("\n"):
        _add_styled_line(doc, "\t" + section, 16, "仿宋")

    # Optional trailing paragraph with the attachment link(s).
    if data["manuscriptRelatedRes"]:
        _add_styled_line(doc, f"附件链接:{data['manuscriptRelatedRes']}", 12, "楷体")
    return doc


def saveFile():
    """
    Export every matching policy record to a .docx file plus one summary CSV.

    Records are read from the ``xinjiangzcwj`` collection, optionally filtered
    by the non-empty entries of ``query``. Each record is written to
    ``<savePath>/<typeOneName>/<typeSecondName>/<title>.docx`` and all records
    are dumped to ``<savePath>/数据统计表.csv``. When the module-level
    ``savePath`` is empty the user is prompted for it. A record that fails to
    export is reported and skipped; it does not stop the run.
    """
    global savePath
    num = 0
    startTime = time.time()

    # Filter template: fill in a value to restrict the export to records
    # whose field equals it. Empty entries are dropped, so by default
    # nothing is filtered.
    query = {
        'typeOneName': "",
        'typeSecondName': "",
        'articleType': "",
        "title": "",
        "subTitle": "",
        "childtype": "",
        "index": "",
        "pcode": "",
        "puborg": "",
        "ptime": "",
        "pubtime": "",
        "effectiveness": "",
        "author": "",
        "year": "",
        "manuscriptRelatedRes": "",
        "url": "",
        "source": "",
        "content": "",
    }
    query = {k: v for k, v in query.items() if v}
    dataList = list(xinjiangzcwj.find(query))

    if not savePath:
        savePath = input("请输入数据存放路径:")
    if savePath:
        # Make sure the CSV can be written even when no record is exported.
        os.makedirs(savePath, exist_ok=True)
    totalPath = os.path.join(savePath, "数据统计表.csv")

    for data in dataList:
        try:
            typeOneName = data["typeOneName"]  # first-level category
            typeSecondName = data["typeSecondName"]  # second-level category
            output_directory = os.path.join(savePath, typeOneName, typeSecondName)
            os.makedirs(output_directory, exist_ok=True)

            doc = _build_document(data)

            # Keep file names short: titles longer than 45 characters are
            # cut down to their last 30 characters.
            title = data["title"]
            title_ = title[-30:] if len(title) > 45 else title
            fileName = f"{replace_invalid_chars(title_)}.docx"
            doc.save(os.path.join(output_directory, fileName))
            num += 1
            print(f"{typeOneName}--{typeSecondName}--{data['title']}--导出成功!")
        except Exception as e:
            # Best-effort export: report which record failed and carry on.
            print(f"{data.get('title', data.get('_id'))}--导出失败:{e!r}")
            continue

    # Database field -> CSV header. Selecting and renaming by field name keeps
    # the headers attached to the right data regardless of field order and
    # also works for an empty result set.
    # NOTE(review): the mapping follows the order of the former positional
    # header list — confirm against the collection schema.
    column_labels = {
        "_id": "数据库ID",
        "typeOneName": "文章归类",
        "typeSecondName": "文种(自治区)",
        "articleType": "公文种类",
        "title": "文章标题",
        "subTitle": "文章副标题",
        "childtype": "主题分类",
        "index": "索引号",
        "pcode": "文号",
        "puborg": "发文机关(自治区)",
        "ptime": "成文日期时间戳",
        "pubtime": "发布日期时间戳",
        "effectiveness": "有效性",
        "author": "作者",
        "year": "年份",
        "manuscriptRelatedRes": "附件链接",
        "url": "文章链接",
        "source": "来源",
        "content": "正文内容",
    }
    csvData = pd.DataFrame(dataList, columns=list(column_labels))
    csvData = csvData.rename(columns=column_labels)
    csvData.to_csv(totalPath, encoding="utf-8-sig", index_label="序号")
    print(f"耗时:{time.time() - startTime} 秒,一共导出{num}份文件,详情数据请看数据统计表.csv")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    saveFile()
|