2.6 结构整理、增加了cache命中缓存
This commit is contained in:
parent
d4a3446236
commit
2e1d070f3b
@ -3,7 +3,7 @@ import os
|
|||||||
import mimetypes
|
import mimetypes
|
||||||
import requests
|
import requests
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
from flask_app.general.clean_pdf import is_scanned_pdf
|
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf
|
||||||
def download_file(url, local_filename):
|
def download_file(url, local_filename):
|
||||||
"""
|
"""
|
||||||
下载文件并保存到本地,基于Content-Type设置文件扩展名。
|
下载文件并保存到本地,基于Content-Type设置文件扩展名。
|
||||||
|
@ -1,11 +1,9 @@
|
|||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import time
|
import time
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
import requests
|
import requests
|
||||||
from ratelimit import sleep_and_retry, limits
|
from ratelimit import sleep_and_retry, limits
|
||||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
|
||||||
|
|
||||||
def pdf2txt(file_path):
|
def pdf2txt(file_path):
|
||||||
common_header = extract_common_header(file_path)
|
common_header = extract_common_header(file_path)
|
||||||
|
@ -5,7 +5,7 @@ import os
|
|||||||
import fitz
|
import fitz
|
||||||
import regex
|
import regex
|
||||||
from PyPDF2 import PdfReader, PdfWriter
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
|
||||||
from flask_app.general.format_change import docx2pdf
|
from flask_app.general.format_change import docx2pdf
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
|
||||||
|
@ -1,98 +0,0 @@
|
|||||||
import os
|
|
||||||
import docx
|
|
||||||
from spire.doc import *
|
|
||||||
from spire.doc.common import *
|
|
||||||
|
|
||||||
|
|
||||||
def split_and_clean_docx(input_file, output_folder):
    """
    Split a Word document into one file per section, then remove watermarks.

    Every section of ``input_file`` is cloned into its own document, which
    also receives the source document's themes and default style so the
    formatting stays consistent. Each result is saved into ``output_folder``
    as ``节<N>.docx`` (N is 1-based). Once all sections are written,
    ``remove_watermark`` is run over ``output_folder``.

    Args:
        input_file (str): Path of the source Word document to split.
        output_folder (str): Directory that receives the split documents;
            it is created when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    # Load the source document.
    with Document() as document:
        document.LoadFromFile(input_file)

        # Walk every section of the source document.
        for sec_index in range(document.Sections.Count):
            section = document.Sections[sec_index]

            # Build a fresh document that holds only the current section.
            with Document() as new_document:
                new_document.Sections.Add(section.Clone())

                # Carry over themes and default style to keep formatting consistent.
                document.CloneThemesTo(new_document)
                document.CloneDefaultStyleTo(new_document)

                # Save the section as its own file.
                output_file = os.path.join(output_folder, f"节{sec_index + 1}.docx")
                new_document.SaveToFile(output_file, FileFormat.Docx2016)

    # Strip the watermark from the generated files.
    remove_watermark(output_folder)

    print("文档拆分并去除水印完成!")
|
|
||||||
|
|
||||||
|
|
||||||
def remove_watermark(output_folder):
    """
    Remove the Spire.Doc evaluation notice from the .docx files in a folder.

    Every entry of ``output_folder`` whose name ends with ``.docx`` is opened.
    In each paragraph whose text contains the notice, the first run that
    contains the notice has it replaced by an empty string. A document is
    written back to its original path only when at least one run was changed.

    Args:
        output_folder (str): Folder holding the documents to clean.
    """
    bookmark_name = "Evaluation Warning: The document was created with Spire.Doc for Python."

    for file_name in os.listdir(output_folder):
        if not file_name.endswith(".docx"):
            continue

        doc_path = os.path.join(output_folder, file_name)
        doc = docx.Document(doc_path)
        modified = False

        # Find paragraphs carrying the notice and blank it out.
        for paragraph in doc.paragraphs:
            if bookmark_name not in paragraph.text:
                continue
            for run in paragraph.runs:
                if bookmark_name in run.text:
                    run.text = run.text.replace(bookmark_name, "")
                    modified = True
                    # Only the first matching run of a paragraph is rewritten.
                    break

        # Skip the rewrite for documents that never contained the notice.
        if modified:
            doc.save(doc_path)
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
def test_split_and_clean_docx(
    input_file="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx",
    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub",
):
    """
    Manually exercise document splitting and watermark removal.

    Runs ``split_and_clean_docx`` and prints whether the output folder exists
    and whether it contains any entries. Results are only printed; nothing is
    asserted or returned.

    Args:
        input_file (str): Source document path. Defaults to the original
            hard-coded location; override it on other machines.
        output_folder (str): Output directory. Defaults to the original
            hard-coded location.
    """
    # Run the split and watermark removal.
    split_and_clean_docx(input_file, output_folder)

    # Check that the output was produced.
    if not os.path.exists(output_folder):
        print("测试失败: 输出文件夹未生成。")
        return

    print("测试通过: 输出文件夹已生成。")
    output_files = os.listdir(output_folder)
    if output_files:
        print(f"测试通过: 生成了 {len(output_files)} 个文档。")
    else:
        print("测试失败: 没有生成任何文档。")


if __name__ == "__main__":
    test_split_and_clean_docx()
|
|
@ -1,5 +1,5 @@
|
|||||||
import PyPDF2
|
import PyPDF2
|
||||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
|
||||||
|
|
||||||
def extract_text_by_page(file_path):
|
def extract_text_by_page(file_path):
|
||||||
common_header = extract_common_header(file_path)
|
common_header = extract_common_header(file_path)
|
||||||
|
@ -3,7 +3,7 @@ import os
|
|||||||
import requests
|
import requests
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
from flask_app.general.clean_pdf import is_scanned_pdf
|
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf
|
||||||
from flask_app.general.format_change import pdf2docx, docx2pdf
|
from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ import os
|
|||||||
import fitz
|
import fitz
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
import tempfile
|
import tempfile
|
||||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
|
||||||
from flask_app.general.llm.doubao import doubao_model
|
from flask_app.general.llm.doubao import doubao_model
|
||||||
from flask_app.old_version.table_ocr_old import CommonOcr
|
from flask_app.old_version.table_ocr_old import CommonOcr
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ import regex
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
from flask_app.general.clean_pdf import clean_page_content
|
from flask_app.general.读取文件.clean_pdf import clean_page_content
|
||||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
|
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
|
||||||
extract_pages_tobidders_notice, get_notice, extract_pages_generic
|
extract_pages_tobidders_notice, get_notice, extract_pages_generic
|
||||||
from flask_app.general.通用功能函数 import get_global_logger
|
from flask_app.general.通用功能函数 import get_global_logger
|
||||||
|
@ -9,7 +9,7 @@ from flask_app.general.json_utils import clean_json_string
|
|||||||
from flask_app.general.llm.model_continue_query import process_continue_answers
|
from flask_app.general.llm.model_continue_query import process_continue_answers
|
||||||
from flask_app.general.截取pdf通用函数 import create_get_text_function
|
from flask_app.general.截取pdf通用函数 import create_get_text_function
|
||||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long_stream, qianwen_plus
|
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long_stream, qianwen_plus
|
||||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
|
||||||
from flask_app.general.format_change import docx2pdf, pdf2docx
|
from flask_app.general.format_change import docx2pdf, pdf2docx
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
import fitz
|
import fitz
|
||||||
from PyPDF2 import PdfReader, PdfWriter
|
from PyPDF2 import PdfReader
|
||||||
import regex # 导入正则表达式库
|
import regex # 导入正则表达式库
|
||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
from flask_app.general.clean_pdf import clean_page_content
|
from flask_app.general.读取文件.clean_pdf import clean_page_content
|
||||||
from flask_app.general.merge_pdfs import merge_and_cleanup
|
from flask_app.general.merge_pdfs import merge_and_cleanup
|
||||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
|
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
|
||||||
get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice, create_get_text_function
|
get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice, create_get_text_function
|
||||||
|
Loading…
x
Reference in New Issue
Block a user