diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 86798a2..5378752 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -3,7 +3,7 @@ import os import mimetypes import requests from PyPDF2 import PdfReader -from flask_app.general.clean_pdf import is_scanned_pdf +from flask_app.general.读取文件.clean_pdf import is_scanned_pdf def download_file(url, local_filename): """ 下载文件并保存到本地,基于Content-Type设置文件扩展名。 diff --git a/flask_app/general/llm/doubao.py b/flask_app/general/llm/doubao.py index 990d195..5ddccd7 100644 --- a/flask_app/general/llm/doubao.py +++ b/flask_app/general/llm/doubao.py @@ -1,11 +1,9 @@ -import json import os -import re import time import PyPDF2 import requests from ratelimit import sleep_and_retry, limits -from flask_app.general.clean_pdf import extract_common_header, clean_page_content +from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content def pdf2txt(file_path): common_header = extract_common_header(file_path) diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index d45fdc0..93a0106 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -5,7 +5,7 @@ import os import fitz import regex from PyPDF2 import PdfReader, PdfWriter -from flask_app.general.clean_pdf import extract_common_header, clean_page_content +from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content from flask_app.general.format_change import docx2pdf import concurrent.futures diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/读取文件/clean_pdf.py similarity index 100% rename from flask_app/general/clean_pdf.py rename to flask_app/general/读取文件/clean_pdf.py diff --git a/flask_app/general/读取文件/split_test2.py b/flask_app/general/读取文件/split_test2.py deleted file mode 100644 index 7ba052c..0000000 --- a/flask_app/general/读取文件/split_test2.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import docx -from spire.doc import * -from spire.doc.common import * - - -def split_and_clean_docx(input_file, output_folder): - """ - 拆分指定的Word文档为多个节,并去除文档中的水印。 - - 参数: - input_file (str): 需要拆分的源Word文档路径。 - output_folder (str): 拆分后的文档输出目录。 - """ - - # 检查输出目录是否存在,如果不存在则创建 - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - # 加载源文档 - with Document() as document: - document.LoadFromFile(input_file) - - # 遍历文档中的所有节 - for sec_index in range(document.Sections.Count): - # 访问当前节 - section = document.Sections[sec_index] - - # 为当前节创建一个新文档 - with Document() as new_document: - # 将当前节复制到新文档 - new_document.Sections.Add(section.Clone()) - - # 复制源文档的主题和样式到新文档以确保格式一致 - document.CloneThemesTo(new_document) - document.CloneDefaultStyleTo(new_document) - - # 将新文档保存为单独的文件 - output_file = os.path.join(output_folder, f"节{sec_index + 1}.docx") - new_document.SaveToFile(output_file, FileFormat.Docx2016) - - # 去除水印 - remove_watermark(output_folder) - - print("文档拆分并去除水印完成!") - - -def remove_watermark(output_folder): - """ - 去除指定目录下所有docx文件中的水印。 - - 参数: - output_folder (str): 需要去除水印的文档所在的文件夹。 - """ - bookmark_name = "Evaluation Warning: The document was created with Spire.Doc for Python." - - for file_name in os.listdir(output_folder): - if file_name.endswith(".docx"): - doc_path = os.path.join(output_folder, file_name) - doc = docx.Document(doc_path) - # 查找并替换水印内容 - for paragraph in doc.paragraphs: - if bookmark_name in paragraph.text: - # 替换水印文本为空 - inline = paragraph.runs - for i in range(len(inline)): - if bookmark_name in inline[i].text: - inline[i].text = inline[i].text.replace(bookmark_name, "") - break - # 保存文档 - doc.save(doc_path) - -import os - -def test_split_and_clean_docx(): - """ - 测试文档拆分和水印去除功能。 - """ - input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx" # 替换为你的源文档路径 - output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub" # 指定输出文件夹 - - # 执行拆分和去除水印 - split_and_clean_docx(input_file, output_folder) - - # 验证输出文件是否生成 - if os.path.exists(output_folder): - print("测试通过: 输出文件夹已生成。") - output_files = os.listdir(output_folder) - if output_files: - print(f"测试通过: 生成了 {len(output_files)} 个文档。") - else: - print("测试失败: 没有生成任何文档。") - else: - print("测试失败: 输出文件夹未生成。") - - -if __name__ == "__main__": - test_split_and_clean_docx() diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index ccd390d..8c950b7 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -1,5 +1,5 @@ import PyPDF2 -from flask_app.general.clean_pdf import extract_common_header, clean_page_content +from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content def extract_text_by_page(file_path): common_header = extract_common_header(file_path) diff --git a/flask_app/old_version/download_old.py b/flask_app/old_version/download_old.py index 772b619..012936c 100644 --- a/flask_app/old_version/download_old.py +++ b/flask_app/old_version/download_old.py @@ -3,7 +3,7 @@ import os import requests import mimetypes -from flask_app.general.clean_pdf import is_scanned_pdf +from flask_app.general.读取文件.clean_pdf import is_scanned_pdf from flask_app.general.format_change import pdf2docx, docx2pdf diff --git a/flask_app/old_version/extarct_img_2_txt_old.py b/flask_app/old_version/extarct_img_2_txt_old.py index 8f1aea3..2cd12e4 100644 --- a/flask_app/old_version/extarct_img_2_txt_old.py +++ b/flask_app/old_version/extarct_img_2_txt_old.py @@ -3,7 +3,7 @@ import os import fitz import PyPDF2 import tempfile -from flask_app.general.clean_pdf import extract_common_header, clean_page_content +from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content from flask_app.general.llm.doubao import doubao_model from flask_app.old_version.table_ocr_old import CommonOcr diff --git a/flask_app/工程标/截取pdf工程标版.py b/flask_app/工程标/截取pdf工程标版.py index 33b4437..374e902 100644 --- a/flask_app/工程标/截取pdf工程标版.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -4,7 +4,7 @@ import regex import os import time from PyPDF2 import PdfReader -from flask_app.general.clean_pdf import clean_page_content +from flask_app.general.读取文件.clean_pdf import clean_page_content from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \ extract_pages_tobidders_notice, get_notice, extract_pages_generic from flask_app.general.通用功能函数 import get_global_logger diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index cf23334..bfed7e5 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -9,7 +9,7 @@ from flask_app.general.json_utils import clean_json_string from flask_app.general.llm.model_continue_query import process_continue_answers from flask_app.general.截取pdf通用函数 import create_get_text_function from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long_stream, qianwen_plus -from flask_app.general.clean_pdf import extract_common_header, clean_page_content +from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content from flask_app.general.format_change import docx2pdf, pdf2docx import concurrent.futures diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 6f56667..749ec5a 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -1,8 +1,8 @@ import fitz -from PyPDF2 import PdfReader, PdfWriter +from PyPDF2 import PdfReader import regex # 导入正则表达式库 import os # 用于文件和文件夹操作 -from flask_app.general.clean_pdf import clean_page_content +from flask_app.general.读取文件.clean_pdf import clean_page_content from flask_app.general.merge_pdfs import merge_and_cleanup from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \ get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice, create_get_text_function