10.24

2024-10-24 20:51:48 +08:00 · 2024-10-24 20:51:48 +08:00 · 772aaa17f0
commit 772aaa17f0
parent 89d709f9d6
3 changed files with 215 additions and 181 deletions
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@ -49,130 +49,132 @@ def pdf2docx(local_path_in):
    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath

-def doc2docx(local_path_in):
-    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
-    receive_download_url = upload_file(local_path_in, remote_url)
-    print(receive_download_url)
-    filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
-    local_filename = os.path.join(folder, filename)  # 输出文件名
-    downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
-    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
-    return downloaded_filepath
-def docx2pdf(local_path_in):
-    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
-    receive_download_url = upload_file(local_path_in, remote_url)
-    filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
-    local_filename = os.path.join(folder, filename)  # 输出文件名
-    downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
-    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
-    return downloaded_filepath
-# def docx2pdf(file_path):
-#     """
-#     将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
-#
-#     参数：
-#     - file_path: str, 本地文件的路径，支持 .docx 和 .doc 格式。
-#     """
-#     # 检查文件是否存在
-#     if not os.path.isfile(file_path):
-#         raise FileNotFoundError(f"文件未找到: {file_path}")
-#
-#     # 获取文件名和扩展名
-#     base_name = os.path.basename(file_path)
-#     name, ext = os.path.splitext(base_name)
-#     ext = ext.lower().lstrip('.')
-#
-#     if ext not in ['docx', 'doc']:
-#         raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件，当前文件扩展名为: .{ext}")
-#
-#     # 定义转换接口
-#     endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
-#
-#     # 获取文件所在目录
-#     output_dir = os.path.dirname(file_path)
-#
-#     # 准备上传的文件
-#     with open(file_path, 'rb') as f:
-#         files = {'file': (base_name, f)}
-#         try:
-#             print(f"正在将 {base_name} 转换为 .pdf 格式...")
-#             response = requests.post(endpoint, files=files)
-#             response.raise_for_status()  # 检查请求是否成功
-#         except requests.RequestException as e:
-#             print(f"转换过程中发生错误: {e}")
-#             return
-#
-#     # 准备保存转换后文件的路径
-#     output_file_name = f"{name}.pdf"
-#     output_path = os.path.join(output_dir, output_file_name)
-#
-#     # 保存转换后的文件
-#     with open(output_path, 'wb') as out_file:
-#         out_file.write(response.content)
-#
-#     print(f"文件已成功转换并保存至: {output_path}")
-#     return output_path
-#
-#
-# def doc2docx(file_path):
-#     """
-#     将本地的 .doc 文件转换为 .docx 文件。
-#
-#     参数：
-#     - file_path: str, 本地文件的路径，支持 .doc 格式。
-#     """
-#     # 检查文件是否存在
-#     if not os.path.isfile(file_path):
-#         raise FileNotFoundError(f"文件未找到: {file_path}")
-#
-#     # 获取文件名和扩展名
-#     base_name = os.path.basename(file_path)
-#     name, ext = os.path.splitext(base_name)
-#     ext = ext.lower().lstrip('.')
-#
-#     if ext != 'doc':
-#         raise ValueError(f"doc2docx 仅支持 .doc 文件，当前文件扩展名为: .{ext}")
-#
-#     # 定义转换接口
-#     endpoint = 'http://120.26.236.97:5008/convert_to_docx'
-#
-#     # 获取文件所在目录
-#     output_dir = os.path.dirname(file_path)
-#
-#     # 准备上传的文件
-#     with open(file_path, 'rb') as f:
-#         files = {'file': (base_name, f)}
-#         try:
-#             print(f"正在将 {base_name} 转换为 .docx 格式...")
-#             response = requests.post(endpoint, files=files)
-#             response.raise_for_status()  # 检查请求是否成功
-#         except requests.RequestException as e:
-#             print(f"转换过程中发生错误: {e}")
-#             return
-#
-#     # 准备保存转换后文件的路径
-#     output_file_name = f"{name}.docx"
-#     output_path = os.path.join(output_dir, output_file_name)
-#
-#     # 保存转换后的文件
-#     with open(output_path, 'wb') as out_file:
-#         out_file.write(response.content)
-#
-#     print(f"文件已成功转换并保存至: {output_path}")
-#     return output_path
+# def doc2docx(local_path_in):
+#     remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
+#     receive_download_url = upload_file(local_path_in, remote_url)
+#     print(receive_download_url)
+#     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
+#     local_filename = os.path.join(folder, filename)  # 输出文件名
+#     downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
+#     print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
+#     return downloaded_filepath
+# def docx2pdf(local_path_in):
+#     remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
+#     receive_download_url = upload_file(local_path_in, remote_url)
+#     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
+#     local_filename = os.path.join(folder, filename)  # 输出文件名
+#     downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
+#     print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
+#     return downloaded_filepath
+
+def docx2pdf(file_path, timeout=(10, 600)):
+    """
+    Convert a local .docx or .doc file to .pdf through the remote conversion service.
+
+    The file is uploaded to the service's ``convert_to_pdf`` endpoint and the response
+    body is written next to the input as ``<name>.pdf``.
+
+    Args:
+        file_path: Path of the local .docx/.doc file.
+        timeout: ``(connect, read)`` timeout in seconds handed to ``requests.post``, so a
+            stalled service cannot block the caller forever.
+
+    Returns:
+        Path of the written .pdf, or ``None`` when the HTTP request fails.
+
+    Raises:
+        FileNotFoundError: ``file_path`` is not an existing file.
+        ValueError: the extension is neither .docx nor .doc.
+    """
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"文件未找到: {file_path}")
+
+    base_name = os.path.basename(file_path)
+    name, ext = os.path.splitext(base_name)
+    ext = ext.lower().lstrip('.')
+    if ext not in ('docx', 'doc'):
+        raise ValueError(f"docx2pdf 仅支持 .docx 和 .doc 文件，当前文件扩展名为: .{ext}")
+
+    endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
+    # endpoint = 'http://192.168.0.2:5008/convert_to_pdf'
+
+    # Keep the upload handle open only for the duration of the request.
+    with open(file_path, 'rb') as f:
+        try:
+            print(f"正在将 {base_name} 转换为 .pdf 格式...")
+            response = requests.post(endpoint, files={'file': (base_name, f)}, timeout=timeout)
+            response.raise_for_status()  # HTTP 4xx/5xx count as a failed conversion
+        except requests.RequestException as e:
+            print(f"转换过程中发生错误: {e}")
+            return None
+
+    # Write the converted document beside the input file.
+    output_path = os.path.join(os.path.dirname(file_path), f"{name}.pdf")
+    with open(output_path, 'wb') as out_file:
+        out_file.write(response.content)
+    print(f"文件已成功转换并保存至: {output_path}")
+    return output_path
+
+
+def doc2docx(file_path, timeout=(10, 600)):
+    """
+    Convert a local .doc file to .docx through the remote conversion service.
+
+    The file is uploaded to the service's ``convert_to_docx`` endpoint and the response
+    body is written next to the input as ``<name>.docx``.
+
+    Args:
+        file_path: Path of the local .doc file.
+        timeout: ``(connect, read)`` timeout in seconds handed to ``requests.post``, so a
+            stalled service cannot block the caller forever.
+
+    Returns:
+        Path of the written .docx, or ``None`` when the HTTP request fails.
+
+    Raises:
+        FileNotFoundError: ``file_path`` is not an existing file.
+        ValueError: the extension is not .doc.
+    """
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"文件未找到: {file_path}")
+
+    base_name = os.path.basename(file_path)
+    name, ext = os.path.splitext(base_name)
+    ext = ext.lower().lstrip('.')
+    if ext != 'doc':
+        raise ValueError(f"doc2docx 仅支持 .doc 文件，当前文件扩展名为: .{ext}")
+
+    endpoint = 'http://120.26.236.97:5008/convert_to_docx'
+
+    # Keep the upload handle open only for the duration of the request.
+    with open(file_path, 'rb') as f:
+        try:
+            print(f"正在将 {base_name} 转换为 .docx 格式...")
+            response = requests.post(endpoint, files={'file': (base_name, f)}, timeout=timeout)
+            response.raise_for_status()  # HTTP 4xx/5xx count as a failed conversion
+        except requests.RequestException as e:
+            print(f"转换过程中发生错误: {e}")
+            return None
+
+    # Write the converted document beside the input file.
+    output_path = os.path.join(os.path.dirname(file_path), f"{name}.docx")
+    with open(output_path, 'wb') as out_file:
+        out_file.write(response.content)
+    print(f"文件已成功转换并保存至: {output_path}")
+    return output_path



 if __name__ == '__main__':
    # 替换为你的文件路径和API URL
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
-    local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
+    # local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
+    local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
    # downloaded_file=doc2docx(local_path_in)
-    # downloaded_file=pdf2docx(local_path_in)
-    for i in range(1):
-        downloaded_file=docx2pdf(local_path_in)
-        print(downloaded_file)
+    downloaded_file=pdf2docx(local_path_in)
+    # downloaded_file=docx2pdf(local_path_in)
+    print(downloaded_file)



--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -153,7 +153,7 @@ if __name__ == '__main__':
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
-    file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf'
+    file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest12.pdf"
    # ress = extract_common_header(file_path)
    # print(ress)
    res=extract_text_by_page(file_path)
--- a/flask_app/testdir/截取文件格式.py
+++ b/flask_app/testdir/截取文件格式.py
@ -1,7 +1,6 @@
 from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
-# from flask_app.general.format_change import docx2pdf  # 确保此模块可用

 def clean_page_content(text, common_header):
    """
@ -13,14 +12,14 @@ def clean_page_content(text, common_header):
                # 替换首次出现的完整行
                text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)

-    # 删除页码，合并多个正则表达式为一个
-    text = re.sub(
-        r'(^\s*\d+\s*(?=\D))|(\s+\d+\s*$)|(\s*/\s*\d+\s*)|(\s*[—-]\s*\d+\s*[—-]\s*)',
-        '',
-        text,
-        flags=re.MULTILINE
-    )
+    # 删除页码，支持多种格式
+    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码，仅当紧跟非数字字符时
+    text = re.sub(r'\s+\d+\s*$', '', text)        # 删除结尾的页码
+    text = re.sub(r'\s*\/\s*\d+\s*', '', text)    # 删除形如 /129 的页码
+    text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text)  # 删除形如 '—2—' 或 '-2-' 的页码
    return text
+
+
 def extract_common_header(pdf_path):
    """
    从PDF的前几页提取公共抬头。
@ -39,13 +38,19 @@ def extract_common_header(pdf_path):
        if text:
            # 只取每页的前三行
            first_lines = text.strip().split('\n')[:3]
-            headers.append(set(line.strip() for line in first_lines if line.strip()))
+            headers.append(first_lines)

    if len(headers) < 2:
        return ""  # 如果没有足够的页来比较，返回空字符串

    # 寻找每一行中的公共部分，按顺序保留
-    common_headers = set.intersection(*headers)
+    common_headers = []
+    for lines in zip(*headers):
+        first_words = lines[0].split()
+        common_line = [word for word in first_words if all(word in line.split() for line in lines[1:])]
+        if common_line:
+            common_headers.append(' '.join(common_line))
+
    return '\n'.join(common_headers)


@ -67,13 +72,17 @@ def is_pdf_or_doc(filename):

 def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    """
-    将提取的页面保存为新的PDF文件。
+    将提取的页面保存为新的PDF文件，统一命名为 "bid_format.pdf" 并存放在独立的子文件夹中。
    """
    try:
-        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
-        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
+        output_pdf_path = os.path.join(output_folder, "bid_format.pdf")

-        if start_page is None or end_page is None or start_page > end_page:
+        # 检查 start_page 和 end_page 是否为 None
+        if start_page is None or end_page is None:
+            print(f"起始页或结束页为 None: start_page={start_page}, end_page={end_page} 文件: {pdf_path}")
+            return ""
+
+        if start_page < 0 or end_page >= len(pdf_document.pages) or start_page > end_page:
            print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}")
            return ""

@ -89,29 +98,57 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
        return ""  # 返回空字符串


-def find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
+def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """
-    查找起始页和结束页的索引。
+    通用函数，根据模式提取起始页和结束页。
    """
    start_page = None
    end_page = None
-    total_pages = len(pdf_document.pages)
-
    for i, page in enumerate(pdf_document.pages):
-        if i <= begin_page:
-            continue
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
-        if start_page is None and re.search(begin_pattern, cleaned_text):
+        # print(cleaned_text)
+        if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
            start_page = i
-            continue
-        if start_page is not None and re.search(end_pattern, cleaned_text):
+        if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
            end_page = i
            break
+    return start_page, end_page

-    return start_page, end_page if end_page else total_pages - 1
-
-
+def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
+    """
+    Second pass: retry with looser start patterns (pages after index 10 only) and save the range.
+    """
+    reader = PdfReader(pdf_path)
+    last_index = len(reader.pages) - 1  # index of the final page, used as the default end
+    min_start_index = 10
+    first, last = None, None
+    skip_re = re.compile(r'文件的构成|文件的组成|目录|评标办法|技术标准|其他材料|合同')  # such pages are ignored
+    stop_re = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
+    start_res = (
+        re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', re.MULTILINE),
+        re.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*"),
+    )
+    for start_re in start_res:
+        for idx, page in enumerate(reader.pages):
+            body = clean_page_content(page.extract_text() or "", common_header)
+            if skip_re.search(body):
+                continue
+            # NOTE(review): a later matching page overwrites the start until an end page is found — confirm intended.
+            if idx > min_start_index and start_re.search(body):
+                first = idx
+            if first is not None and idx > first and stop_re.search(body):
+                last = idx
+                break
+        if first is not None:
+            break
+    if first is None:
+        print(f"{output_suffix}: 未找到起始页，提取失败！ 文件: {pdf_path}")
+        return ""
+    if last is None:
+        print(f"{output_suffix}: 未找到结束页，默认截取到文件末尾。 文件: {pdf_path}")
+        last = last_index
+    return save_extracted_pages(reader, first, last, pdf_path, output_folder, output_suffix)
 def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    根据开始和结束模式提取PDF页面。
@ -119,31 +156,20 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
+        total_pages = len(pdf_document.pages) - 1  # 获取总页数

-        start_page, end_page = find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
+        # 提取起始页和结束页
+        start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)

-        if output_suffix == "format" and start_page is None:
-            print(f"{output_suffix}: 未找到起始页，尝试二次提取！ 文件: {pdf_path}")
-            # 二次提取逻辑，保留原有功能
-            start_page, end_page = find_page_indices(
-                pdf_document,
-                re.compile(r'^(?:响应|投标).*?格式.*', re.MULTILINE),
-                end_pattern,
-                begin_page,
-                common_header
-            )
+        if output_suffix == "format":
            if start_page is None:
-                print(f"{output_suffix}: 未找到起始页，提取失败！ 文件: {pdf_path}")
-                return ""
-
-        if start_page is None:
-            print(f"未找到起始页，提取失败！ 文件: {pdf_path}")
-            return ""
-
-        if end_page is None:
-            end_page = len(pdf_document.pages) - 1
-            print(f"{output_suffix}: 未找到结束页，默认截取到文件末尾。 文件: {pdf_path}")
-
+                print(f"{output_suffix}: 未找到起始页，提取失败！ 文件: {pdf_path} 尝试二次提取！")
+                return extract_pages_twice(pdf_path,output_folder,output_suffix,common_header)
+            if end_page is None:
+                # 如果未匹配到结束页，默认截取到文件末尾
+                end_page = total_pages
+                print(f"{output_suffix}: 未找到结束页，默认截取到文件末尾。 文件: {pdf_path}")
+            return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
@ -156,7 +182,8 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
    """
    # pdf_path = convert_to_pdf(file_path)
    pdf_path=file_path
-    return extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or ""
+    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
+    return result or ""


 def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
@ -172,47 +199,52 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
            file_path = os.path.join(input_path, file_name)
            if is_pdf_or_doc(file_path):
                result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
-                generated_files.append(result)
+                if isinstance(result, tuple):
+                    generated_files.extend([f if f else "" for f in result])  # 保留空字符串
+                else:
+                    generated_files.append(result)  # 直接添加result，可能是空字符串
    elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
        result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
-        generated_files.append(result)
+        if isinstance(result, tuple):
+            generated_files.extend([f if f else "" for f in result])  # 保留空字符串
+        else:
+            generated_files.append(result)  # 直接添加result，可能是空字符串
    else:
-        print("提供的路径既不是文件夹也不是PDF/Word文件。")
-    return [f for f in generated_files if f]  # 过滤空字符串
+        print("提供的路径既不是文件夹也不是PDF文件。")
+
+    return generated_files


-def truncate_pdf_main(input_path, output_folder, selection):
+def truncate_pdf_main(input_path, output_folder, selection,begin_page=10):
    """
    主函数，根据选择截取PDF内容。
    """
    try:
        if selection == 1:  # 投标文件格式
-            begin_page = 10
            begin_pattern = re.compile(
-                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
+                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*'
            )
            end_pattern = re.compile(
-                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
-                re.MULTILINE
+                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
            )
            local_output_suffix = "format"
        else:
            print("无效的选择:请选择1")
-            return []
+            return None

        # 调用相应的处理函数
-        return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix)
+        return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix) or ""
    except Exception as e:
        print(f"Error in truncate_pdf_main: {e}")
-        return []
+        return ""  # 返回空字符串


 if __name__ == "__main__":
    # 定义输入和输出路径
-    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
-    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
+    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(2).pdf"
+    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest12.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
    selection = 1  # 1 - 投标文件格式
    # 执行截取
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    # print("生成的文件:", generated_files)
+    print("生成的文件:", generated_files)