1.10

2025-01-10 17:38:55 +08:00 · 2025-01-10 17:38:55 +08:00 · 4617595c82
commit 4617595c82
parent 06a3c2bbaf
8 changed files with 72 additions and 84 deletions
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@ -161,7 +161,7 @@ def get_invalid_file(file_path, output_folder, common_header,begin_page):
    # 定义排除模式
    exclusion_pattern = regex.compile(
-        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制',
+        r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制',
        regex.MULTILINE
    )
@ -294,29 +294,16 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                    break
    return start_page, end_page
-def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
+def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header):
    """
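    Locate the start, middle and end pages of the tobidders_notice section in a PDF and save them as two parts.
    If the first attempt fails, extraction is retried: a second attempt via extract_pages_twice_tobidders_notice,
    then a third attempt with a new begin_pattern and end_pattern.
    Args:
        pdf_path: Path of the PDF file to process.
        output_folder: Output folder handed to save_extracted_pages.
        begin_pattern (str or regex.Pattern): Regular expression used to identify the start.
        begin_page (int): Page number from which the search begins.
        common_header (str): Common header text to be cleaned from every page.
        (exclusion_pattern is no longer a parameter; it is compiled inside the function.)
    Returns:
        tuple: (path1, path2) on success (path2 is path1 when mid_page == end_page), otherwise ("", "")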
    """
-
+    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
    def run_extraction(local_begin_pattern, local_end_pattern=None):
        """
        使用提供的 begin 和 end 模式运行提取过程。
        如果未提供 local_end_pattern，则根据匹配的章节类型动态生成 end_pattern。
        参数:
            local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
            local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。
@ -427,32 +414,46 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
        return start_page, mid_page, end_page
    def perform_extraction_and_save(start_page, mid_page, end_page):
        path1 = save_extracted_pages(pdf_path,output_folder,start_page,mid_page,"tobidders_notice_part1",common_header)
        if mid_page != end_page:
            path2 = save_extracted_pages(pdf_path,output_folder,mid_page,end_page,"tobidders_notice_part2",common_header)
        else:
            path2 = path1
        return path1, path2
    # 第一次提取尝试，使用初始的 begin_pattern
    start_page, mid_page, end_page = run_extraction(begin_pattern)
    # 如果第一次提取成功，保存并返回路径
    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)
-    # 如果第一次提取失败，则使用新的 begin_pattern 和 end_pattern 重新提取
+    print(f"第一次提取tobidders_notice失败，尝试第二次提取: {pdf_path}")
-    if not (start_page and mid_page and end_page):
+    pdf_document = PdfReader(pdf_path)
-        print(f"第二次尝试 tobidders_notice!")
+    start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
        pdf_document = PdfReader(pdf_path)
        start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
        if start_page and end_page and mid_page:
            return start_page, mid_page, end_page
        else:
            # 定义新的 begin_pattern 和 end_pattern
            new_begin_pattern = regex.compile(
                r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
                r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
                regex.MULTILINE
            )
            new_end_pattern = regex.compile(
                r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
                regex.MULTILINE
            )
            print("第三次尝试 tobidders_notice! ")
            # 第二次提取尝试，使用新的模式
            start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
-    return start_page, mid_page, end_page
+    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)
    # 如果第二次提取失败，定义新的 begin_pattern 和 end_pattern 进行第三次尝试
    new_begin_pattern = regex.compile(
        r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
        r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
        regex.MULTILINE
    )
    new_end_pattern = regex.compile(
        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
        regex.MULTILINE
    )
    print("第二次提取tobidders_notice失败，尝试第三次提取!")
    start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)
    else:
        # 所有提取尝试均失败
        print(f"三次提取tobidders_notice均失败!! {pdf_path}")
        return "", ""
 def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -118,7 +118,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
 if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    file_path=r"C:\Users\Administrator\Downloads\2024-福建-2024年度防汛抢险物资储备.pdf"
+    file_path=r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf'
    # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
--- a/flask_app/routes/judge_zbfile.py
+++ b/flask_app/routes/judge_zbfile.py
@ -34,6 +34,7 @@ def judge_zbfile() -> Any:
        try:
            start_time = time.time()
            downloaded_filename = os.path.join(output_folder, "ztbfile")
            logger.info(f"接收到的url:{file_url}")
            downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
            if not downloaded_filepath or file_type == 4:
--- a/flask_app/routes/判断是否是招标文件.py
+++ b/flask_app/routes/判断是否是招标文件.py
@ -44,7 +44,7 @@ def judge_zbfile_exec(file_path):
 if __name__ == '__main__':
    start_time = time.time()
-    pdf_path = r"C:/Users/Administrator/Downloads/094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件 - 副本.pdf"
+    pdf_path = r"C:\Users\Administrator\Downloads\商务标null1736476867734.docx"
    res = judge_zbfile_exec(pdf_path)
    if res:
        print("yes")
--- a/flask_app/routes/货物标解析main.py
+++ b/flask_app/routes/货物标解析main.py
@ -283,8 +283,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
 #   2.废标项这边，考虑大模型+正则并用
 #   废标项，增加对表格的提取+排除重复项，按顺序处理
 #   考虑将工程标和货物标的 投标人须知那块逻辑结合
-#   C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472 这里的投标文件要求有点问题
+#   D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3排查一下
 #   解决禅道 测试的bug
 #   截取：对应 按照 根据
 #   评分点
 if __name__ == "__main__":
    # 配置日志器
--- a/flask_app/工程标/截取pdf工程标版.py
+++ b/flask_app/工程标/截取pdf工程标版.py
@ -13,7 +13,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
                                   is_secondary_match):
    pdf_document = PdfReader(pdf_path)
    exclusion_pattern = regex.compile(
-        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
+        r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
    def run_extraction():
        start_page = None
        mid_page = None
@ -134,7 +134,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
    end_page = None
    flag = True
    exclusion_pattern = regex.compile(
-        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
+        r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
    # 遍历文档的每一页，查找开始和结束短语的位置
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -12,36 +12,18 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        exclusion_pattern = None
-
+        # 原有的处理逻辑保持不变
-        if output_suffix == "tobidders_notice":
+        if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
            exclusion_pattern = regex.compile(
                r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
-            start_page, mid_page, end_page = extract_pages_tobidders_notice(
+        start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
            )
            if not start_page or not mid_page or not end_page:
                print(f"三次提取tobidders_notice均失败!!{pdf_path}")
                return "", ""
            path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header)
            if mid_page != end_page:
                path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header)
            else:
                path2=path1
            return path1, path2
        else:
            # 原有的处理逻辑保持不变
            if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
                exclusion_pattern = regex.compile(
                    r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
            start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                                                         common_header, exclusion_pattern, output_suffix)
-            if start_page is None or end_page is None:
+        if start_page is None or end_page is None:
-                print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
+            print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
-                return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger)
+            return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger)
-            elif output_suffix == "qualification1":
+        elif output_suffix == "qualification1":
-                truncate_pdf_main_goods(pdf_path, output_folder, 2, logger,"qualification3")  # 合并'资格审查'章节和'评标办法'章节
+            truncate_pdf_main_goods(pdf_path, output_folder, 2, logger,"qualification3")  # 合并'资格审查'章节和'评标办法'章节
-            return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header)
+        return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
@ -163,7 +145,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
 def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
    try:
        exclusion_pattern = regex.compile(
-            r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
+            r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
        pdf_document = PdfReader(pdf_path)
        patterns = None
        start_page = None
@ -298,16 +280,18 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                if output_suffix == "default":
                    output_suffix = local_output_suffix
-                # 将原先的 process_files 逻辑合并到此处
+                if selection==4:
-                result = extract_pages(
+                    result = extract_pages_tobidders_notice(pdf_path,output_folder,begin_pattern,begin_page,common_header)
-                    pdf_path,
+                else:
-                    output_folder,
+                    result = extract_pages(
-                    begin_pattern,
+                        pdf_path,
-                    begin_page,
+                        output_folder,
-                    end_pattern,
+                        begin_pattern,
-                    output_suffix,
+                        begin_page,
-                    logger
+                        end_pattern,
-                )
+                        output_suffix,
                        logger
                    )
                # 根据提取结果以及不同的 output_suffix 进行处理
                if result:
                    if output_suffix == "tobidders_notice":
@ -363,10 +347,10 @@ if __name__ == "__main__":
    logger = get_global_logger("123")
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Downloads\招标文件 (4).pdf"
+    pdf_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"D:\flask_project\flask_app\static\output\output1\test"
+    output_folder = r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
    selection = 4  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 6-invalid_path
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -297,7 +297,7 @@ def generate_prompt(judge_res, full_text=None):
 **输出格式**：
 1.JSON格式，外层键名为需要采购的货物、设备或系统名称(一级键)。
 2.层次关系用嵌套键值对表示:
-    -采购活动可能将目标划分为多个系统或货物。若文档通过大标题或表格层次或序号标明这种归属关系，请在JSON中以嵌套形式表示：
+    -采购活动可能将目标划分为多个系统或货物。若文档通过大标题或表格结构或序号标明这种归属关系，请在JSON中以嵌套形式表示：
        -系统作为一级键，包含的货物作为二级键。
        -例子:"""
             1. 激光扫描系统：xxx