11.5货物标截取优化

This commit is contained in:
zy123 2024-11-05 16:29:32 +08:00
parent d0e7f060c8
commit ccab078ac3
5 changed files with 208 additions and 94 deletions

View File

@ -184,9 +184,9 @@ if __name__ == '__main__':
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\6.2定版视频会议磋商文件.doc"
# downloaded_file=doc2docx(local_path_in)
downloaded_file=doc2docx(local_path_in)
# downloaded_file=pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in)
# downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)

View File

@ -95,7 +95,7 @@ def extract_text_by_page(file_path):
if __name__ == '__main__':
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'

View File

@ -16,8 +16,6 @@ def get_global_logger(unique_id):
logger = None
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header):
try:
# 获取文件基本名称
@ -238,7 +236,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])'
end_pattern = re.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*$|'
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知',
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知',
re.MULTILINE
)
start_page = None

View File

@ -110,12 +110,14 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
if output_suffix == "tobidders_notice":
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
start_page, mid_page, end_page = extract_pages_tobidders_notice(
pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
)
if start_page is None or end_page is None or mid_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header)
if not start_page or not mid_page or not end_page:
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
return "",""
# if start_page is None or end_page is None or mid_page is None:
# print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
# return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder, "tobidders_notice_part1")
path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder, "tobidders_notice_part2")
@ -138,7 +140,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
if start_page is None or end_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page)
elif output_suffix == "qualification1":
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
@ -175,28 +177,33 @@ def get_patterns_for_evaluation_method():
return begin_pattern, end_pattern
# def get_patterns_for_qualification():
# # # 原始匹配逻辑
# # begin_pattern_original = re.compile(
# # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
# # end_pattern_original = re.compile(
# # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
#
# # 新匹配逻辑
# begin_pattern_new = re.compile(
# r'^资格性检查', re.MULTILINE)
# end_pattern_new = re.compile(
# r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
#
# return begin_pattern_new, end_pattern_new
def get_patterns_for_qualification():
    """Return (begin_pattern, end_pattern) regexes locating the '资格性检查' section.

    The start is a line beginning with '资格性检查'; the section ends at the
    first attachment heading ('附件 N') or the next chapter/part heading
    ('第X章/部分 <title>').

    Returns:
        tuple[re.Pattern, re.Pattern]: compiled MULTILINE patterns
        (section start, section end).
    """
    # Start of the qualification-check section: the heading line itself.
    begin_pattern_new = re.compile(
        r'^资格性检查', re.MULTILINE)
    # End at an attachment heading or the next chapter/part heading.
    end_pattern_new = re.compile(
        r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
    return begin_pattern_new, end_pattern_new
def get_patterns_for_qualification2():
    """Return (begin_pattern, end_pattern) regexes for qualification-review tables.

    The begin pattern matches attachment/appendix headings (colon now optional,
    so e.g. '附录3' matches) as well as explicit review-table headings
    ('资格性检查', '资格审查表', '符合性审查表'). The end pattern matches the
    next attachment heading that is NOT about qualification/conformity, or a
    loosely-formatted chapter/part heading.

    Returns:
        tuple[re.Pattern, re.Pattern]: compiled MULTILINE patterns.
    """
    # NOTE: the rendered diff left both the pre- and post-change regex literals
    # in place (two positional string args to re.compile); this keeps only the
    # post-change ("new side") patterns.
    begin_pattern = re.compile(
        r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?|资格性检查|资格审查表|符合性审查表)',  # colon optional so bare '附录3' matches
        re.MULTILINE
    )
    end_pattern = re.compile(
        r'^(?!.*(?:资格|符合))(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$|'
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',  # looser chapter-heading match
        re.MULTILINE
    )
    return begin_pattern, end_pattern
@ -239,7 +246,7 @@ def get_patterns_for_notice_twice():
# break
# return start_page, mid_page, end_page
def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern):
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
"""
从PDF文档中提取起始页中间页和结束页
@ -274,7 +281,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
end_page = None
chapter_type = None # 用于存储“章”或“部分”
combined_mid_pattern = None # 中间页的组合模式
pdf_document = PdfReader(pdf_path)
for i, page in enumerate(pdf_document.pages):
text = page.extract_text() or ""
cleaned_text = clean_page_content(text, common_header)
@ -345,8 +352,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
)
else:
# 如果提供了固定的 end_pattern则使用默认的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则)\s*$' #可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
combined_mid_pattern = re.compile(
rf'{base_mid_pattern}',
re.MULTILINE
@ -355,6 +361,8 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
# 识别中间页
if start_page is not None and mid_page is None and combined_mid_pattern:
if (start_page+1==i) and re.search(local_begin_pattern,cleaned_text):
continue
if re.search(combined_mid_pattern, cleaned_text):
mid_page = i
@ -374,23 +382,83 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
if not (start_page and mid_page and end_page):
# 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = re.compile(
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
re.MULTILINE
)
new_end_pattern = re.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$',
re.MULTILINE
)
# 第二次提取尝试,使用新的模式
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
print(f"第二次尝试 tobidders_notice!{pdf_path}")
pdf_document = PdfReader(pdf_path)
start_page,mid_page,end_page=extract_pages_twice_tobidders_notice(pdf_document,common_header,begin_page)
if start_page and end_page and mid_page:
return start_page, mid_page, end_page
else:
# 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = re.compile(
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
re.MULTILINE
)
new_end_pattern = re.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$',
re.MULTILINE
)
print("第三次尝试 tobidders_notice! ")
# 第二次提取尝试,使用新的模式
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
return start_page, mid_page, end_page
def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header):
#投标人须知分为两个章节
# def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page):
# begin_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
# )
# end_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分
# )
# exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
#
# pdf_document = PdfReader(pdf_path)
# exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
#
# # 提取第一部分
# start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
# if start_page1 is None or end_page1 is None:
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
# return "", ""
#
# # 保存第一部分的路径
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
# "tobidders_notice_part1")
#
# # 提取第二部分
# start_page2 = end_page1
#
# # 检查end_page1页面的内容
# text = pdf_document.pages[end_page1].extract_text() or ""
# cleaned_text = clean_page_content(text, common_header)
# match = end_pattern.search(cleaned_text)
#
# if match:
# # 获取匹配到的中文部分
# chapter_title = match.group(1)
# # 检查是否包含排除关键词
# if any(word in chapter_title for word in exclusion_words):
# # 如果包含排除关键词,直接返回相同的路径
# return path1, path1
#
# # 如果不包含排除关键词,继续提取第二部分
# _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
# exclusion_pattern)
#
# if end_page2 is None:
# print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
# return path1, path1
#
# # 保存第二部分的路径
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
# "tobidders_notice_part2")
#
# return path1, path2
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
)
@ -399,18 +467,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
)
exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
pdf_document = PdfReader(pdf_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
# 提取第一部分
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header)
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
if start_page1 is None or end_page1 is None:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return "", ""
return "", "",""
# 保存第一部分的路径
path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
"tobidders_notice_part1")
# # 保存第一部分的路径
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
# "tobidders_notice_part1")
# 提取第二部分
start_page2 = end_page1
@ -426,52 +492,71 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
# 检查是否包含排除关键词
if any(word in chapter_title for word in exclusion_words):
# 如果包含排除关键词,直接返回相同的路径
return path1, path1
return start_page1, end_page1,end_page1
# 如果不包含排除关键词,继续提取第二部分
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
exclusion_pattern)
if end_page2 is None:
print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
return path1, path1
return start_page1, end_page1,end_page1
# 保存第二部分的路径
path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
"tobidders_notice_part2")
# # 保存第二部分的路径
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
# "tobidders_notice_part2")
return path1, path2
return start_page1, end_page1,end_page2
def extract_pages_qualification(pdf_document, begin_page, begin_pattern, end_pattern, common_header):
    """Locate the start and end pages of the qualification-review section.

    Scans pages from ``begin_page`` onward. A page qualifies as the start page
    when it contains at least one inclusion keyword, none of the exclusion
    keywords, and matches ``begin_pattern``. The end page is the first LATER
    page that matches ``end_pattern``.

    Args:
        pdf_document: an open PdfReader-like object exposing ``.pages`` whose
            elements provide ``extract_text()``.
        begin_page (int): index from which to start scanning.
        begin_pattern (re.Pattern): section-start heading pattern.
        end_pattern (re.Pattern): section-end heading pattern.
        common_header (str): repeated page header stripped by clean_page_content.

    Returns:
        tuple[int | None, int | None]: (start_page, end_page); either may be
        None when not found.
    """
    start_page = None
    end_page = None
    include_keywords = ["资格审查", "资质审查", "符合性审查"]
    exclude_keywords = ["声明函", "承诺函"]
    # Only examine pages at or after begin_page (the chapter start position).
    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_content(text, common_header)
            # Start page: must contain a review keyword, none of the
            # boilerplate exclusions, and match the heading pattern.
            # (Removed a leftover debug print that fired on every candidate.)
            if (
                any(keyword in cleaned_text for keyword in include_keywords) and
                all(keyword not in cleaned_text for keyword in exclude_keywords) and
                start_page is None
            ):
                if begin_pattern.search(cleaned_text):
                    start_page = i
            # End page: first page strictly after start_page matching end_pattern.
            if start_page is not None and end_pattern.search(cleaned_text):
                if i > start_page:
                    end_page = i
                    break  # section bounded; stop scanning
    return start_page, end_page
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
try:
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
pdf_document = PdfReader(pdf_path)
patterns = None
begin_page = 0
start_page = None
end_page = None
if output_suffix == "procurement":
patterns = [get_patterns_for_procurement()]
begin_page = 5
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
patterns = [get_patterns_for_evaluation_method()]
begin_page = 5
elif output_suffix == "qualification1":
patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()]
begin_page = 5
patterns = [get_patterns_for_qualification()]
elif output_suffix == "notice":
patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
begin_page = 0
if patterns:
for pattern_pair in patterns:
# print(pattern_pair[0])
# print(pattern_pair[1])
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header,
exclusion_pattern, output_suffix)
if output_suffix=="qualification1":
start_page,end_page=extract_pages_qualification(pdf_document,begin_page,pattern_pair[0],pattern_pair[1],common_header)
else:
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header,exclusion_pattern, output_suffix)
if start_page is not None and end_page is not None:
break
@ -544,7 +629,7 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
def get_start_and_common_header(input_path):
common_header = extract_common_header(input_path)
last_begin_index = 0
begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请)[\)]?\s*$',re.MULTILINE)
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',re.MULTILINE)
pdf_document = PdfReader(input_path)
for i, page in enumerate(pdf_document.pages):
if i > 10:
@ -604,8 +689,8 @@ def process_input(input_path, output_folder, selection, output_suffix):
# 根据选择设置对应的模式和结束模式
if selection == 1:
begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE)
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE)
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
local_output_suffix = "notice"
elif selection == 2:
begin_pattern = re.compile(
@ -737,9 +822,10 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
return truncate_files
#TODO:截取还有问题
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标 唐山投标只有正文,没有附表
#ztbfile.pdf jiao通 广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_qualification2.pdf 唐山 包头
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
@ -752,6 +838,6 @@ if __name__ == "__main__":
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files)
selection = 1# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
selection = 4# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)

View File

@ -2,12 +2,12 @@
import json
import os
import re
from flask_app.general.通义千问long import upload_file
from flask_app.general.通义千问long import upload_file, qianwen_long
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.json_utils import clean_json_string
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
import copy
import concurrent.futures
# 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除重新组织成一个字典格式的数据你可以考虑用字符串列表来保持部分平级的数据
# 对于同级的键,如果数量>1且键名都统一那么将键名去掉用列表保持它们的键值
@ -379,23 +379,53 @@ def combine_qualification_review(invalid_path, output_folder, qualification_path
}
}
def process_file(file_path, invalid_path):
    """Extract qualification-review ('资格性审查') and conformity-review
    ('符合性审查') criteria from a bid document via the qianwen-long model.

    First asks a yes/no probe question against ``file_path``; for any review
    type the probe reports absent (''), the detailed query falls back to
    ``invalid_path`` (uploaded lazily, at most once). The two detailed queries
    run in parallel and their cleaned JSON results are merged.

    Args:
        file_path (str): primary document to upload and query.
        invalid_path (str): fallback document used when the probe says a
            review section is missing from the primary document.

    Returns:
        dict: merged, post-processed review criteria.
    """
    file_id = upload_file(file_path)
    first_query = """
该文档中是否有关于资格性审查标准的具体内容,是否有关于符合性审查标准的具体内容?请以json格式给出回答,外键分别为'资格性审查''符合性审查',键值仅限于'','',输出格式示例如下:
{
    "资格性审查":"",
    "符合性审查":""
}
    """
    # Probe answer: maps each review key to '是'/'否' (presence in file_path).
    qianwen_ans = clean_json_string(qianwen_long(file_id, first_query))
    user_queries = [
        {
            "key": "资格性审查",
            "query": "该招标文件中规定的资格性审查标准是怎样的请以json格式给出外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。"
        },
        {
            "key": "符合性审查",
            "query": "该招标文件中规定的符合性审查标准是怎样的请以json格式给出外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
        }
    ]
    combined_res = {}
    file_id2 = None  # invalid_path is uploaded lazily, only if a fallback is needed

    def process_single_query(query_info):
        """Run one detailed query against the appropriate file and post-process it."""
        nonlocal file_id2
        key = query_info["key"]
        query = query_info["query"]
        # Probe said this section is absent from file_path -> query the fallback.
        # NOTE(review): file_id2 assignment is not lock-protected; with two
        # workers both needing the fallback, invalid_path could be uploaded
        # twice. Harmless but worth confirming upstream.
        if qianwen_ans.get(key) == "":
            if not file_id2:
                file_id2 = upload_file(invalid_path)
            current_file_id = file_id2
        else:
            current_file_id = file_id
        ans = qianwen_long(current_file_id, query)
        cleaned_data = clean_json_string(ans)
        return process_dict(preprocess_dict(cleaned_data))

    # Run both detailed queries in parallel and merge results as they finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(process_single_query, q) for q in user_queries]
        for future in concurrent.futures.as_completed(futures):
            combined_res.update(future.result())
    return combined_res
try: