11.5货物标截取优化

2024-11-05 16:57:04 +08:00 · 2024-11-05 16:57:04 +08:00 · f9f17757f3
commit f9f17757f3
parent ccab078ac3
1 changed file with 7 additions and 15 deletions
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -474,10 +474,6 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
    if start_page1 is None or end_page1 is None:
        return "", "",""

-    # # 保存第一部分的路径
-    # path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
-    #                              "tobidders_notice_part1")
-
    # 提取第二部分
    start_page2 = end_page1

@@ -501,10 +497,6 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
    if end_page2 is None:
        return start_page1, end_page1,end_page1

-    # # 保存第二部分的路径
-    # path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
-    #                              "tobidders_notice_part2")
-
    return start_page1, end_page1,end_page2

 def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_pattern, common_header):
@@ -524,7 +516,7 @@ def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_patter
                    start_page is None
            ):
                if begin_pattern.search(cleaned_text):
-                    print("附件")
+                    print("第二次尝试:匹配附件")
                    start_page = i
            # 确定结束页
            if start_page is not None and end_pattern.search(cleaned_text):
@@ -562,8 +554,8 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be

        if start_page is None or end_page is None:
            if output_suffix == "qualification1":
-                print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！")
-                print("third:尝试提取评分办法章节...")
+                # print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！")
+                print("第三次尝试资格审查:尝试提取评分办法章节...")
                temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
                if len(temp) > 0:
                    return temp[0]
@@ -825,9 +817,9 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1

 # TODO:交通智能系统和招标(1)(1)文件有问题  包头 绍兴    资格审查文件可能不需要默认与"evaluation"同一章  无效投标可能也要考虑 “more”的情况，类似工程标   唐山投标只有正文，没有附表

-#ztbfile.pdf jiao通   广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）_qualification2.pdf   唐山   包头
+#ztbfile.pdf少资格评审  包头少符合性评审
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
+    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
@@ -838,6 +830,6 @@ if __name__ == "__main__":
    # selections = [1,4]
    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
    # print(files)
-    selection = 4# 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
+    selection = 3# 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    # print(generated_files)
+    print(generated_files)