From 94b945e6b131148f63810f6623f7de8ce6ac58f2 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Fri, 17 Jan 2025 15:45:53 +0800
Subject: [PATCH] =?UTF-8?q?1.17=20=E5=B0=8Fbug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/model_continue_query.py |  3 +-
 flask_app/general/多线程提问.py           | 10 ++--
 flask_app/general/截取pdf_main.py         |  6 +--
 flask_app/general/截取pdf通用函数.py      | 57 ++++++++++++-----------
 flask_app/general/无效标和废标公共代码.py | 10 ++--
 flask_app/货物标/截取pdf货物标版.py       |  7 ++-
 flask_app/货物标/技术参数要求提取.py      | 10 ++--
 7 files changed, 56 insertions(+), 47 deletions(-)

diff --git a/flask_app/general/model_continue_query.py b/flask_app/general/model_continue_query.py
index 769add4..e0df96e 100644
--- a/flask_app/general/model_continue_query.py
+++ b/flask_app/general/model_continue_query.py
@@ -48,7 +48,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)
 
     # 根据模型类型选择调用的模型
     if model_type == 1:  # 使用 doubao_model
-        model_res = doubao_model(continue_query)
+        # model_res = doubao_model(continue_query)
+        model_res=qianwen_plus(continue_query)
     elif model_type == 2:  # 使用 qianwen_long
         if file_id is None:
             raise ValueError("file_id 必须在使用 qianwen_long 模型时提供！")
diff --git a/flask_app/general/多线程提问.py b/flask_app/general/多线程提问.py
index 5417c27..e10cb36 100644
--- a/flask_app/general/多线程提问.py
+++ b/flask_app/general/多线程提问.py
@@ -13,7 +13,8 @@ from dashscope import Assistants, Messages, Runs, Threads
 from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
 
 from flask_app.general.doubao import doubao_model
-from flask_app.general.通义千问long import qianwen_long, upload_file
+from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
+
 prompt = """
 # 角色
 你是一个文档处理专家，专门负责理解和操作基于特定内容的文档任务，这包括解析、总结、搜索或生成与给定文档相关的各类信息。
@@ -165,11 +166,12 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
                 result_queue.put((ans_index, (question, qianwen_res)))
         elif llm_type==3:
             # print(f"doubao! question:{question}")
-            doubao_res=doubao_model(question,need_extra)
-            if not doubao_res:
+            # doubao_res=doubao_model(question,need_extra)
+            qianwen_plus_res=qianwen_plus(question,need_extra)
+            if not qianwen_plus_res:
                 result_queue.put((ans_index, None))  # 如果为空字符串，直接返回 None
             else:
-                result_queue.put((ans_index, (question, doubao_res)))
+                result_queue.put((ans_index, (question, qianwen_plus_res)))
         else :
             assistant = pure_assistant()
             ans = send_message(assistant, message=question)
diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py
index 201ef18..11442a7 100644
--- a/flask_app/general/截取pdf_main.py
+++ b/flask_app/general/截取pdf_main.py
@@ -119,14 +119,14 @@ if __name__ == "__main__":
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
     # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
     # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
-    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\“平安城市”三期暨雪亮工程一体化 建设项目.pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
     # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\招标文件\output33"
+    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)"
     # selections = [1, 4]  # 仅处理 selection 4、1
     # selections = [1, 2, 3, 5]
     # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)   #engineering
-    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
+    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
     print(files)
     # print(files[-1])
     # print(files[-2])
diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py
index 9ef489b..361817f 100644
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@@ -118,11 +118,9 @@ def convert_to_pdf(file_path):
 def get_invalid_file(file_path, output_folder, common_header, begin_page):
     """
     从指定的PDF文件中提取无效部分并保存到输出文件夹中。
-    begin_pattern从前往后匹配  end_pattern从后往前
+    begin_pattern匹配开头  end_pattern匹配结尾
     页数小于100不进行begin_pattern，默认start_page=0
     页数大于200时end_pattern从前往后匹配
-
-
     Args:
         file_path (str): 输入的PDF文件路径。
         output_folder (str): 提取后文件的输出文件夹路径。
@@ -134,29 +132,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
     # 定义开始模式
     begin_patterns = [
         regex.compile(
-            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
+            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$',
             regex.MULTILINE
         ),
         regex.compile(
-            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
+            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
             regex.MULTILINE
         )
     ]
-
     # 定义结束模式
     end_patterns = [
         regex.compile(
+            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|'
             r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[：:]清标报告',
             regex.MULTILINE
         ),
-        regex.compile(
-            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*',
-            regex.MULTILINE
-        ),
         regex.compile(
             r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
             regex.MULTILINE
-        )
+        ),
     ]
 
     # 定义排除模式
@@ -195,10 +189,14 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
         # 定义查找结束页的函数
         def find_end_page():
             if total_pages > 200:
-                # print("总页数大于200页，结束页从前往后查找，跳过前30页。") #防止匹配到目录这块
+                # print("总页数大于200页，结束页从前往后查找，跳过前30页。") #且先匹配合同
                 for pattern in end_patterns:
                     for i in range(30, total_pages):
                         text = page_texts[i]
+                        exclusion_pattern = regex.compile(
+                            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
+                        if regex.search(exclusion_pattern,text):
+                            continue
                         if regex.search(pattern, text):
                             # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
                             return i
@@ -232,9 +230,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
                 future_end = executor.submit(find_end_page)
                 start_page = future_start.result()
                 end_page = future_end.result()
-
         # print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
-
         # 验证页码范围
         if start_page > end_page:
             print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
@@ -356,6 +352,7 @@ def generate_end_pattern(extraction_stage, chapter_type=None):
             r'|' + common_patterns
     )
 
+
     end_pattern_template_third = (
             r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()（）/]+\s*$'
             r'|' + common_patterns
@@ -477,10 +474,17 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
         return perform_extraction_and_save(start_page, mid_page, end_page)
 
     print(f"第一次提取tobidders_notice失败，尝试第二次提取: {pdf_path}")
-    start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
+    start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
+
+    if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
+        if end_page1==start_page2:
+            mid_page=end_page1
+            return perform_extraction_and_save(start_page1, mid_page, end_page2)
+        else:
+            path1=save_extracted_pages(pdf_path,output_folder,start_page1,end_page1,"tobidders_notice_part1",common_header)
+            path2=save_extracted_pages(pdf_path,output_folder,start_page2,end_page2,"tobidders_notice_part2",common_header)
+            return path1,path2
 
-    if start_page is not None and mid_page is not None and end_page is not None:
-        return perform_extraction_and_save(start_page, mid_page, end_page)
     print("第二次提取tobidders_notice失败，尝试第三次提取!")
     new_begin_pattern = regex.compile(
         r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
@@ -503,9 +507,10 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
         r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
         regex.MULTILINE
     )
-    end_pattern = regex.compile(
+    second_begin_pattern = regex.compile(
         r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE  # 捕获中文部分
     )
+    end_pattern = second_begin_pattern
     exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]  # 在这里添加需要排除的关键词
 
     exclusion_pattern = regex.compile(
@@ -515,10 +520,9 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
     start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
                                                    exclusion_pattern, output_suffix)
     if start_page1 is None or end_page1 is None:
-        return None, None, None
+        return None, None, None,None
 
     # 提取第二部分
-    start_page2 = end_page1
 
     # 检查end_page1页面的内容
     text = pdf_document.pages[end_page1].extract_text() or ""
@@ -531,19 +535,20 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
         # 检查是否包含排除关键词
         if any(word in chapter_title for word in exclusion_words):
             # 如果包含排除关键词，直接返回相同的路径
-            return start_page1, end_page1, end_page1
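+            second_begin_pattern=begin_pattern     # the chapter title found on page end_page1 hits an exclusion keyword, i.e. the bidder-notice front table and the bidder-notice body are not adjacent; do not return early, search for the next notice heading with begin_pattern instead. e.g. （发布稿）恒丰理财有限责任公司智能投研终端项目竞争性磋商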
 
     # 如果不包含排除关键词，继续提取第二部分
     output_suffix='tobidders_notice2'
-    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
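+    begin_page=end_page1-1   # pass end_page1-1 rather than end_page1: non-notice extraction only accepts start_page > begin_page, so this keeps page end_page1 eligible as the start of part 2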
+    start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header,
                                          exclusion_pattern, output_suffix)
 
-    if end_page2 is None:
-        return start_page1, end_page1, None
-
-    return start_page1, end_page1, end_page2
+    if start_page2 is None or end_page2 is None:
+        return start_page1, end_page1, end_page1,end_page1
 
+    return start_page1, end_page1, start_page2,end_page2
 
+
 def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
     begin_pattern = regex.compile(
         r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$',
diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py
index da8131d..476a824 100644
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -651,13 +651,13 @@ if __name__ == '__main__':
     # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
     # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
     # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
-    pdf_path = r'C:\Users\Administrator\Desktop\货物\test5\磋商采购文件-恩施市森林火灾风险普查样品检测服务.pdf'
+    pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件（N510113202400010820250102001）.pdf'
 
-    output_dir = r"C:\Users\Administrator\Desktop\货物\test5"
+    output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
     # invalid_added = insert_mark(pdf_path)
-    # invalid_added_docx = pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Downloads\invalid_added.docx'
+    # # invalid_added_docx = pdf2docx(invalid_added)
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
-    # print("Elapsed time:", str(end_time - start_time))
\ No newline at end of file
+    # # print("Elapsed time:", str(end_time - start_time))
\ No newline at end of file
diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py
index 7484156..7003453 100644
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -240,7 +240,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                 elif selection == 5:
                     begin_pattern = regex.compile(
                         r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
-                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求)).*|'
+                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|(招标|项目)(?:内容|要求|需求)).*|'
                         r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
                     )
                     end_pattern = regex.compile(
@@ -315,12 +315,11 @@ if __name__ == "__main__":
     logger = get_global_logger("123")
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
     # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\（发布稿）恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
     # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output4444"
+    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
     # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
     selection = 4  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 6-invalid_path
     generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
     print(generated_files)
- 
\ No newline at end of file
diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py
index b08c88c..1ef44c6 100644
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -10,7 +10,7 @@ from flask_app.general.model_continue_query import continue_answer, process_cont
 from flask_app.general.file2markdown import convert_file_to_markdown
 from flask_app.general.format_change import pdf2docx
 from flask_app.general.多线程提问 import multi_threading
-from flask_app.general.通义千问long import qianwen_long, upload_file
+from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
 from flask_app.general.json_utils import clean_json_string, combine_json_results
 from flask_app.general.doubao import doubao_model, generate_full_user_query, pdf2txt, read_txt_to_string
 from flask_app.货物标.技术参数要求提取后处理函数 import all_postprocess
@@ -488,7 +488,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
     {}
     """
         judge_query = first_query_template.format(f"文件内容：{full_text}")
-        judge_res = doubao_model(judge_query)
+        # judge_res = doubao_model(judge_query)
+        judge_res=qianwen_plus(judge_query)
     if '否' in judge_res or model_type == 2:
         model_type = 2  # 使用qianwen-long+invalid_path
         print("processed_filepath中无采购需求!调用invalid_path")
@@ -500,7 +501,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
         print(model_res)
     else:
         user_query = generate_prompt(judge_res, full_text)
-        model_res = doubao_model(user_query)
+        # model_res = doubao_model(user_query)
+        model_res=qianwen_plus(user_query)
         print(model_res)
 
     cleaned_res = clean_json_string(model_res)     #转字典
@@ -611,7 +613,7 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
     if model_type==1:
         results = multi_threading(queries, "", "", 3,True)  # 豆包
     else:
-        results = multi_threading(queries, "", file_id, 2,True)  # 豆包
+        results = multi_threading(queries, "", file_id, 2,True)  # qianwen-long
     temp_final={}
     if not results:
         print("errror!未获得大模型的回答！")