1.17 小bug

2025-01-17 15:45:53 +08:00 · 2025-01-17 15:45:53 +08:00 · 94b945e6b1
commit 94b945e6b1
parent 7e8f368324
7 changed files with 56 additions and 47 deletions
--- a/flask_app/general/model_continue_query.py
+++ b/flask_app/general/model_continue_query.py
@@ -48,7 +48,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)

    # 根据模型类型选择调用的模型
    if model_type == 1:  # 使用 doubao_model
-        model_res = doubao_model(continue_query)
+        # model_res = doubao_model(continue_query)
+        model_res=qianwen_plus(continue_query)
    elif model_type == 2:  # 使用 qianwen_long
        if file_id is None:
            raise ValueError("file_id 必须在使用 qianwen_long 模型时提供！")
--- a/flask_app/general/多线程提问.py
+++ b/flask_app/general/多线程提问.py
@@ -13,7 +13,8 @@ from dashscope import Assistants, Messages, Runs, Threads
 from llama_index.indices.managed.dashscope import DashScopeCloudRetriever

 from flask_app.general.doubao import doubao_model
-from flask_app.general.通义千问long import qianwen_long, upload_file
+from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
+
 prompt = """
 # 角色
 你是一个文档处理专家，专门负责理解和操作基于特定内容的文档任务，这包括解析、总结、搜索或生成与给定文档相关的各类信息。
@@ -165,11 +166,12 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
                result_queue.put((ans_index, (question, qianwen_res)))
        elif llm_type==3:
            # print(f"doubao! question:{question}")
-            doubao_res=doubao_model(question,need_extra)
-            if not doubao_res:
+            # doubao_res=doubao_model(question,need_extra)
+            qianwen_plus_res=qianwen_plus(question,need_extra)
+            if not qianwen_plus_res:
                result_queue.put((ans_index, None))  # 如果为空字符串，直接返回 None
            else:
-                result_queue.put((ans_index, (question, doubao_res)))
+                result_queue.put((ans_index, (question, qianwen_plus_res)))
        else :
            assistant = pure_assistant()
            ans = send_message(assistant, message=question)
--- a/flask_app/general/截取pdf_main.py
+++ b/flask_app/general/截取pdf_main.py
@@ -119,14 +119,14 @@ if __name__ == "__main__":
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
    # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
-    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\“平安城市”三期暨雪亮工程一体化 建设项目.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\招标文件\output33"
+    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)"
    # selections = [1, 4]  # 仅处理 selection 4、1
    # selections = [1, 2, 3, 5]
    # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)   #engineering
-    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
+    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
    print(files)
    # print(files[-1])
    # print(files[-2])
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@@ -118,11 +118,9 @@ def convert_to_pdf(file_path):
 def get_invalid_file(file_path, output_folder, common_header, begin_page):
    """
    从指定的PDF文件中提取无效部分并保存到输出文件夹中。
-    begin_pattern从前往后匹配  end_pattern从后往前
+    begin_pattern匹配开头  end_pattern匹配结尾
    页数小于100不进行begin_pattern，默认start_page=0
    页数大于200时end_pattern从前往后匹配
-
-
    Args:
        file_path (str): 输入的PDF文件路径。
        output_folder (str): 提取后文件的输出文件夹路径。
@@ -134,29 +132,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
    # 定义开始模式
    begin_patterns = [
        regex.compile(
-            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
+            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$',
            regex.MULTILINE
        ),
        regex.compile(
-            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
+            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
            regex.MULTILINE
        )
    ]
-
    # 定义结束模式
    end_patterns = [
        regex.compile(
+            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|'
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[：:]清标报告',
            regex.MULTILINE
        ),
-        regex.compile(
-            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*',
-            regex.MULTILINE
-        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
            regex.MULTILINE
-        )
+        ),
    ]

    # 定义排除模式
@@ -195,10 +189,14 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
        # 定义查找结束页的函数
        def find_end_page():
            if total_pages > 200:
-                # print("总页数大于200页，结束页从前往后查找，跳过前30页。") #防止匹配到目录这块
+                # print("总页数大于200页，结束页从前往后查找，跳过前30页。") #且先匹配合同
                for pattern in end_patterns:
                    for i in range(30, total_pages):
                        text = page_texts[i]
+                        exclusion_pattern = regex.compile(
+                            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
+                        if regex.search(exclusion_pattern,text):
+                            continue
                        if regex.search(pattern, text):
                            # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
                            return i
@@ -232,9 +230,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
                future_end = executor.submit(find_end_page)
                start_page = future_start.result()
                end_page = future_end.result()
-
        # print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
-
        # 验证页码范围
        if start_page > end_page:
            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
@@ -356,6 +352,7 @@ def generate_end_pattern(extraction_stage, chapter_type=None):
            r'|' + common_patterns
    )

+
    end_pattern_template_third = (
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()（）/]+\s*$'
            r'|' + common_patterns
@@ -477,10 +474,17 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
        return perform_extraction_and_save(start_page, mid_page, end_page)

    print(f"第一次提取tobidders_notice失败，尝试第二次提取: {pdf_path}")
-    start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
+    start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
+
+    if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
+        if end_page1==start_page2:
+            mid_page=end_page1
+            return perform_extraction_and_save(start_page1, mid_page, end_page2)
+        else:
+            path1=save_extracted_pages(pdf_path,output_folder,start_page1,end_page1,"tobidders_notice_part1",common_header)
+            path2=save_extracted_pages(pdf_path,output_folder,start_page2,end_page2,"tobidders_notice_part2",common_header)
+            return path1,path2

-    if start_page is not None and mid_page is not None and end_page is not None:
-        return perform_extraction_and_save(start_page, mid_page, end_page)
    print("第二次提取tobidders_notice失败，尝试第三次提取!")
    new_begin_pattern = regex.compile(
        r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
@@ -503,9 +507,10 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
        regex.MULTILINE
    )
-    end_pattern = regex.compile(
+    second_begin_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE  # 捕获中文部分
    )
+    end_pattern = second_begin_pattern
    exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]  # 在这里添加需要排除的关键词

    exclusion_pattern = regex.compile(
@@ -515,10 +520,9 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
                                                   exclusion_pattern, output_suffix)
    if start_page1 is None or end_page1 is None:
-        return None, None, None
+        return None, None, None,None

    # 提取第二部分
-    start_page2 = end_page1

    # 检查end_page1页面的内容
    text = pdf_document.pages[end_page1].extract_text() or ""
@@ -531,19 +535,20 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
        # 检查是否包含排除关键词
        if any(word in chapter_title for word in exclusion_words):
            # 如果包含排除关键词，直接返回相同的路径
-            return start_page1, end_page1, end_page1
+            second_begin_pattern=begin_pattern     #投标人须知前附表和投标人须知正文不是紧挨着的！eg:（发布稿）恒丰理财有限责任公司智能投研终端项目竞争性磋商

    # 如果不包含排除关键词，继续提取第二部分
    output_suffix='tobidders_notice2'
-    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
+    begin_page=end_page1-1   #传入start_page2-1是因为非notice截取->start_page>begin_page
+    start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header,
                                         exclusion_pattern, output_suffix)

-    if end_page2 is None:
-        return start_page1, end_page1, None
-
-    return start_page1, end_page1, end_page2
+    if start_page2 is None or end_page2 is None:
+        return start_page1, end_page1, end_page1,end_page1

+    return start_page1, end_page1, start_page2,end_page2

+#return start_page1, end_page1, end_page1
 def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$',
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -651,13 +651,13 @@ if __name__ == '__main__':
    # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
    # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
    # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
-    pdf_path = r'C:\Users\Administrator\Desktop\货物\test5\磋商采购文件-恩施市森林火灾风险普查样品检测服务.pdf'
+    pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件（N510113202400010820250102001）.pdf'

-    output_dir = r"C:\Users\Administrator\Desktop\货物\test5"
+    output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
    # invalid_added = insert_mark(pdf_path)
-    # invalid_added_docx = pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Downloads\invalid_added.docx'
+    # # invalid_added_docx = pdf2docx(invalid_added)
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
-    # print("Elapsed time:", str(end_time - start_time))
+    # # print("Elapsed time:", str(end_time - start_time))
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -240,7 +240,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
                elif selection == 5:
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
-                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求)).*|'
+                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|(招标|项目)(?:内容|要求|需求)).*|'
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
                    )
                    end_pattern = regex.compile(
@@ -315,12 +315,11 @@ if __name__ == "__main__":
    logger = get_global_logger("123")
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\（发布稿）恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output4444"
+    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
    selection = 4  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 6-invalid_path
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
    print(generated_files)
- 
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -10,7 +10,7 @@ from flask_app.general.model_continue_query import continue_answer, process_cont
 from flask_app.general.file2markdown import convert_file_to_markdown
 from flask_app.general.format_change import pdf2docx
 from flask_app.general.多线程提问 import multi_threading
-from flask_app.general.通义千问long import qianwen_long, upload_file
+from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
 from flask_app.general.json_utils import clean_json_string, combine_json_results
 from flask_app.general.doubao import doubao_model, generate_full_user_query, pdf2txt, read_txt_to_string
 from flask_app.货物标.技术参数要求提取后处理函数 import all_postprocess
@@ -488,7 +488,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
    {}
    """
        judge_query = first_query_template.format(f"文件内容：{full_text}")
-        judge_res = doubao_model(judge_query)
+        # judge_res = doubao_model(judge_query)
+        judge_res=qianwen_plus(judge_query)
    if '否' in judge_res or model_type == 2:
        model_type = 2  # 使用qianwen-long+invalid_path
        print("processed_filepath中无采购需求!调用invalid_path")
@@ -500,7 +501,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
        print(model_res)
    else:
        user_query = generate_prompt(judge_res, full_text)
-        model_res = doubao_model(user_query)
+        # model_res = doubao_model(user_query)
+        model_res=qianwen_plus(user_query)
        print(model_res)

    cleaned_res = clean_json_string(model_res)     #转字典
@@ -611,7 +613,7 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
    if model_type==1:
        results = multi_threading(queries, "", "", 3,True)  # 豆包
    else:
-        results = multi_threading(queries, "", file_id, 2,True)  # 豆包
+        results = multi_threading(queries, "", file_id, 2,True)  # qianwen-long
    temp_final={}
    if not results:
        print("errror!未获得大模型的回答！")