diff --git a/flask_app/main/基础信息整合.py b/flask_app/main/基础信息整合.py index 3000947..3b8c2e1 100644 --- a/flask_app/main/基础信息整合.py +++ b/flask_app/main/基础信息整合.py @@ -5,33 +5,48 @@ from flask_app.main.投标人须知正文提取指定内容 import extract_from_ from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge from flask_app.main.多线程提问 import read_questions_from_file, multi_threading from flask_app.main.通义千问long import upload_file -def combine_basic_info(baseinfo_list): +def aggregate_basic_info(baseinfo_list): + """ + 将基础信息列表中的数据进行合并和分类。 + + 参数: + - baseinfo_list (list): 包含多个基础信息的列表。 + + 返回: + - list: 合并和分类后的基础信息列表。 + """ combined_baseinfo_list = [] key_groups = { - "招标人/代理信息": ["招标人","招标人联系方式", "招标代理机构","招标代理机构联系方式"], - "项目信息": ["工程名称", "招标编号","工程概况","招标范围","招标控制价","投标竞争下浮率"], - "关键时间/内容":["投标文件递交截止日期","递交方式","投标人要求澄清招标文件的截止时间","投标有效期","评标结果公示媒介"], - "保证金相关":['质量保证金','退还投标保证金'], - "其他信息":["重新招标、不再招标和终止招标","是否退还投标文件","费用承担"] + "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"], + "项目信息": ["工程名称", "招标编号", "工程概况", "招标范围", "招标控制价", "投标竞争下浮率"], + "关键时间/内容": [ + "投标文件递交截止日期", + "递交方式", + "投标人要求澄清招标文件的截止时间", + "投标有效期", + "评标结果公示媒介" + ], + "保证金相关": ["质量保证金", "退还投标保证金"], + "其他信息": [ + "重新招标、不再招标和终止招标", + "是否退还投标文件", + "费用承担" + ] } - # 将所有基础信息合并到一个字典中 + combined_data = {} relevant_keys_detected = set() - # 预处理以决定哪些键名将被使用 + # 合并所有基础信息并收集相关键 for baseinfo in baseinfo_list: json_data = clean_json_string(baseinfo) combined_data.update(json_data) relevant_keys_detected.update(json_data.keys()) - # for key in relevant_keys.keys(): - # if key in json_data: - # relevant_keys[key] = True - # 根据检测到的键动态调整 key_groups + # 动态调整键组 dynamic_key_handling(key_groups, relevant_keys_detected) - - # 使用合并后的字典创建最终输出 + # 按键组分类并嵌套 for group_name, keys in key_groups.items(): group_data = {key: combined_data.get(key, "未提供") for key in keys} combined_json = nest_json_under_key(group_data, group_name) @@ -74,69 +89,72 @@ def judge_consortium_bidding(baseinfo_list): # 更新原始列表,如果你想保留修改 baseinfo_list[:] = updated_list return accept_bidding -def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表 - # 调用大模型回答项目基础信息 +def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path): + """ + 综合和处理基础信息,生成最终的基础信息字典。 + + 参数: + - knowledge_name (str): 知识名称。 + - truncate0 (str): 文件路径。 + - output_folder (str): 输出文件夹路径。 + - clause_path (str): 条款路径。 + + 返回: + - dict: 综合后的基础信息。 + """ baseinfo_list = [] - baseinfo_file_path='flask_app/static/提示词/前两章提问总结.txt' - # baseinfo_file_path='D:\\flask_project\\flask_app\\static\\提示词\\前两章提问总结.txt' + baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' questions = read_questions_from_file(baseinfo_file_path) res1 = multi_threading(questions, knowledge_name) - for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans + for index, response in res1: try: - if response and len(response) > 1: # 检查response存在且有至少两个元素 + if response and len(response) > 1: baseinfo_list.append(response[1]) else: - print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.") + print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.") except Exception as e: - print(f"基础信息整合: Error processing response for query index {_}: {e}") + print(f"基础信息整合: Error processing response for query index {index}: {e}") + # 判断是否分包、是否需要递交投标保证金等 - chosen_numbers, merged = judge_whether_main(truncate0,output_folder) + chosen_numbers, merged = judge_whether_main(truncate0, output_folder) baseinfo_list.append(merged) - judge_file_path ='flask_app/static/提示词/是否相关问题.txt' - # judge_file_path='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt' + judge_file_path = 'flask_app/static/提示词/是否相关问题.txt' judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) + judge_consortium = judge_consortium_bidding(baseinfo_list) # 通过招标公告判断是否接受联合体投标 - judge_consortium = judge_consortium_bidding(baseinfo_list) #通过招标公告判断是否接受联合体投标 if judge_consortium: - judge_consortium_question = "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\"" + judge_consortium_question = ( + "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息," + "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\"" + ) judge_questions.append(judge_consortium_question) - file_id=upload_file(truncate0) - res2 = multi_threading(judge_questions, "",file_id,2) #调用千问-long + file_id = upload_file(truncate0) + res2 = multi_threading(judge_questions, "", file_id, 2) # 调用千问-long + if not res2: - print("基础信息整合: multi_threading errror!") + print("基础信息整合: multi_threading error!") else: - # 打印结果 for question, response in res2: baseinfo_list.append(response) - # for _, response in res2: # _占位,代表ques;response[0]也是ques;response[1]是ans #调用百炼rag - # try: - # if response and len(response) > 1: # 检查response存在且有至少两个元素 - # baseinfo_list.append(response[1]) - # else: - # print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.") - # except Exception as e: - # print(f"基础信息整合: Error processing response for query index {_}: {e}") - rebidding_situation = extract_from_notice(clause_path, 3) #"重新招标, 不再招标和终止招标"需从投标人须知正文提取 - - update_json=rename_outer_key(rebidding_situation,"重新招标、不再招标和终止招标") + rebidding_situation = extract_from_notice(clause_path, 3) # "重新招标, 不再招标和终止招标"需从投标人须知正文提取 + update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标") baseinfo_list.append(update_json) - update_baseinfo_list=combine_basic_info(baseinfo_list) #整合基础信息核心代码 + aggregated_baseinfo = aggregate_basic_info(baseinfo_list) # 整合基础信息核心代码 + baseinfo_combined_res = combine_json_results(aggregated_baseinfo) # 返回值是字典 - baseinfo_combined_res = combine_json_results(update_baseinfo_list) # 返回值是字典 - # return nest_json_under_key(baseinfo_combined_res, "基础信息") #返回值是json字符串 - return {"基础信息":baseinfo_combined_res} + return {"基础信息": baseinfo_combined_res} if __name__ == "__main__": knowledge_name = "ztb" output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405" truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf" clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json" - res=project_basic_info(knowledge_name,truncate0,output_folder,clause_path) + res=combine_basic_info(knowledge_name,truncate0,output_folder,clause_path) print(json.dumps(res,ensure_ascii=False,indent=4)) diff --git a/flask_app/main/多线程提问.py b/flask_app/main/多线程提问.py index ec52d13..9a2bf34 100644 --- a/flask_app/main/多线程提问.py +++ b/flask_app/main/多线程提问.py @@ -1,10 +1,12 @@ # 基于知识库提问的通用模板, # assistant_id +import json +import os import re import queue import concurrent.futures import time - +import requests from dashscope import Assistants, Messages, Runs, Threads from llama_index.indices.managed.dashscope import DashScopeCloudRetriever from flask_app.main.通义千问long import qianwen_long, upload_file @@ -100,6 +102,98 @@ def rag_assistant(knowledge_name): ) return assistant +#TODO:http格式,有bug还没修改 +def create_assistant(knowledge_name): + """ + Create an assistant using DashScope API via HTTP request based on the provided knowledge name. + + Parameters: + knowledge_name (str): The name of the knowledge base to associate with the assistant. + + Returns: + dict: Response from the API containing assistant details. + + Raises: + ValueError: If the DASHSCOPE_API_KEY environment variable is not set. + Exception: If any error occurs during the HTTP request. + """ + # Step 1: Initialize the Retriever and get the Pipeline ID + try: + retriever = DashScopeCloudRetriever(knowledge_name) + pipeline_id = str(retriever.pipeline_id) + except Exception as e: + print(f"Error retrieving pipeline ID for knowledge '{knowledge_name}': {e}") + return None + + # Step 2: Fetch the API Key from Environment Variables + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + raise ValueError("DASHSCOPE_API_KEY environment variable is not set.") + + # Step 3: Define the API Endpoint and Headers + url = 'https://dashscope.aliyuncs.com/api/v1/assistants' + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + + # Step 4: Construct the Instructions + instructions = ( + "请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}" + ) + + # Step 5: Define the Tools + tools = [ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${documents}" + } + } + } + } + } + ] + + # Step 6: Construct the Payload + payload = { + "model": "qwen-max", + "name": "智能小助手", # "Smart Helper" in Chinese + "description": "智能助手,支持知识库查询和插件调用。", + "temperature": 0.3, + "instructions": instructions, + "tools": tools, + "file_ids": [], # Add file IDs if necessary + "metadata": {} # Add metadata if necessary + } + + # Optional: If you have specific file_ids or metadata, you can modify the payload accordingly + # For example: + # payload["file_ids"] = ["file_id_1", "file_id_2"] + # payload["metadata"] = {"key1": "value1", "key2": "value2"} + + # Step 7: Make the HTTP POST Request + try: + response = requests.post(url, headers=headers, data=json.dumps(payload)) + response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx) + assistant = response.json() + print("Assistant created successfully:") + print(json.dumps(assistant, indent=4, ensure_ascii=False)) + return assistant + except requests.exceptions.HTTPError as http_err: + print(f"HTTP error occurred: {http_err} - Response: {response.text}") + except Exception as err: + print(f"An error occurred: {err}") + def pure_assistant(): assistant = Assistants.create( @@ -119,6 +213,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type if llm_type==1: print(f"rag_assistant! question:{question}") assistant = rag_assistant(knowledge_name) + # assistant=create_assistant(knowledge_name) elif llm_type==2: print(f"qianwen_long! question:{question}") qianwen_res = qianwen_long(file_id,question) @@ -185,24 +280,25 @@ if __name__ == "__main__": # print(f"Question: {question}") # print(f"Response: {response}") - # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf" - # file_id = upload_file(file_path) - # questions=["根据该文档中的评标办法前附表,请你列出该文件的技术标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的商务标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的投标报价,以json的格式返回结果"] - # results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long - # if not results: - # print("errror!") - # else: - # # 打印结果 - # for question, response in results: - # print(f"Question: {question}") - # print(f"Response: {response}") - ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的?请按json格式给我提供信息,键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"] - # ques=["该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] - results = multi_threading(ques, "6.2视频会议docx") + file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf" + file_id = upload_file(file_path) + questions=["该招标文件的项目名称是?项目编号(或招标编号)是?采购人(或招标人)是?采购代理机构(或招标代理机构)是?请按json格式给我提供信息,键名分别是'项目名称','项目编号','采购人','采购代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的项目概况是?项目基本情况是?请按json格式给我提供信息,键名分别为'项目概况','项目基本情况',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'。"] + results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long if not results: print("errror!") else: # 打印结果 for question, response in results: print(f"Question: {question}") - print(f"Response: {response}") \ No newline at end of file + print(f"Response: {response}") + + # ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的?请按json格式给我提供信息,键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"] + # # ques=["该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] + # results = multi_threading(ques, "6.2视频会议docx") + # if not results: + # print("errror!") + # else: + # # 打印结果 + # for question, response in results: + # print(f"Question: {question}") + # print(f"Response: {response}") \ No newline at end of file diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py index 68e2f02..7b88e23 100644 --- a/flask_app/main/招标文件解析.py +++ b/flask_app/main/招标文件解析.py @@ -9,11 +9,11 @@ from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json -from flask_app.main.json_utils import nest_json_under_key, transform_json_values, combine_json_results +from flask_app.main.json_utils import transform_json_values, combine_json_results from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice import concurrent.futures -from flask_app.main.基础信息整合 import project_basic_info +from flask_app.main.基础信息整合 import combine_basic_info from flask_app.main.资格审查模块 import combine_review_standards from flask_app.main.商务标技术标整合 import combine_evaluation_standards from flask_app.main.format_change import pdf2docx, docx2pdf @@ -96,7 +96,7 @@ def post_processing(data,includes): # 基本信息 def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表 logger.info("starting基础信息...") - basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path) + basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path) logger.info("基础信息done") return basic_res @@ -149,18 +149,14 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat def fetch_bidding_documents_requirements(clause_path): logger.info("starting投标文件要求...") fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) - qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求") logger.info("投标文件要求done...") - # return qualify_nested_res return {"投标文件要求":fetch_bidding_documents_requirements_json} # 开评定标流程 def fetch_bid_opening(clause_path): logger.info("starting开评定标流程...") fetch_bid_opening_json = extract_from_notice(clause_path, 2) - qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程") logger.info("开评定标流程done...") - # return qualify_nested_res return {"开评定标流程":fetch_bid_opening_json} # def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf diff --git a/flask_app/main/读取文件/读取docx.py b/flask_app/main/读取文件/读取docx.py index 87b3edf..9694f04 100644 --- a/flask_app/main/读取文件/读取docx.py +++ b/flask_app/main/读取文件/读取docx.py @@ -65,12 +65,41 @@ def read_tables_from_docx(file_path): # 返回符合条件的单元格内容 return cell_contents + +def read_docx_by_paragraphs(file_path): + """ + 按段落读取指定路径的 .docx 文件。 + + 参数: + file_path (str): .docx 文件的路径。 + + 返回: + list: 包含所有段落文本的列表。 + """ + try: + # 打开文档 + doc = Document(file_path) + + # 读取所有段落的文本 + paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()] + + return paragraphs + except Exception as e: + print(f"读取 .docx 文件时发生错误: {e}") + return [] + if __name__ == "__main__": - file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx' - # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件 - # read_docx(file_path) - read_docx_tables(file_path) - list=read_tables_from_docx(file_path) - for i in list: - print(i) - print("--------------") \ No newline at end of file + file_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx' + read_docx(file_path) #按行读取 + + # paragraphs = read_docx_by_paragraphs(file_path) #按段落读取 + # + # print(f"共读取到 {len(paragraphs)} 个段落。\n") + # for idx, para in enumerate(paragraphs, 1): + # print(f"段落 {idx}: {para}\n") + + # read_docx_tables(file_path) + # list=read_tables_from_docx(file_path) + # for i in list: + # print(i) + # print("--------------") \ No newline at end of file diff --git a/flask_app/static/提示词/基本信息货物标.txt b/flask_app/static/提示词/基本信息货物标.txt new file mode 100644 index 0000000..22081be --- /dev/null +++ b/flask_app/static/提示词/基本信息货物标.txt @@ -0,0 +1,26 @@ +1.该招标文件的项目名称是?项目编号(或招标编号)是?采购人(或招标人)是?采购代理机构(或招标代理机构)是?请按json格式给我提供信息,键名分别是'项目名称','项目编号','采购人','采购代理机构',若存在未知信息,在对应的键值中填'未知'。 + +2.该招标文件的项目概况是?项目基本情况是?请按json格式给我提供信息,键名分别为'项目概况','项目基本情况',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'。 + +3.该招标文件的最高限价(或招标控制价)是?请按json格式给我提供信息,键名为'招标控制价',若存在未知信息,在对应的键值中填'未知'。 + +4.投标文件(或响应文件)递交截止时间是?递交地点(或方式)是?请按json格式给我提供信息,键名分别是'投标文件递交截止日期','递交地点'(或'递交方式'),若存在未知信息,在对应的键值中填'未知'。 + +5.采购人(招标人)和采购代理机构(或招标代理机构)的联系方式是?请按json格式给我提供信息,键名分别是'采购人联系方式','采购代理机构联系方式',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。 + +6.该招标文件的信息公示媒介在哪?请按json格式给我提供信息,键名是'信息公示媒介',若存在未知信息,在对应的键值中填'未知'。 + +7.该招标文件的投标竞争下浮率是多少?请按json格式给我提供信息,键名是'投标竞争下浮率',若存在未知信息,在对应的键值中填'未知'。 + +8.该项目的投标有效期(或响应文件有效期)是什么?请按json格式给我提供信息,键名是'投标有效期',若存在未知信息,在对应的键值中填'未知'。 + +9.该招标文件对投标人准备和参加投标活动发生的费用是如何规定的?请以json的格式给我提供信息,键名是'费用承担',若存在未知信息,在对应的键值中填'未知'。 + +10.求澄清的招标文件截止时间是?请以json的格式给我提供信息,键名是'投标人要求澄清招标文件的截止时间',若存在未知信息,在对应的键值中填'未知'。 + +11.该文档要求扣留的质量保证金百分比是多少,请以json格式给我提供信息,键名为'质量保证金',如果没有则以'未知'填充。 + +12.该项目是否接受联合体投标?请按json格式给我提供信息,键名为'是否接受联合体投标','是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。 + + + diff --git a/flask_app/货物标/基础信息解析main.py b/flask_app/货物标/基础信息解析main.py index 1bdadc5..d7c3864 100644 --- a/flask_app/货物标/基础信息解析main.py +++ b/flask_app/货物标/基础信息解析main.py @@ -1,4 +1,89 @@ +import json + +from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results +from flask_app.main.多线程提问 import read_questions_from_file, multi_threading +from flask_app.main.通义千问long import upload_file + + +def dynamic_key_handling(key_groups, detected_keys): + # 检查和调整键组配置 + for key in detected_keys: + if "投标保证金" in key or "履约保证金" in key: + key_groups["保证金相关"].append(key) + elif "是否接受联合体" in key: + key_groups["项目信息"].append(key) + elif "联合体投标要求" in key: + key_groups["项目信息"].append(key) + elif "分包" in key: + key_groups["项目信息"].append(key) + elif "踏勘现场" in key: + key_groups["其他信息"].append(key) + elif "投标预备会" in key: + key_groups["其他信息"].append(key) + elif "偏离" in key: + key_groups["其他信息"].append(key) +def aggregate_basic_info(baseinfo_list): + """ + 将基础信息列表中的数据进行合并和分类。 + + 参数: + - baseinfo_list (list): 包含多个基础信息的列表。 + + 返回: + - list: 合并和分类后的基础信息列表。 + """ + combined_baseinfo_list = [] + key_groups = { + "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"], + "项目信息": ["工程名称", "招标编号", "工程概况", "招标范围", "招标控制价", "投标竞争下浮率"], + "关键时间/内容": [ + "投标文件递交截止日期", + "递交方式", + "投标人要求澄清招标文件的截止时间", + "投标有效期", + "评标结果公示媒介" + ], + "保证金相关": ["质量保证金", "退还投标保证金"], + "其他信息": [ + "重新招标、不再招标和终止招标", + "是否退还投标文件", + "费用承担" + ] + } + + combined_data = {} + relevant_keys_detected = set() + + # 合并所有基础信息并收集相关键 + for baseinfo in baseinfo_list: + json_data = clean_json_string(baseinfo) + combined_data.update(json_data) + relevant_keys_detected.update(json_data.keys()) + + # 动态调整键组 + dynamic_key_handling(key_groups, relevant_keys_detected) + + # 按键组分类并嵌套 + for group_name, keys in key_groups.items(): + group_data = {key: combined_data.get(key, "未提供") for key in keys} + combined_json = nest_json_under_key(group_data, group_name) + combined_baseinfo_list.append(combined_json) + + return combined_baseinfo_list + def combine_basic_info(knowledge_name,output_folder,clause_path): + # file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf" + # file_id = upload_file(file_path) + # baseinfo_file_path='flask_app/static/提示词/前两章提问总结.txt' + # questions=read_questions_from_file(baseinfo_file_path) + # results = multi_threading(questions, "", file_id, 2) # 1代表使用百炼rag 2代表使用qianwen-long + # if not results: + # print("errror!") + # else: + # # 打印结果 + # for question, response in results: + # print(f"Question: {question}") + # print(f"Response: {response}") baseinfo_combined_res={ "招标人/代理信息": { "招标人": "黄石临空建设管理有限公司", @@ -27,4 +112,12 @@ def combine_basic_info(knowledge_name,output_folder,clause_path): } } } - return {"基础信息":baseinfo_combined_res} \ No newline at end of file + return {"基础信息":baseinfo_combined_res} + +if __name__ == "__main__": + knowledge_name = "ztb" + output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405" + truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf" + clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json" + res=combine_basic_info(knowledge_name,output_folder,clause_path) + print(json.dumps(res,ensure_ascii=False,indent=4)) \ No newline at end of file diff --git a/flask_app/货物标/无效标和废标和禁止投标整合main.py b/flask_app/货物标/无效标和废标和禁止投标整合main.py index fd49bbd..6ff353b 100644 --- a/flask_app/货物标/无效标和废标和禁止投标整合main.py +++ b/flask_app/货物标/无效标和废标和禁止投标整合main.py @@ -57,12 +57,12 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): extracted_paragraphs[active_key] = [text] if match_keywords(text, follow_up_keywords): continue_collecting = True - section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text) + section_number = re.match(r'^(\d+(\s*\.\s*\d+)*)\s*、', text) # 修改后的正则,支持 '数字 、' 格式 if section_number: current_section_number = section_number.group(1) level_count = current_section_number.count('.') - # Pattern to match current level, e.g., 3.4.5 + # Pattern to match current level, e.g., 3.4.5 或者 3 pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+' # Generate patterns for next section at same level and parent level @@ -71,20 +71,23 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): # Next section at same level parts[-1] = str(int(parts[-1]) + 1) - next_pattern = r'^' + r'\s*\.\s*'.join(parts) + next_pattern = r'^' + r'\.\s*'.join(parts) matched_patterns.append(next_pattern) # Parent section (if applicable) if len(parts) > 1: parent_section_parts = parts[:-1] parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) - parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) + parent_pattern = r'^' + r'\.\s*'.join(parent_section_parts) matched_patterns.append(parent_pattern) + # 添加对 '数字 、' 格式的支持 + digit_comma_pattern = r'^\d+\s*、' + matched_patterns.append(digit_comma_pattern) + # Combine the patterns combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' current_section_pattern = re.compile(combined_pattern) - else: found_next_number = False current_section_pattern = None @@ -93,7 +96,8 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): current_index += 1 next_text = doc.paragraphs[current_index].text.strip() if not found_next_number: - next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) + # 修改后的正则,支持 '数字 、' 格式 + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)', next_text) if next_section_number: found_next_number = True if next_section_number.group(1): @@ -101,12 +105,14 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' elif next_section_number.group(2): dynamic_pattern = r'^[\(\(]\d+[\)\)]' + elif next_section_number.group(3): + dynamic_pattern = r'^\d+\s*、' current_section_pattern = re.compile(dynamic_pattern) if current_section_pattern and re.match(current_section_pattern, next_text): extracted_paragraphs[active_key].append(next_text) else: continue_collecting = False - active_key=None + active_key = None break return current_index @@ -118,6 +124,8 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): return extracted_paragraphs + + """ eg: text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"] @@ -143,6 +151,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达 for key, text_list in extracted_contents.items(): if len(text_list) == 1: for data in text_list: + # print(data) # 检查是否包含任何需要排除的字符串 if any(exclude in data for exclude in excludes): continue # 如果包含任何排除字符串,跳过这个数据 diff --git a/flask_app/货物标/货物标截取pdf.py b/flask_app/货物标/货物标截取pdf.py index d540fac..23ab916 100644 --- a/flask_app/货物标/货物标截取pdf.py +++ b/flask_app/货物标/货物标截取pdf.py @@ -464,9 +464,10 @@ def truncate_pdf_multiple(input_path, output_folder): # TODO:交通智能系统和招标(1)(1)文件有问题 sele=4的时候excludsion有问题 if __name__ == "__main__": - input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" - output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\outputtest" - # files=truncate_pdf_multiple(input_path,output_folder) - # print(files) - selection = 1 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 - generated_files = truncate_pdf_main(input_path, output_folder, selection) + input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\ztbfile.pdf" + output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9" + files=truncate_pdf_multiple(input_path,output_folder) + print(files) + + # selection = 1 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 + # generated_files = truncate_pdf_main(input_path, output_folder, selection) diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index 1a98652..aa2933f 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -36,6 +36,9 @@ def preprocess_files(output_folder, file_path, file_type, unique_id): elif file_type == 2: # pdf pdf_path = file_path docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 + elif file_type ==3: #doc + pdf_path=docx2pdf(file_path) + docx_path=pdf2docx(pdf_path) else: logger.error("Unsupported file type provided. Preprocessing halted.") return None