From d48d7ec8b2b6e5e0892b265da4cd7f9dc573c233 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 27 Nov 2024 16:58:25 +0800 Subject: [PATCH 1/8] =?UTF-8?q?11.27=20=E4=BC=98=E5=8C=96=E5=95=86?= =?UTF-8?q?=E5=8A=A1=E8=A6=81=E6=B1=82=E6=8F=90=E5=8F=96=E7=9A=84=E6=8F=90?= =?UTF-8?q?=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/json_utils.py | 2 +- flask_app/general/判断截取位置.py | 266 ----------------------- flask_app/general/多线程提问.py | 109 ++++------ flask_app/general/通义千问long.py | 223 +++++++++++++------ flask_app/testdir/test3.py | 171 +-------------- flask_app/货物标/商务服务其他要求提取.py | 252 +++++++++++---------- flask_app/货物标/技术参数要求提取.py | 4 +- flask_app/货物标/提取采购需求main.py | 29 +-- flask_app/货物标/货物标解析main.py | 1 + 9 files changed, 366 insertions(+), 691 deletions(-) delete mode 100644 flask_app/general/判断截取位置.py diff --git a/flask_app/general/json_utils.py b/flask_app/general/json_utils.py index 8bd6f1b..1e0d7e1 100644 --- a/flask_app/general/json_utils.py +++ b/flask_app/general/json_utils.py @@ -104,7 +104,7 @@ def extract_content_from_json(string): # 尝试直接解析原始 JSON 数据 try: parsed = json.loads(original_json) - print("直接解析原始 JSON 成功。") + # print("直接解析原始 JSON 成功。") return parsed except json.JSONDecodeError as original_error: print(f"直接解析原始 JSON 失败: {original_error}") diff --git a/flask_app/general/判断截取位置.py b/flask_app/general/判断截取位置.py deleted file mode 100644 index e4855c2..0000000 --- a/flask_app/general/判断截取位置.py +++ /dev/null @@ -1,266 +0,0 @@ -# -*- encoding:utf-8 -*- -import json -import re -import os -import sys -from pathlib import Path -from openai import OpenAI -import concurrent.futures -import queue -import time - -def qianwen_long(file_id, user_query): - print("call qianwen-long...") - """ - Uses a previously uploaded file to generate a response based on a user query. - """ - client = OpenAI( - api_key=os.getenv("DASHSCOPE_API_KEY"), - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" - ) - - # Generate a response based on the file ID - completion = client.chat.completions.create( - model="qwen-long", - top_p=0.5, - temperature=0.4, - messages=[ - { - 'role': 'system', - 'content': f'fileid://{file_id}' - }, - { - 'role': 'user', - 'content': user_query - } - ], - stream=False - ) - - # Return the response content - return completion.choices[0].message.content - -def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type): - - if llm_type==2: - print(f"qianwen_long! 
question:{question}") - qianwen_res = qianwen_long(file_id,question) - result_queue.put((ans_index,(question,qianwen_res))) - return -def multi_threading(queries, knowledge_name="", file_id="", llm_type=1): - if not queries: - return [] - - print("多线程提问:starting multi_threading...") - result_queue = queue.Queue() - - # 使用 ThreadPoolExecutor 管理线程 - with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor: - # 逐个提交任务,每提交一个任务后休眠1秒 - future_to_query = {} - for index, query in enumerate(queries): - future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type) - future_to_query[future] = index - time.sleep(1) # 每提交一个任务后等待1秒 - - # 收集每个线程的结果 - for future in concurrent.futures.as_completed(future_to_query): - index = future_to_query[future] - try: - future.result() # 捕获异常或确认任务完成 - except Exception as exc: - print(f"Query {index} generated an exception: {exc}") - # 确保在异常情况下也向 result_queue 添加占位符 - result_queue.put((index, None)) - - # 从队列中获取所有结果并按索引排序 - results = [None] * len(queries) - while not result_queue.empty(): - index, result = result_queue.get() - results[index] = result - # 检查是否所有结果都是 None - if all(result is None for result in results): - return [] - # 过滤掉None值 - results = [r for r in results if r is not None] - # 返回一个保证是列表的结构 - return results - -def upload_file(file_path): - """ - Uploads a file to DashScope and returns the file ID. - """ - client = OpenAI( - api_key=os.getenv("DASHSCOPE_API_KEY"), - base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" - ) - file = client.files.create(file=Path(file_path), purpose="file-extract") - return file.id - -def load_data(json_path): - """ - 从指定的JSON文件中加载数据。 - - Args: - json_path (str): JSON文件的路径。 - - Returns: - dict: 加载的JSON数据字典。 - """ - try: - with open(json_path, 'r', encoding='utf-8') as f: - data = json.load(f) - return data - except FileNotFoundError: - print(f"错误:文件未找到 - {json_path}") - sys.exit(1) - except json.JSONDecodeError as e: - print(f"错误:解析JSON文件时出错 - {e}") - sys.exit(1) - - -def define_target_names(): - """ - 定义目标名称列表。 - - Returns: - list: 目标名称列表。 - """ - return [ - "营业执照", - # "开户信息", - "法定代表人身份证", - # "法定代表人授权人身份证", - "人员证书", - "人员社保资料", - # "劳动合同", - "企业证书", - "企业业绩", - "财务审计报告", - "缴纳税收证明", - "公司缴纳社保证明" - ] - - -def generate_user_query(target, chapters, keywords): - """ - 根据目标、章节和关键词生成用户查询模板。 - - Args: - target (str): 目标名称。 - chapters (list): 相关章节列表。 - keywords (list): 相关关键词列表。 - - Returns: - str: 生成的用户查询字符串。 - """ - template3 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},并确认插入的位置。我已在原文中打上若干待插入位置的标记,形如'[$$第5个待插入位置$$]',它的标记与它上面的小节内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$第5个待插入位置$$]'中的'5',而不是页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[17, 19],若插入位置不明确,那么返回[-1]。 -""" - template2 = f"""你是一个专业撰写投标文件的专家,这是投标文件模板,作为投标人,你需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},或者根据文中'备注'或'附'内的信息确认插入的位置。目前原文中各章节末尾已打上待插入位置的标记,标记格式为'[$$当前章节名:第x个待插入位置$$]'形如'[$$四、投标保证金:第4个待插入位置$$]',每个标记与它紧跟着的上面的这一章内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$四、投标保证金:第4个待插入位置$$]'中的'4',而不是当前页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[4, 5],若插入位置不明确,那么返回[-1]。 - """ - template1 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in 
keywords])},并确认插入的位置。我已在原文中各章节末尾打上待插入位置的标记,标记格式为'[$$当前章节名:第x个待插入位置$$]'形如'[$$四、投标保证金:第4个待插入位置$$]',每个标记与它紧跟着的上面的这一章内容关联。你需要返回给我{target}应该插入位置的标记数字序号,即'[$$四、投标保证金:第4个待插入位置$$]'中的'4',而不是当前页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[4, 5],若插入位置不明确,那么返回[-1]。 - """ - - return template1 - - -def generate_user_queries(target_names, data_dict): - """ - 为每个目标生成对应的用户查询。 - - Args: - target_names (list): 目标名称列表。 - data_dict (dict): 数据字典。 - - Returns: - list: 包含目标和查询的字典列表。 - """ - user_queries = [] - for target in target_names: - if target in data_dict: - chapters = data_dict[target].get("章节", []) - keywords = data_dict[target].get("关键字", []) - query = generate_user_query(target, chapters, keywords) - user_queries.append({ - "target": target, - "query": query - }) - else: - print(f"警告:'{target}'未在数据字典中找到相关信息。") - return user_queries - - -def process_string_list(string_list): - """ - 处理字符串列表,提取方括号内的内容并转换为实际列表。 - - Args: - string_list (str): 包含方括号的字符串。 - - Returns: - list: 解析后的列表内容。 - """ - match = re.search(r'\[(.*?)\]', string_list) - if match: - content_inside = match.group(1).strip() - if content_inside: - items = [item.strip() for item in content_inside.split(',')] - if all(item.isdigit() for item in items): - formatted_list = [int(item) for item in items] - else: - formatted_list = items - return formatted_list - return [] - - -def main(): - # 定义JSON文件路径 - # json_path = "flask_app/general/static/插入位置.json" - json_path = "D:\\flask_project\\flask_app\\general\\static\\插入位置.json" - # 加载数据 - data_dict = load_data(json_path) - - # 定义目标名称 - target_names = define_target_names() - - # 生成用户查询列表 - user_query_list = generate_user_queries(target_names, data_dict) - - if not user_query_list: - print("没有生成任何用户查询。") - sys.exit(0) - - # 提取查询 - queries = [item['query'] for item in user_query_list] - - # 定义文件路径 - format_part = "C:\\Users\\Administrator\\Desktop\\bid_format.pdf" - # format_part="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹\\bid_format.docx" - # 检查文件是否存在 - if not os.path.isfile(format_part): - print(f"错误:文件未找到 - {format_part}") - sys.exit(1) - - # 上传文件并获取file_id - file_id = upload_file(format_part) - - if not file_id: - print("错误:文件上传失败。") - sys.exit(1) - - # 使用多线程并行处理查询 - results = multi_threading(queries, "", file_id,2) - - if not results: - print("错误:未收到任何处理结果。") - sys.exit(1) - - # 清理返回结果 - baseinfo_list = [process_string_list(res) for _, res in results] - - # 输出结果 - for i, info in enumerate(baseinfo_list): - print(f'{target_names[i]}:{info}') - -if __name__ == "__main__": - main() diff --git a/flask_app/general/多线程提问.py b/flask_app/general/多线程提问.py index eec33d3..32ff7d6 100644 --- a/flask_app/general/多线程提问.py +++ b/flask_app/general/多线程提问.py @@ -238,82 +238,67 @@ def pure_assistant(): return assistant def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type): - if llm_type==1: - print(f"rag_assistant! question:{question}") - assistant = rag_assistant(knowledge_name) - # assistant=create_assistant(knowledge_name) - elif llm_type==2: - print(f"qianwen_long! question:{question}") - # 获取当前时间 - current_time = datetime.now() - # 输出时分秒 - print(current_time.strftime("%H:%M:%S.%f")[:-3]) - # qianwen_res,usage = qianwen_long(file_id,question) #有bug - qianwen_res = qianwen_long(file_id, question) - result_queue.put((ans_index,(question,qianwen_res))) - return - elif llm_type==3: - # print(f"doubao! 
question:{question}") - doubao_res=doubao_model(question) - result_queue.put((ans_index, (question, doubao_res))) - return - else : - assistant = pure_assistant() - ans = send_message(assistant, message=question) - result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) + """ + 调用不同的 LLM 模型并将结果放入结果队列。 + """ + try: + if llm_type==1: + print(f"rag_assistant! question:{question}") + assistant = rag_assistant(knowledge_name) + # assistant=create_assistant(knowledge_name) + ans = send_message(assistant, message=question) + result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) + elif llm_type==2: + print(f"qianwen_long! question:{question}") + # qianwen_res,usage = qianwen_long(file_id,question) #有bug + qianwen_res = qianwen_long(file_id, question) + if not qianwen_res: + result_queue.put((ans_index, None)) # 如果为空字符串,直接返回 None + else: + result_queue.put((ans_index, (question, qianwen_res))) + elif llm_type==3: + # print(f"doubao! question:{question}") + doubao_res=doubao_model(question) + if not doubao_res: + result_queue.put((ans_index, None)) # 如果为空字符串,直接返回 None + else: + result_queue.put((ans_index, (question, doubao_res))) + else : + assistant = pure_assistant() + ans = send_message(assistant, message=question) + result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) + except Exception as e: + print(f"LLM 调用失败,查询索引 {ans_index},错误:{e}") + result_queue.put((ans_index, None)) # 使用 None 作为失败的占位符 def multi_threading(queries, knowledge_name="", file_id="", llm_type=1): if not queries: return [] - print("多线程提问:starting multi_threading...") result_queue = queue.Queue() - max_retries = 2 # 设置最大重试次数 - retry_counts = {} # 跟踪每个查询的重试次数 + with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor: + future_to_index = { + executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type): index + for index, query in enumerate(queries) + } - with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor: - future_to_query = {} - for index, query in enumerate(queries): - # time.sleep(0.5) # 每提交一个任务后等待0.5秒,目前设置了直接对qianwen-long直接限制,无需sleep - future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type) - future_to_query[future] = index - retry_counts[index] = 0 # 初始化重试次数 + for future in concurrent.futures.as_completed(future_to_index): + index = future_to_index[future] + try: + future.result() # 确保任务完成,如果有未处理的异常会在这里抛出 + except Exception as exc: + print(f"查询索引 {index} 生成了一个异常:{exc}") + result_queue.put((index, None)) # 使用 None 作为失败的占位符 - while future_to_query: - done, _ = concurrent.futures.wait( - future_to_query.keys(), - return_when=concurrent.futures.FIRST_COMPLETED - ) - for future in done: - index = future_to_query[future] - del future_to_query[future] - try: - future.result() # 捕获异常或确认任务完成 - except Exception as exc: - print(f"Query {index} generated an exception: {exc}") - retry_counts[index] += 1 # 增加重试计数 - #Query 0 generated an exception: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. 
For more information on this error, read the docs: https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-thousand-questions-metering-and-billing.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}, 'request_id': 'de10e2e9-78c2-978f-8801-862ffb0892e9'}
-                    if retry_counts[index] <= max_retries:
-                        print(f"Retrying query {index} (attempt {retry_counts[index]})...")
-                        print("重试的问题:" + queries[index])
-                        # 重新提交任务
-                        new_future = executor.submit(llm_call, queries[index], knowledge_name, file_id, result_queue, index, llm_type)
-                        future_to_query[new_future] = index
-                    else:
-                        print(f"Query {index} failed after {max_retries} attempts.")
-                        result_queue.put((index, None))  # 添加占位符
-
-    # 从队列中获取所有结果并按索引排序
+    # 初始化结果列表,确保按查询的索引顺序排列
     results = [None] * len(queries)
     while not result_queue.empty():
         index, result = result_queue.get()
         results[index] = result
-
-    # 检查是否所有结果都是 None
+    # 若所有结果均为 None,说明全部查询失败,直接返回空列表
+    # 如需保留 None 以标识失败的查询,可注释掉下面的过滤行
     if all(result is None for result in results):
         return []
-
-    # 过滤掉None值
     results = [r for r in results if r is not None]
     return results
 
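下面给出 multi_threading 新返回约定的一个最小调用示意(其中的问题文本与 file_id 均为假设的占位值,仅演示返回结构,不代表真实调用结果):

    # 示意:multi_threading 返回按提问顺序排列的 (question, answer) 元组列表,失败项已被过滤
    queries = ["该项目的项目名称是什么?", "该项目的开标时间是什么?"]  # 假设的问题
    results = multi_threading(queries, "", "file-fe-xxxx", llm_type=2)  # file_id 为假设占位符
    for question, answer in results:
        print(f"{question} -> {answer}")
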
diff --git a/flask_app/general/通义千问long.py b/flask_app/general/通义千问long.py
index 01c6b9d..c2a14f9 100644
--- a/flask_app/general/通义千问long.py
+++ b/flask_app/general/通义千问long.py
@@ -1,4 +1,6 @@
+import ast
 import json
+import re
 from functools import wraps
 
 from ratelimit import limits, sleep_and_retry
@@ -32,95 +34,182 @@ def shared_rate_limit(func):
         return func(*args, **kwargs)
     return wrapper
 @shared_rate_limit
-def qianwen_long(file_id, user_query):
-    print("call qianwen-long...")
+def qianwen_long(file_id, user_query, max_retries=2, backoff_factor=1.0):
     """
-    Uses a previously uploaded file to generate a response based on a user query.
+    基于上传的文件 ID 和用户查询生成响应,并在失败时自动重试。
+    参数:
+    - file_id: 上传文件的 ID
+    - user_query: 用户查询
+    - max_retries: 最大重试次数(默认 2 次)
+    - backoff_factor: 指数退避的基础等待时间(默认 1.0 秒)
     """
+    print("call qianwen_long...")
+
     client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
 
-    # Generate a response based on the file ID
-    completion = client.chat.completions.create(
-        model="qwen-long",
-        # top_p=0.5,
-        temperature=0.5,
-        # response_format={"type":"json_object"},
-        messages=[
-            {
-                'role': 'system',
-                'content': f'fileid://{file_id}'
-            },
-            {
-                'role': 'user',
-                'content': user_query
-            }
-        ],
-        stream=False
-    )
+    for attempt in range(1, max_retries + 2):  # +1 是为了包括初始调用
+        try:
+            # 调用 API
+            completion = client.chat.completions.create(
+                model="qwen-long",
+                temperature=0.5,
+                messages=[
+                    {
+                        'role': 'system',
+                        'content': f'fileid://{file_id}'
+                    },
+                    {
+                        'role': 'user',
+                        'content': user_query
+                    }
+                ],
+                stream=False
+            )
+            # 如果调用成功,返回响应内容
+            return completion.choices[0].message.content
+
+        except Exception as exc:
+            # 提取错误代码
+            error_code, error_code_string = extract_error_details(str(exc))
+            print(f"第 {attempt} 次尝试失败,查询:'{user_query}',错误:{exc}")
+
+            if error_code == 429:
+                if attempt <= max_retries:
+                    sleep_time = backoff_factor * (2 ** (attempt - 1))  # 指数退避
+                    print(f"错误代码为 429,将在 {sleep_time} 秒后重试...")
+                    time.sleep(sleep_time)
+                else:
+                    print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败(429 错误)。")
+            elif error_code == 400 and error_code_string in ['data_inspection_failed', 'ResponseTimeout','DataInspectionFailed','response_timeout']:
+                if attempt == 1:  # 只重试一次
+                    print(f"错误代码为 400 - {error_code_string},将立即重试...")
+                    continue  # 直接跳到下一次循环(即重试一次)
+                else:
+                    print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败(400 - {error_code_string})。")
+                    return ""
+
+            else:
+                # 对于非 429 和非特定 400 错误,不进行重试,直接返回空字符串
+                print(f"遇到非 429 或非 'data_inspection_failed' 的 400 错误(错误代码:{error_code}),不进行重试。")
+                return ""
+
+    # 所有尝试均失败时,返回空字符串
+    return ""
+
+
+def extract_error_details(error_message):
+    """
+    从错误消息中提取错误代码和内部错误代码。
+    假设错误消息的格式包含 'Error code: XXX - {...}'
+    """
+    # 提取数值型错误代码
+    error_code_match = re.search(r'Error code:\s*(\d+)', error_message)
+    error_code = int(error_code_match.group(1)) if error_code_match else None
+
+    # 提取内部错误代码字符串(如 'data_inspection_failed')
+    error_code_string = None
+    error_dict_match = re.search(r'Error code:\s*\d+\s*-\s*(\{.*\})', error_message)
+    if error_dict_match:
+        error_dict_str = error_dict_match.group(1)
+        try:
+            # 使用 ast.literal_eval 解析字典字符串
+            error_dict = ast.literal_eval(error_dict_str)
+            error_code_string = error_dict.get('error', {}).get('code')
+            # print(error_code_string)  # 调试输出,默认关闭
+        except Exception as e:
+            print(f"解析错误消息失败: {e}")
+
+    return error_code, error_code_string
+
 
-    # Return the response content
-    # return completion.choices[0].message.content,completion.usage
-    return completion.choices[0].message.content
 @shared_rate_limit
-def qianwen_long_stream(file_id, user_query):
-    print("调用 qianwen-long stream...")
+def qianwen_long_stream(file_id, user_query, max_retries=2, backoff_factor=1.0):
     """
     使用之前上传的文件,根据用户查询生成响应,并实时显示流式输出。
+    参数:
+    - file_id: 上传文件的 ID
+    - user_query: 用户查询
+    - max_retries: 最大重试次数(默认 2 次)
+    - backoff_factor: 指数退避的基础等待时间(默认 1.0 秒)
+    返回:
+    - Optional[str]: 成功时返回响应内容,失败时返回空字符串
     """
+    print("调用 qianwen-long stream...")
+
     client = OpenAI(
         api_key=os.getenv("DASHSCOPE_API_KEY"),
         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
     )
-    # 生成基于文件ID的响应
-    completion = client.chat.completions.create(
-        model="qwen-long",
-        temperature=0.4,
-        messages=[
-            {
-                'role': 'system',
-                'content': f'fileid://{file_id}'
-            },
-            {
-                'role': 'user',
-                'content': user_query
-            }
-        ],
-        stream=True  # 启用流式响应
-    )
+    for attempt in range(1, max_retries + 2):  # +1 是为了包括初始调用
+        try:
+            # 生成基于文件ID的响应
+            completion = client.chat.completions.create(
+                model="qwen-long",
+                temperature=0.4,
+                max_tokens=5000,
+                messages=[
+                    {
+                        'role': 'system',
+                        'content': f'fileid://{file_id}'
+                    },
+                    {
+                        'role': 'user',
+                        'content': user_query
+                    }
+                ],
+                stream=True  # 启用流式响应
+            )
 
-    full_response = ""  # 用于存储完整的响应内容
+            full_response = ""  # 用于存储完整的响应内容
 
-    try:
-        for chunk in completion:
-            # 假设chunk是一个对象,先将其转换为字典
-            if hasattr(chunk, 'to_dict'):
-                chunk_data = chunk.to_dict()
+            for chunk in completion:
+                if hasattr(chunk, 'to_dict'):
+                    chunk_data = chunk.to_dict()
+                else:
+                    chunk_data = json.loads(chunk.model_dump_json())
+                choices = chunk_data.get('choices', [])
+                if not choices:
+                    continue
+                choice = choices[0]
+                delta = choice.get('delta', {})
+                content = delta.get('content', '')
+                if content:
+                    full_response += content
+                    # print(content, end='', flush=True)  # 实时打印内容
+                if choice.get('finish_reason'):
+                    break
+
+            return full_response  # 返回完整的响应内容
+
+        except Exception as exc:
+            # 提取错误代码
+            error_code, error_code_string = extract_error_details(str(exc))
+            print(f"第 {attempt} 次尝试失败,查询:'{user_query}',错误:{exc}")
+
+            if error_code == 429:
+                if attempt <= max_retries:
+                    sleep_time = backoff_factor * (2 ** (attempt - 1))  # 指数退避
+                    print(f"错误代码为 429,将在 {sleep_time} 秒后重试...")
+                    time.sleep(sleep_time)
+                else:
+                    print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败(429 错误)。")
+            elif error_code == 400 and error_code_string in ['data_inspection_failed', 'ResponseTimeout',
+                                                             'DataInspectionFailed', 'response_timeout']:
+                if attempt == 1:  # 只重试一次
+                    print(f"错误代码为 400 - {error_code_string},将立即重试...")
+                    
continue # 直接跳到下一次循环(即重试一次) + else: + print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败(400 - {error_code_string})。") else: - # 如果没有to_dict方法,尝试直接转换为JSON - chunk_data = json.loads(chunk.model_dump_json()) - # 检查是否有'choices'字段 - choices = chunk_data.get('choices', []) - if not choices: - continue # 如果没有choices,跳过当前chunk - choice = choices[0] # 获取第一个choice - delta = choice.get('delta', {}) - # 提取'content'字段 - content = delta.get('content', '') - if content: - # print(content, end='', flush=True) # 实时打印内容 - full_response += content - # 检查是否有结束条件 - if choice.get('finish_reason'): - break - except KeyboardInterrupt: - print("\n中断流式响应。") - except Exception as e: - print(f"\n处理流式响应时出错: {e}") - return full_response # 返回完整的响应内容 + # 对于非 429 和非特定 400 错误,不进行重试,直接抛出异常 + print(f"遇到非 429 或非 'data_inspection_failed' 的 400 错误(错误代码:{error_code}),不进行重试。") + + # 如果所有尝试都失败了,返回空字符串 + return "" @shared_rate_limit def qianwen_long_text(file_id, user_query): diff --git a/flask_app/testdir/test3.py b/flask_app/testdir/test3.py index f5c4b4c..4ed3064 100644 --- a/flask_app/testdir/test3.py +++ b/flask_app/testdir/test3.py @@ -1,168 +1,3 @@ -import json -import re - - -def insert_missing_commas(json_str): - """ - 使用正则表达式在缺失逗号的位置插入逗号。 - 具体来说,寻找一个值的结束引号后紧跟着下一个键的开始引号,并在中间插入逗号。 - """ - # 这个正则匹配一个字符串结尾的引号,可能有空白字符,然后是另一个键的引号 - pattern = r'(":\s*"[^"]*)"\s*(")' - replacement = r'\1", \2' - - previous_str = None - while previous_str != json_str: - previous_str = json_str - json_str = re.sub(pattern, replacement, json_str) - - return json_str - - -def fix_json_escape_sequences(json_str): - """ - 修复 JSON 字符串中的非法转义序列。 - 将所有不符合 JSON 规范的反斜杠进行转义。 - """ - # JSON 中合法的转义字符 - valid_escapes = ['"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u'] - - # 使用正则表达式找到所有反斜杠 - # 如果反斜杠后面不是合法的转义字符,则进行转义 - pattern = re.compile(r'\\(?!["\\/bfnrtu])') - fixed_str = pattern.sub(r'\\\\', json_str) - return fixed_str - - -def replace_latex_expressions(json_str): - """ - 替换 JSON 字符串中的 LaTeX 风格表达式。 - 例如,将 $三 \geq 2 m$ 替换为 三 ≥2米 - """ - # 定义 LaTeX 符号到 Unicode 字符的映射 - latex_mapping = { - r'\geq': '≥', - r'\leq': '≤', - r'\times': '×', - r'\frac': '/', # 简单处理分数 - r'\neq': '≠', - r'\approx': '≈', - r'\pm': '±', - r'\alpha': 'α', - r'\beta': 'β', - r'\gamma': 'γ', - r'\delta': 'δ', - r'\pi': 'π', - r'\sqrt': '√', - r'\infty': '∞', - r'\cup': '∪', - r'\cap': '∩', - r'\subseteq': '⊆', - r'\supseteq': '⊇', - r'\forall': '∀', - r'\exists': '∃', - r'\rightarrow': '→', - r'\leftarrow': '←', - # 添加更多需要的映射 - } - - # 处理每一个 LaTeX 表达式 - def replace_match(match): - expr = match.group(1) - for latex, char in latex_mapping.items(): - expr = expr.replace(latex, char) - # 替换单位符号,例如 ' m' 到 '米', ' s' 到 '秒', 等等 - expr = re.sub(r'(\d+)\s*m', r'\1米', expr) - expr = re.sub(r'(\d+)\s*s', r'\1秒', expr) - expr = re.sub(r'(\d+)\s*kg', r'\1公斤', expr) - expr = re.sub(r'(\d+)\s*A', r'\1安', expr) # 例如电流单位安培 - # 继续添加更多单位 - return expr - - # 替换所有 $...$ 包围的内容 - fixed_str = re.sub(r'\$(.*?)\$', replace_match, json_str) - return fixed_str - - -def extract_content_from_json(string): - """ - 输入字符串,尝试解析 JSON 数据: - 1. 尝试直接解析原始 JSON。 - 2. 
如果直接解析失败,按顺序使用不同修复方法尝试解析。 - """ - if not string.strip(): - return {} - - # 提取第一个匹配的 JSON 对象 - match = re.search(r'\{[\s\S]*\}', string) - if not match: - print("未找到有效的 JSON 内容。") - return {} - - original_json = match.group(0) - - # 尝试直接解析原始 JSON 数据 - try: - parsed = json.loads(original_json) - print("直接解析原始 JSON 成功。") - return parsed - except json.JSONDecodeError as original_error: - print(f"直接解析原始 JSON 失败: {original_error}") - - # 方法1:逗号修复 - try: - fixed_json1 = insert_missing_commas(original_json) - parsed = json.loads(fixed_json1) - print("使用方法1:逗号修复成功。") - return parsed - except json.JSONDecodeError as error1: - print(f"方法1(逗号修复)解析失败: {error1}") - - # 方法2:LaTeX 表达式替换 - try: - fixed_json2 = replace_latex_expressions(original_json) - parsed = json.loads(fixed_json2) - print("使用方法2:LaTeX 表达式替换成功。") - return parsed - except json.JSONDecodeError as error2: - print(f"方法2(LaTeX 替换)解析失败: {error2}") - - # 方法3:非法转义序列修复 - try: - fixed_json3 = fix_json_escape_sequences(original_json) - parsed = json.loads(fixed_json3) - print("使用方法3:非法转义序列修复成功。") - return parsed - except json.JSONDecodeError as error3: - print(f"方法3(非法转义修复)解析失败: {error3}") - - # 如果所有方法都失败,返回空字典 - print("所有修复方法均失败,返回空字典。") - return {} - - -def replace_latex_expressions_in_dict(obj): - """ - 递归遍历字典或列表,替换其中的 LaTeX 表达式。 - """ - if isinstance(obj, dict): - return {k: replace_latex_expressions_in_dict(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [replace_latex_expressions_in_dict(item) for item in obj] - elif isinstance(obj, str): - # 仅处理被 $...$ 包围的内容 - def replace_match(match): - expr = match.group(1) - return replace_latex_expressions(expr) - - # 替换所有 $...$ 包围的内容 - return re.sub(r'\$(.*?)\$', replace_match, obj) - else: - return obj -test_json_1 = """ - { - "示例": "速度为 $v = 3 \\times 10^2 m/s$,加速度为 $a \\geq 9.8 m/s^2$。" - } - """ -res=extract_content_from_json(test_json_1) -print(res) \ No newline at end of file +data="" +if "哈哈" in data: + print("yes") \ No newline at end of file diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index a557dd7..d0c5dfd 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -2,18 +2,19 @@ import json import re from PyPDF2 import PdfReader - +import textwrap from flask_app.general.doubao import read_txt_to_string, pdf2txt -from flask_app.general.json_utils import combine_json_results,clean_json_string -from flask_app.general.通义千问long import upload_file,qianwen_long_stream +from flask_app.general.json_utils import combine_json_results, clean_json_string +from flask_app.general.通义千问long import upload_file, qianwen_long_stream from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content +import concurrent.futures from flask_app.general.doubao import doubao_model -#正则表达式判断原文中是否有商务、服务、其他要求 -def find_exists(truncate_file, required_keys): - if not truncate_file: - return ["技术要求", "商务要求", "服务要求", "其他要求"] +# 正则表达式判断原文中是否有商务、服务、其他要求 +def find_exists(truncate_file, required_keys): + # if not truncate_file: + # return ["技术要求", "商务要求", "服务要求", "其他要求"] common_header = extract_common_header(truncate_file) # 假设该函数已定义 pdf_document = PdfReader(truncate_file) @@ -33,31 +34,44 @@ def find_exists(truncate_file, required_keys): end_pattern = re.compile( r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) - # 遍历所有页面,拼接全文 - text = "" - for page in pdf_document.pages: - page_text = page.extract_text() or "" - cleaned_text = clean_page_content(page_text, common_header) - text += cleaned_text + "\n" + # 只处理第一页和最后一页 + first_page = 
pdf_document.pages[0].extract_text() or "" + last_page = pdf_document.pages[-1].extract_text() or "" - # 匹配起始位置 - start_match = re.search(begin_pattern, text) + # 清理页面内容 + first_page_clean = clean_page_content(first_page, common_header) + last_page_clean = clean_page_content(last_page, common_header) + + # 在第一页寻找起始位置 + start_match = re.search(begin_pattern, first_page_clean) if not start_match: - print("未找到开始模式") - return [] - - start_index = start_match.end() - - # 匹配结束位置 - end_match = re.search(end_pattern, text[start_index:]) - if end_match: - end_index = start_index + end_match.start() - relevant_text = text[start_index:end_index] + print("未找到开始模式,返回完整第一页") + start_index = 0 + first_content = first_page_clean else: - relevant_text = text[start_index:] + start_index = start_match.end() + first_content = first_page_clean[start_index:] - # 保留换行,避免结构丢失 + # 在最后一页寻找结束位置 + end_match = re.search(end_pattern, last_page_clean) + if not end_match: + print("未找到结束模式,返回完整最后一页") + last_content = last_page_clean + else: + last_content = last_page_clean[:end_match.start()] + + # 获取中间页面的内容 + middle_content = "" + if len(pdf_document.pages) > 2: + for page_num in range(1, len(pdf_document.pages) - 1): + page_text = pdf_document.pages[page_num].extract_text() or "" + cleaned_text = clean_page_content(page_text, common_header) + middle_content += cleaned_text + "\n" + + # 组合所有内容 + relevant_text = first_content + "\n" + middle_content + "\n" + last_content relevant_text = re.sub(r'\s+', ' ', relevant_text) + # print(f"提取的内容范围:\n{relevant_text}") # 匹配所需的要求 @@ -90,6 +104,7 @@ def find_exists(truncate_file, required_keys): return clean_requirements + def generate_queries(truncate_file, required_keys): key_list = find_exists(truncate_file, required_keys) queries = [] @@ -104,48 +119,47 @@ def generate_queries(truncate_file, required_keys): # print(query_base) return queries -def generate_user_query_template(required_keys,processed_filepath): - import textwrap - import json - - # 定义所有可能的键 - all_possible_keys = ["技术要求", "服务要求", "商务要求", "其他要求", "技术、服务要求","总体要求","进度要求","培训要求"] +def generate_template(required_keys, type=1): # 定义每个键对应的示例内容 example_content1 = { "技术要求": ["相关技术要求1", "相关技术要求2"], - "服务要求": ["服务要求1", "服务要求2"], + "服务要求": ["服务要求1", "服务要求2", "服务要求3"], + "商务要求": ["商务要求1", "商务要求2"], + "其他要求": { + "子因素名1": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2..."], + "子因素名2": ["关于项目采购的其他要求3..."] + }, + "技术、服务要求": ["相关技术、服务要求内容1", "相关技术、服务要求内容2", "相关技术、服务要求内容3"] + } + example_content2 = { + "技术要求": { + "子因素名1": ["相关技术要求1", "相关技术要求2"] + }, + "服务要求": { + "子因素名1": ["服务要求1"], + "子因素名2": ["服务要求2", "服务要求3"] + }, "商务要求": { "子因素名1": ["商务要求1"], "子因素名2": ["商务要求2"] }, - "其他要求": { - "子因素名1": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2..."], - "子因素名2": ["关于项目采购的其他要求3...", "关于项目采购的其他要求4..."] - }, - "技术、服务要求": ["相关技术、服务要求内容1", "相关技术、服务要求内容2"] - } - - example_content2 = { - "技术要求": { - "子因素名1": ["相关技术要求1", "相关技术要求2"], - "子因素名2": ["相关技术要求3"] - }, - "服务要求": { - "子因素名1": ["相关服务要求1", "相关服务要求2"], - "子因素名2": ["相关服务要求3", "相关服务要求4"] - }, - "商务要求": ["商务要求1", "商务要求2"], - "其他要求": ["关于项目采购的其他要求1..."], + "其他要求": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2...", "关于项目采购的其他要求3..."], "技术、服务要求": { "子因素名1": ["相关技术、服务要求内容1"], "子因素名2": ["相关技术、服务要求内容2", "相关技术、服务要求内容3"] } } - # 将 required_keys 转换为集合以便于操作 keys = set(required_keys) - + type_to_keys_map = { + 1: ["服务要求", "商务要求", "其他要求"], + 2: ["技术要求", "技术、服务要求"] + } + # 根据 type 获取对应的 all_possible_keys + chosen_keys = type_to_keys_map.get(type, []) + another_keys_list = type_to_keys_map.get(3 - type, []) # 3 - type 将 type 1 映射到 2,反之亦然 + 
another_keys_str = ', '.join([f"'{key}'" for key in another_keys_list])
     # 处理互斥关系:如果 "技术要求" 和 "服务要求" 同时存在,则移除 "技术、服务要求"
     if "技术要求" in keys and "服务要求" in keys:
         keys.discard("技术、服务要求")
@@ -155,59 +169,62 @@
         keys.discard("服务要求")
 
     # 确保 keys 中只包含允许的键
-    keys = keys.intersection(all_possible_keys)
-
+    keys = keys.intersection(chosen_keys)
     # 按照预定义的顺序排序键,以保持一致性
-    sorted_keys = [key for key in all_possible_keys if key in keys]
+    sorted_keys = [key for key in chosen_keys if key in keys]
 
-    # 如果没有任何键被选中,返回一个默认的模板或抛出异常
+    # 如果没有任何键被选中,返回""
     if not sorted_keys:
-        raise ValueError("required_keys 中没有有效的键。")
+        return ""
 
-    # 生成提示部分,根据 sorted_keys 动态构建
-    keys_str = '、'.join(sorted_keys)
-    outer_keys_str = ', '.join([f"'{key}'" for key in sorted_keys])
-
-    # 使用三引号定义多行字符串,便于编辑和维护
-    prompt_instruction = textwrap.dedent(f"""请你根据该货物类招标文件中的采购要求部分内容(技术、服务及商务要求部分内容),请告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增添内容。
+    # 生成模板的通用部分
+    def generate_prompt_instruction(keys_str, outer_keys_str, another_keys_str, type):
+        if type == 1:
+            specific_instructions = textwrap.dedent(
+                """4. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外,还有带三角▲或五角星★的描述内容(如工期要求、质保要求等商务要求),请将该部分内容提取出来,添加在外层键名为'商务要求'的键值部分。
+                5. 在提取'服务要求'的时候,通常包含'售后、维护、培训'等要求,若原文中有这些要求,请一并提取置于'服务要求'的键值中。
+                """
+            )
+        else:
+            specific_instructions = "4. 在提取技术要求或技术、服务要求时,你无需从采购清单或表格中提取技术要求以及参数要求,你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容;若内容全在表格中,键值为空列表[]。"
+        return textwrap.dedent(
+            f"""请你根据该货物类招标文件中的采购要求部分内容,告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,可以按原文中的序号作划分(若有序号的话),请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增添内容及序号。请不要提取{another_keys_str}中的内容。
 要求与指南:
-    1. 默认情况无需嵌套,键值为字符串列表;若存在嵌套结构,嵌套键名是原文中该要求下相应子标题,最多一层嵌套。
-    2. JSON 的结构要求:
-        - 外层键名为 {outer_keys_str} 中的各项。
+    1. JSON 的结构要求:
+        - 默认情况无需嵌套键值对,键值为字符串列表;若存在嵌套结构(即有明确标题表示各子要求),则嵌套键名是原文中该要求下相应子标题,最多一层嵌套。
         - 每个外层键对应的值可以是:
-            a. 一个对象(字典),其键为子因素名,值为字符串列表。
-            b. 一个字符串列表,表示具体的一条条要求。若只有一条要求,也用字符串列表表示。
+            a. 一个字符串列表,表示具体的一条条要求。若只有一条要求,也用字符串列表表示。
+            b. 一个对象(字典),其键为子因素名,值为字符串列表。
         - 最多只允许一层嵌套。
-    3. 请优先定位正文部分的大标题'xx要求',在其之后提取'xx要求'相关内容,
-    4. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外,还有带三角▲或五角星★的描述内容(如工期要求、质保要求等商务要求),请将该部分内容提取出来,添加在键名为'商务要求'的字典的键值部分,注意请不要返回Markdown语法,必要时使用冒号':'将相关信息拼接在一起。
-    5. 在提取技术要求或技术、服务要求时(若有),你无需从采购清单或表格中提取货物名以及参数要求,你仅需定位到原文中大标题'技术要求'或'技术、服务要求'部分提取正文内容,若内容全在表格中,键值为空列表[]。
-    6. 若无相关要求,键值为[]
-    """ )
+    2. 请优先且准确定位正文部分包含以下关键字的标题:{outer_keys_str},在其之后提取'XX要求'相关内容,尽量避免在无关地方提取内容。
+    3. 
注意请不要返回Markdown语法,必要时使用冒号':'将相关信息拼接在一起。若文档中无符合的相关要求,键值为空列表[] + {specific_instructions} + """) - # 过滤 example_content1 和 example_content2 以仅包含 sorted_keys - def filter_content(example_content, keys): + # 过滤示例内容 + def filter_example_content(example_content, keys): return {k: v for k, v in example_content.items() if k in keys} - filtered_example_content1 = filter_content(example_content1, sorted_keys) - filtered_example_content2 = filter_content(example_content2, sorted_keys) + def format_example(example_content): + return json.dumps(example_content, indent=4, ensure_ascii=False) - # 将过滤后的示例转换为格式化的 JSON 字符串 - json_example1_str = json.dumps(filtered_example_content1, indent=4, ensure_ascii=False) - json_example2_str = json.dumps(filtered_example_content2, indent=4, ensure_ascii=False) - # 从文件中读取内容 - # full_text = read_txt_to_string(processed_filepath) + filtered_example_content1 = filter_example_content(example_content1, sorted_keys) + filtered_example_content2 = filter_example_content(example_content2, sorted_keys) + tech_json_example1_str = format_example(filtered_example_content1) + tech_json_example2_str = format_example(filtered_example_content2) + keys_str = '、'.join(sorted_keys) + outer_keys_str = ', '.join([f"'{key}'" for key in sorted_keys]) + prompt_instruction = generate_prompt_instruction(keys_str, outer_keys_str, another_keys_str, type) # 完整的用户查询模板,包含两份示例输出 user_query_template = f""" -{prompt_instruction} -以下为示例输出,仅供格式参考: -示例 1: -{json_example1_str} -示例 2: -{json_example2_str} - -""" - # 文本内容:{full_text} + {prompt_instruction} + 以下为示例输出,仅供格式参考: + 示例 1: + {tech_json_example1_str} + 示例 2: + {tech_json_example2_str} + """ return user_query_template def merge_requirements(input_dict): @@ -246,30 +263,45 @@ def merge_requirements(input_dict): final_dict[key] = final_dict[key].strip() return final_dict -#,"总\s*体\s*要\s*求","进\s*度\s*要\s*求","培\s*训\s*要\s*求" -def get_business_requirements(procurement_path,processed_filepath): - file_id=upload_file(procurement_path) - required_keys = ["技\s*术\s*要\s*求","商\s*务\s*要\s*求", "服\s*务\s*要\s*求", "其\s*他\s*要\s*求"] - contained_keys=find_exists(procurement_path,required_keys) + + +# ,"总\s*体\s*要\s*求","进\s*度\s*要\s*求","培\s*训\s*要\s*求" +def get_business_requirements(procurement_path,procurement_docpath): + file_id = upload_file(procurement_docpath) + print(file_id) + required_keys = ["技\s*术\s*要\s*求", "商\s*务\s*要\s*求", "服\s*务\s*要\s*求", "其\s*他\s*要\s*求"] + contained_keys = find_exists(procurement_path, required_keys) print(contained_keys) if not contained_keys: return {} # queries = generate_queries(truncate_file, contained_keys) - user_query=generate_user_query_template(contained_keys,processed_filepath) - # print(user_query) - model_res=qianwen_long_stream(file_id,user_query) - # model_res=doubao_model(user_query) - # Combine and fill missing keys with default values - final_res = clean_json_string(model_res) - # final_res.update({key: final_res.get(key, "") for key in required_keys}) + busi_user_query = generate_template(contained_keys, 1) + tech_user_query = generate_template(contained_keys, 2) + final_res={} + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + futures = [] + if busi_user_query: + futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1)) + if tech_user_query: + futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1)) + # 获取结果 + for future in concurrent.futures.as_completed(futures): + try: + result = future.result() + if result: # 确保结果不为空 + 
final_res.update(clean_json_string(result)) + except Exception as e: + print(f"An error occurred: {e}") return final_res -#TODO:改为先判断,再摘取 + +# TODO:改为先判断,再摘取 if __name__ == "__main__": # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx" - truncate_file=r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a\ztbfile_procurement.pdf" - + truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\交警支队机动车查验监管系统项目采购_procurement.pdf" + docx_path=r'C:\Users\Administrator\Desktop\货物标\output1\交警支队机动车查验监管系统项目采购_procurement.docx' + # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf" # file_id = upload_file(truncate_file) processed_filepath = pdf2txt(truncate_file) - res=get_business_requirements(truncate_file,processed_filepath) - print(json.dumps(res, ensure_ascii=False, indent=4)) + final_res= get_business_requirements(truncate_file, docx_path) + print(json.dumps(final_res, ensure_ascii=False, indent=4)) diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index ae29084..b247f07 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -308,8 +308,8 @@ def combine_and_update_results(original_data, updates): #文件内容以markdown格式组织,其中表格部分(若有)以html语法组织, def get_technical_requirements(file_path,invalid_path,processed_filepath): - docx_file_path=pdf2docx(file_path) - file_id=upload_file(docx_file_path) + # docx_file_path=pdf2docx(file_path) + file_id=upload_file(file_path) #docx first_query_template="该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'" #防止截取失败 judge_res=qianwen_long(file_id,first_query_template) prompt_template1 = ''' diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index 19566be..baa0f33 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -4,14 +4,15 @@ import time from flask_app.general.doubao import pdf2txt from flask_app.general.file2markdown import convert_pdf_to_markdown +from flask_app.general.format_change import pdf2docx from flask_app.货物标.技术参数要求提取 import get_technical_requirements from flask_app.general.通义千问long import upload_file from flask_app.货物标.商务服务其他要求提取 import get_business_requirements -#获取采购清单 +# 获取采购清单 def fetch_procurement_reqs(procurement_path, invalid_path): - # procurement_docpath = pdf2docx(procurement_path) # 采购需求docx + procurement_docpath = pdf2docx(procurement_path) # 采购需求docx # 定义默认的 procurement_reqs 字典 DEFAULT_PROCUREMENT_REQS = { "采购需求": "", @@ -30,10 +31,12 @@ def fetch_procurement_reqs(procurement_path, invalid_path): processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: + # 提交任务给线程池 - future_technical = executor.submit(get_technical_requirements, procurement_path, invalid_path,processed_filepath) + future_technical = executor.submit(get_technical_requirements, procurement_docpath, invalid_path, + processed_filepath) time.sleep(0.5) # 保持原有的延时 - future_business = executor.submit(get_business_requirements, procurement_path,processed_filepath) + future_business = executor.submit(get_business_requirements, procurement_path, procurement_docpath) # 获取并行任务的结果 technical_requirements = future_technical.result() @@ -60,16 +63,16 @@ def fetch_procurement_reqs(procurement_path, invalid_path): return DEFAULT_PROCUREMENT_REQS.copy() -#TODO:技术要求可以在技术参数之后执行,把完整的技术参数输入,问大模型,除了上述内容还有哪些,这样的话把技术标和其他的区分开。 -#TODO: 
094有问题 +# TODO:技术要求可以在技术参数之后执行,把完整的技术参数输入,问大模型,除了上述内容还有哪些,这样的话把技术标和其他的区分开。 +# TODO: 094有问题 if __name__ == "__main__": - start_time=time.time() + start_time = time.time() output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output" # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_procurement.pdf" - procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\招标文件(107国道)_procurement.pdf" - procurement_docpath=r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a" - invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf" - res=fetch_procurement_reqs(procurement_path,invalid_path) + procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf" + procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a" + invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf" + res = fetch_procurement_reqs(procurement_path, invalid_path) print(json.dumps(res, ensure_ascii=False, indent=4)) - end_time=time.time() - print("耗时:"+str(end_time-start_time)) + end_time = time.time() + print("耗时:" + str(end_time - start_time)) diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index db69b4e..f519eee 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -213,6 +213,7 @@ def post_process_baseinfo(base_info,logger): logger.error(f"Error in post_process_baseinfo: {e}") return base_info, [] # 返回空列表 +#TODO:错误处理,通过返回值completion来错误处理,而不是正则表达 学习装饰器、 整体后处理 def goods_bid_main(output_folder, file_path, file_type, unique_id): logger = get_global_logger(unique_id) # 预处理文件,获取处理后的数据 From 8078651aad9c7d484cc012c5080a59c8a9a9c1cf Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 28 Nov 2024 11:57:32 +0800 Subject: [PATCH 2/8] =?UTF-8?q?11.27=20=E6=B8=85=E7=90=86file=5Fid?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/接口_小解析.py | 2 +- flask_app/general/清除file_id.py | 81 +++- flask_app/general/通义千问long.py | 25 +- .../old_version/不得存在及禁止投标情形.py | 172 ------- flask_app/old_version/回答来源.py | 216 --------- flask_app/old_version/基础信息整合.py | 166 ------- flask_app/old_version/多线程分类.py | 373 --------------- flask_app/old_version/废标项.py | 66 --- flask_app/old_version/招标文件解析.py | 305 ------------ flask_app/old_version/文件分类普通版.py | 350 -------------- .../文档理解大模型版知识库处理/删除知识库.py | 55 --- .../文档理解大模型版知识库处理/知识库操作.py | 58 --- .../调用文件解析状态查询.py | 33 -- .../调用文档解析异步提交.py | 40 -- .../调用文档解析结果获取.py | 35 -- .../old_version/无效标和废标和禁止投标整合.py | 440 ------------------ flask_app/old_version/解析old.py | 221 --------- flask_app/old_version/资格审查模块old.py | 42 -- flask_app/old_version/资格评审old.py | 182 -------- flask_app/old_version/资格评审前判断.py | 29 -- flask_app/old_version/通义千问.py | 48 -- flask_app/start_up.py | 28 +- flask_app/货物标/商务服务其他要求提取.py | 7 +- 23 files changed, 106 insertions(+), 2868 deletions(-) delete mode 100644 flask_app/old_version/不得存在及禁止投标情形.py delete mode 100644 flask_app/old_version/回答来源.py delete mode 100644 flask_app/old_version/基础信息整合.py delete mode 100644 flask_app/old_version/多线程分类.py delete mode 100644 flask_app/old_version/废标项.py delete mode 100644 flask_app/old_version/招标文件解析.py delete mode 100644 flask_app/old_version/文件分类普通版.py delete mode 100644 flask_app/old_version/文档理解大模型版知识库处理/删除知识库.py delete mode 100644 
flask_app/old_version/文档理解大模型版知识库处理/知识库操作.py
 delete mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询.py
 delete mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交.py
 delete mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取.py
 delete mode 100644 flask_app/old_version/无效标和废标和禁止投标整合.py
 delete mode 100644 flask_app/old_version/解析old.py
 delete mode 100644 flask_app/old_version/资格审查模块old.py
 delete mode 100644 flask_app/old_version/资格评审old.py
 delete mode 100644 flask_app/old_version/资格评审前判断.py
 delete mode 100644 flask_app/old_version/通义千问.py

diff --git a/flask_app/general/接口_小解析.py b/flask_app/general/接口_小解析.py
index 7a25baf..0fc9bda 100644
--- a/flask_app/general/接口_小解析.py
+++ b/flask_app/general/接口_小解析.py
@@ -12,7 +12,7 @@ from flask_app.货物标.基础信息解析main import aggregate_basic_info_good
 from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
 from flask_app.main.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
 from flask_app.general.post_processing import inner_post_processing
-from flask_app.old_version.基础信息整合 import aggregate_basic_info_engineering
+from flask_app.old_version.基础信息整合_old import aggregate_basic_info_engineering
 
 
 def get_global_logger(unique_id):
diff --git a/flask_app/general/清除file_id.py b/flask_app/general/清除file_id.py
index cab29ba..089f95b 100644
--- a/flask_app/general/清除file_id.py
+++ b/flask_app/general/清除file_id.py
@@ -8,18 +8,75 @@ client = OpenAI(
     base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
 )
 
-# 获取文件列表
-file_stk = client.files.list()
-# 将文件信息解析为 JSON 格式
-file_data = json.loads(file_stk.model_dump_json())
+def delete_all_files():
+    """
+    查询所有文件并删除。
+    """
+    try:
+        # 获取文件列表
+        file_stk = client.files.list()
 
-# 提取所有文件的 id
-file_ids = [file["id"] for file in file_data["data"]]
+        # 将文件信息解析为 JSON 格式
+        file_data = json.loads(file_stk.model_dump_json())
 
-# num=len(file_ids)
-# print(num)
-# 循环删除每个文件
-for file_id in file_ids:
-    file_object = client.files.delete(file_id)
-    print(file_object.model_dump_json())
+        # 提取所有文件的 id
+        file_ids = [file["id"] for file in file_data["data"]]
+
+        # 循环删除每个文件
+        for file_id in file_ids:
+            file_object = client.files.delete(file_id)
+            print(f"Deleted file with id: {file_id} - {file_object.model_dump_json()}")
+
+    except Exception as e:
+        print(f"An error occurred while deleting files: {e}")
+
+
+def read_file_ids(output_folder):
+    file_ids = []
+    file_ids_path = os.path.join(output_folder, 'file_ids.txt')
+    # 检查文件是否存在
+    if os.path.exists(file_ids_path):
+        try:
+            with open(file_ids_path, 'r', encoding='utf-8') as file:
+                # 按行读取文件内容,并跳过空行
+                file_ids = [line.strip() for line in file if line.strip()]
+                print(f"读取到的文件 ID 列表:{file_ids}")
+        except Exception as e:
+            print(f"读取 file_ids.txt 文件时发生错误: {e}")
+    else:
+        print(f"文件 {file_ids_path} 不存在。")
+
+    return file_ids
+
+
+def delete_file_by_ids(file_ids):
+    """
+    根据传入的 file_id 列表删除指定的文件。
+
+    :param file_ids: 一个包含文件 ID 的字符串列表
+    """
+    if not isinstance(file_ids, list):
+        print("Error: file_ids should be a list.")
+        return
+
+    if not all(isinstance(file_id, str) for file_id in file_ids):
+        print("Error: Each file_id should be a string.")
+        return
+
+    try:
+        # 删除指定文件
+        for file_id in file_ids:
+            try:
+                # 假设 client.files.delete 会返回一个文件对象
+                file_object = client.files.delete(file_id)
+                # print(f"Deleted file with id: {file_id} - {file_object.model_dump_json()}")
+            except Exception as e:
+                # 处理删除单个文件时的异常
+                print(f"Failed to delete file with id {file_id}: {e}")
+
+    except Exception as e:
+        print(f"An error occurred while processing the file_ids: {e}")
+
+if __name__ == '__main__':
+    delete_all_files()
\ No newline at end of file
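上面新增的 delete_all_files / read_file_ids / delete_file_by_ids 与下文 upload_file 的改动配合,构成完整的 file_id 生命周期。下面是一个最小使用示意(其中的目录与文件路径均为假设的占位值,仅演示调用顺序):

    # 示意:上传时记录 file_id,解析完成后统一清理
    output_folder = "/tmp/parse_output"                       # 假设的输出目录
    file_id = upload_file("/tmp/ztbfile.pdf", output_folder)  # 上传并把 file_id 追加写入 file_ids.txt
    # ... 使用 qianwen_long(file_id, ...) 等完成提问 ...
    ids = read_file_ids(output_folder)   # 读取本次运行记录的所有 file_id
    delete_file_by_ids(ids)              # 逐个删除云端文件,避免配额残留
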
diff --git a/flask_app/general/通义千问long.py b/flask_app/general/通义千问long.py
index c2a14f9..67cfbe8 100644
--- a/flask_app/general/通义千问long.py
+++ b/flask_app/general/通义千问long.py
@@ -1,8 +1,9 @@
 import ast
 import json
 import re
+import threading
 from functools import wraps
 
 from ratelimit import limits, sleep_and_retry
 import random
 import time
 from pathlib import Path
 from openai import OpenAI
 import os
 
-def upload_file(file_path):
+file_write_lock = threading.Lock()
+def upload_file(file_path, output_folder=""):
     """
     Uploads a file to DashScope and returns the file ID.
+    Additionally, saves the file ID to 'file_ids.txt' in the given output folder.
     """
+    if not output_folder:
+        output_folder = os.path.dirname(file_path)
     client = OpenAI(
         api_key=os.getenv("DASHSCOPE_API_KEY"),
         base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
     )
+
+    # 上传文件并获取 file_id
     file = client.files.create(file=Path(file_path), purpose="file-extract")
-    return file.id
+    file_id = file.id
+
+    # 创建output_folder路径,如果它不存在
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+
+    # 确保文件写入是互斥的
+    with file_write_lock:  # 在这个代码块中,其他线程无法进入
+        file_ids_path = os.path.join(output_folder, 'file_ids.txt')
+        # 以追加模式写入 file_id(文件不存在时会自动创建)
+        with open(file_ids_path, 'a') as f:
+            f.write(f'{file_id}\n')
+
+    return file_id
 
 @sleep_and_retry
 @limits(calls=4, period=1)  # 每秒最多调用4次
diff --git a/flask_app/old_version/不得存在及禁止投标情形.py b/flask_app/old_version/不得存在及禁止投标情形.py
deleted file mode 100644
index 9a4fe1b..0000000
--- a/flask_app/old_version/不得存在及禁止投标情形.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import json
-import os
-import re
-
-from PyPDF2 import PdfWriter, PdfReader
-
-from flask_app.general.通义千问long import upload_file, qianwen_long
-from flask_app.general.通用功能函数 import process_string_list
-
-
-def extract_and_format_from_paths(json_paths, includes, excludes):
-    """
-    从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。
-
-    参数:
-    json_paths (list): 包含多个 JSON 文件路径的列表。
-    includes (list): 包含要检查的关键词的列表。
-    excludes (list): 包含要排除的关键词的列表。
-
-    返回:
-    list: 包含所有文件中满足条件的格式化字符串列表。
-    """
-    all_formatted_results = []
-
-    # 遍历每个文件路径
-    for path in json_paths:
-        # 检查路径是否为空或不存在
-        if not path or not os.path.isfile(path):
-            print(f"禁止投标情形: Error: The file path '{path}' is invalid or does not exist.")
-            continue  # 跳过无效路径
-        try:
-            with open(path, 'r', encoding='utf-8') as file:
-                # 加载 JSON 数据
-                json_data = json.load(file)
-                formatted_results = []
-
-                # 遍历 JSON 数据的每个键值对
-                for key, value in json_data.items():
-                    if isinstance(value, dict):
-                        # 如果值是字典,检查嵌套字典的每个键值对
-                        for sub_key, sub_value in value.items():
-                            if any(include in sub_key for include in includes):
-                                # 如果子值包含关键词,格式化并添加到结果列表
-                                formatted_results.append(f"{sub_value}")
-                    elif isinstance(value, str):  # clause
-                        # 检查是否包含任何 include 关键词
-                        for include in includes:
-                            if include in value:
-                                # 找到 include 之前的内容
-                                prefix = value.split(include)[0]
-                                # 检查 prefix 是否不包含任何 exclude 关键词
-                                if not any(exclude in prefix for exclude in excludes):
-                                    # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表
-                                    if '\n' in value:
-                                        value = value.split('\n', 1)[-1]
-                                    formatted_results.append(value)
-                                    break  # 找到一个符合条件的就跳出循环
-
-                # 将当前文件的结果添加到总结果列表
-                all_formatted_results.extend(formatted_results)
-        except FileNotFoundError:
-            print(f"禁止投标情形: Error: The file '{path}' does not exist.")
-        except json.JSONDecodeError:
-            print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
-
-    return all_formatted_results
-
-def 
extract_unique_items_from_texts(texts): - # 更新正则表达式以包括更广泛的序号类型,包括中文序号 - pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*') - intro_pattern = re.compile(r'^.*?[::]') - punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$') - url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') - content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字 - - all_results = [] - seen = set() - - for text in texts: - # 去除文本中的制表符和换行符 - text = text.replace('\t', '').replace('\n', '') - - # 删除引导性的文本(直到冒号,但保留冒号后的内容) - text = intro_pattern.sub('', text) - - # 替换URL为占位符,并保存URL以便后续还原 - urls = [] - def url_replacer(match): - urls.append(match.group(0)) - return f"{{URL{len(urls)}}}" - text = url_pattern.sub(url_replacer, text) - - # 使用数字和括号的模式分割文本 - items = pattern.split(text) - - for item in items: - cleaned_item = item.strip() - if cleaned_item: - # 进一步清理每个条目 - cleaned_item = pattern.sub('', cleaned_item) - cleaned_item = punctuation_pattern.sub('', cleaned_item) - cleaned_item = cleaned_item.strip() - - # 还原URL - for i, url in enumerate(urls, 1): - cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url) - - # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符 - if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item): - seen.add(cleaned_item) - all_results.append(cleaned_item) - - return all_results - -def merge_pdfs(paths, output_filename): - pdf_writer = PdfWriter() - output_path = None - - for path in paths: - pdf_reader = PdfReader(path) - for page in range(len(pdf_reader.pages)): - # 将每页添加到写入对象中 - pdf_writer.add_page(pdf_reader.pages[page]) - - if output_path is None: - # 确定输出文件路径 - output_path = os.path.join(os.path.dirname(path), output_filename) - - # 写入合并的PDF到文件 - if output_path: - with open(output_path, 'wb') as out: - pdf_writer.write(out) - print(f"禁止投标情形: Merged PDF saved to {output_path}") - else: - print("禁止投标情形: No files to merge.") - return output_path - - -def find_forbidden(qualification=""): - try: - if qualification: - file_id = upload_file(qualification) - user_query_forbidden = ( - "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...]," - "请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。" - ) - qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden) - else: - qianwen_forbidden_str = "[]" - - actual_list = process_string_list(qianwen_forbidden_str) # 提取出字符串列表 ["xxx","xx"] - includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"] - excludes = ["招标", "评标", "定标"] - # forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes) - # processed_results = extract_unique_items_from_texts(forbidden_results) - # merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) - merged_forbidden_list = list(dict.fromkeys(actual_list)) - forbidden_dict = {'不得存在的其他情形': merged_forbidden_list} - return forbidden_dict - except Exception as e: - print(f"find_forbidden 在处理时发生异常: {e}") - return {'不得存在的其他情形': ""} - -#TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。 -if __name__ == '__main__': - # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" - # clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" - qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" - output_dir = 
"C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f" - doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx' - res = find_forbidden(qualification) - print(json.dumps(res, ensure_ascii=False, indent=4)) \ No newline at end of file diff --git a/flask_app/old_version/回答来源.py b/flask_app/old_version/回答来源.py deleted file mode 100644 index 27bb7fe..0000000 --- a/flask_app/old_version/回答来源.py +++ /dev/null @@ -1,216 +0,0 @@ -#基于多线程提问,现已废弃 -# assistant_id -import queue -import concurrent.futures -from dashscope import Assistants, Messages, Runs, Threads -from llama_index.indices.managed.dashscope import DashScopeCloudRetriever -from flask_app.general.json_utils import extract_content_from_json -prompt = """ -# 角色 -你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 - -## 技能 -### 技能 1:文档解析与摘要 -- 深入理解并分析${document1}的内容,提取关键信息。 -- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 - -### 技能 2:信息检索与关联 -- 在${document1}中高效检索特定信息或关键词。 -- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 - -## 限制 -- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。 -- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 -- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 - -请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。 -""" -prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}' - - - - -#正文和文档名之间的内容 -def extract_content_between_tags(text): - results = [] - - # 使用“【正文】”来分割文本 - parts = text.split('【正文】')[1:] # 跳过第一个分割结果,因为它前面不会有内容 - - for index, part in enumerate(parts): - # 查找“【文档名】”标签的位置 - doc_name_index = part.find('【文档名】') - # 查找 'file_ids' 标签的位置 - file_ids_index = part.find("'file_ids'") - - # 根据是否找到“【文档名】”来决定提取内容的截止点 - if doc_name_index != -1: - end_index = doc_name_index - elif file_ids_index != -1: - end_index = file_ids_index - else: - end_index = len(part) - - # 提取内容 - content = part[:end_index].strip() - results.append(content) - - # 如果存在 file_ids,处理最后一部分特别提取 file_ids 前的内容 - if "'file_ids'" in parts[-1]: - file_ids_index = parts[-1].find("'file_ids'") - if file_ids_index != -1: - last_content = parts[-1][:file_ids_index].strip() - results[-1] = last_content # 更新最后一部分的内容,确保只到 file_ids - - return results - -def find_references_in_extracted(formatted_ans, extracted_references): - results = {} # 用来存储匹配结果的字典 - - # 递归函数,用于处理多层嵌套的字典 - def recurse_through_dict(current_dict, path=[]): - for key, value in current_dict.items(): - # 检查值是否还是字典,如果是,进行递归 - if isinstance(value, dict): - recurse_through_dict(value, path + [key]) - else: - # 特定值处理:如果值为'未知',直接设置索引为-1 - if value == '未知': - results['.'.join(path + [key])] = -1 - else: - # 进行匹配检查 - found = False - for index, reference in enumerate(extracted_references): - if str(value) in reference: # 转换为字符串,确保兼容性 - results['.'.join(path + [key])] = index # 使用点表示法记录路径 - found = True - break - if not found: - results['.'.join(path + [key])] = None - - # 从根字典开始递归 - recurse_through_dict(formatted_ans) - return results - -def send_message(assistant, message='百炼是什么?'): - ans = [] - print(f"Query: {message}") - - # create thread. - thread = Threads.create() - print(thread) - - # create a message. - message = Messages.create(thread.id, content=message) - # create run - - run = Runs.create(thread.id, assistant_id=assistant.id) - print(run) - - # wait for run completed or requires_action - run_status = Runs.wait(run.id, thread_id=thread.id) - print(run_status) - reference_txt = str(run_status) - extracted_references = extract_content_between_tags(reference_txt) #引用的文章来源list - # get the thread messages. 
- msgs = Messages.list(thread.id) - for message in msgs['data'][::-1]: - ans.append(message['content'][0]['text']['value']) - return ans,extracted_references - -def rag_assistant(knowledge_name): - retriever = DashScopeCloudRetriever(knowledge_name) - pipeline_id = str(retriever.pipeline_id) - assistant = Assistants.create( - model='qwen-max', - name='smart helper', - description='智能助手,支持知识库查询和插件调用。', - temperature='0.3', - instructions=prom, - tools=[ - { - "type": "code_interpreter" - }, - { - "type": "rag", - "prompt_ra": { - "pipeline_id": pipeline_id, - "parameters": { - "type": "object", - "properties": { - "query_word": { - "type": "str", - "value": "${document1}" - } - } - } - } - }] - ) - return assistant - - -def pure_assistant(): - assistant = Assistants.create( - model='qwen-max', - name='smart helper', - description='智能助手,能基于用户的要求精准简洁地回答用户的提问', - instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问', - tools=[ - { - "type": "code_interpreter" - }, - ] - ) - return assistant - -def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True): - if use_rag: - assistant = rag_assistant(knowledge_name) - else: - assistant = pure_assistant() - ans,extracted_references = send_message(assistant, message=question) - for index, reference in enumerate(extracted_references, start=0): - print(f"{index}. {reference}") - formatted_ans=extract_content_from_json(ans[1]) - print(formatted_ans) - - results = find_references_in_extracted(formatted_ans, extracted_references) - for key, index in results.items(): - print(f"{key}: Found at index {index}") - result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) - -def multi_threading(queries, knowledge_name, use_rag=True): - result_queue = queue.Queue() - - # 使用 ThreadPoolExecutor 管理线程 - with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: - # 使用字典保存每个提交的任务的Future对象,以便按顺序访问结果 - future_to_query = {executor.submit(llm_call, query, knowledge_name, result_queue, index, use_rag): index for - index, query in enumerate(queries)} - - # 收集每个线程的结果 - for future in concurrent.futures.as_completed(future_to_query): - index = future_to_query[future] - # 由于 llm_call 函数本身会处理结果,这里只需要确保任务执行完成 - try: - future.result() # 可以用来捕获异常或确认任务完成 - except Exception as exc: - print(f"Query {index} generated an exception: {exc}") - - # 从队列中获取所有结果并按索引排序 - results = [None] * len(queries) - while not result_queue.empty(): - index, result = result_queue.get() - results[index] = result - return results - -if __name__ == "__main__": - # 读取问题列表 - questions = ["该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] - knowledge_name = "招标解析5word" - results = multi_threading(questions, knowledge_name, use_rag=True) - # 打印结果 - for question, response in results: - print(f"Question: {question}") - print(f"Response: {response}") diff --git a/flask_app/old_version/基础信息整合.py b/flask_app/old_version/基础信息整合.py deleted file mode 100644 index e8be244..0000000 --- a/flask_app/old_version/基础信息整合.py +++ /dev/null @@ -1,166 +0,0 @@ -import json - -from flask_app.general.json_utils import clean_json_string, rename_outer_key -from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice -from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge -from flask_app.general.多线程提问 import read_questions_from_file, multi_threading -from flask_app.general.通义千问long import upload_file -from flask_app.general.通用功能函数 import judge_consortium_bidding - -def aggregate_basic_info_engineering(baseinfo_list): - 
""" - 将基础信息列表中的数据进行合并和分类。 - - 参数: - - baseinfo_list (list): 包含多个基础信息的列表。 - - 返回: - - dict: 合并和分类后的基础信息字典。 - """ - key_groups = { - "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"], - "项目信息": ["项目名称", "招标编号", "项目概况", "招标范围", "招标控制价", "投标竞争下浮率"], - "关键时间/内容": [ - "投标文件递交截止日期", - "投标文件递交方式", - "开标时间", - "开标地点", - "投标人要求澄清招标文件的截止时间", - "投标有效期", - "评标结果公示媒介" - ], - "保证金相关": ["质量保证金", "退还投标保证金"], - "其他信息": [ - "重新招标、不再招标和终止招标", - "投标费用承担", - "招标代理服务费", - "是否退还投标文件", - ] - } - - combined_data = {} - relevant_keys_detected = set() - - # 合并所有基础信息并收集相关键 - for baseinfo in baseinfo_list: - combined_data.update(baseinfo) - relevant_keys_detected.update(baseinfo.keys()) - - # 动态调整键组 - dynamic_key_handling(key_groups, relevant_keys_detected) - - # 创建一个副本以存储未分类的项目 - unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]} - - # 按键组分类并嵌套 - for group_name, keys in key_groups.items(): - group_data = {key: combined_data.get(key, "未提供") for key in keys} - combined_data[group_name] = group_data - # 从 unclassified_items 中移除已分类的键 - for key in keys: - unclassified_items.pop(key, None) - - # 将剩余未分类的键值对添加到 "其他信息" 组 - combined_data["其他信息"].update(unclassified_items) - - # 移除顶层的未分类键值对 - for key in list(combined_data.keys()): - if key not in key_groups: - del combined_data[key] - - return combined_data - -def dynamic_key_handling(key_groups, detected_keys): - # 检查和调整键组配置 - for key in detected_keys: - # 处理“保证金相关”组,插到"质量保证金"前 - if "保证金" in key: - group = key_groups["保证金相关"] - insert_before = "质量保证金" - if insert_before in group: - index = group.index(insert_before) - if key not in group: # 避免重复插入 - group.insert(index, key) - else: - group.append(key) # 如果没有找到特定键,则追加到末尾 - elif "联合体" in key: - key_groups["项目信息"].append(key) - elif "分包" in key: - key_groups["项目信息"].append(key) - elif "踏勘现场" in key: - key_groups["其他信息"].append(key) - elif "投标预备会" in key: - key_groups["其他信息"].append(key) - elif "偏离" in key: - key_groups["其他信息"].append(key) - -def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path): - """ - 综合和处理基础信息,生成最终的基础信息字典。 - - 参数: - - knowledge_name (str): 知识名称。 - - truncate0 (str): 文件路径。 - - output_folder (str): 输出文件夹路径。 - - clause_path (str): 条款路径。 - - 返回: - - dict: 综合后的基础信息。 - """ - baseinfo_list = [] - baseinfo_file_path = 'flask_app/static/提示词/基本信息工程标.txt' - # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标.txt' - questions = read_questions_from_file(baseinfo_file_path) - res1 = multi_threading(questions, knowledge_name) - - for index, response in res1: - try: - if response and len(response) > 1: - baseinfo_list.append(clean_json_string(response[1])) - else: - print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.") - except Exception as e: - print(f"基础信息整合: Error processing response for query index {index}: {e}") - - # 判断是否分包、是否需要递交投标保证金等 - chosen_numbers, merged = judge_whether_main(truncate0, output_folder) - baseinfo_list.append(merged) - - judge_file_path = 'flask_app/static/提示词/是否相关问题.txt' - # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt' - judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) - judge_consortium = judge_consortium_bidding(baseinfo_list) # 通过招标公告判断是否接受联合体投标 - - if judge_consortium: - judge_consortium_question = ( - "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息," - "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\"" - ) - judge_questions.append(judge_consortium_question) - - file_id = 
upload_file(truncate0) - res2 = multi_threading(judge_questions, "", file_id, 2) # 调用千问-long - - if not res2: - print("基础信息整合: multi_threading error!") - else: - for question, response in res2: - baseinfo_list.append(clean_json_string(response)) - - rebidding_situation = extract_from_notice(clause_path, 3) # "重新招标, 不再招标和终止招标"需从投标人须知正文提取 - update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标") - baseinfo_list.append(update_json) - aggregated_baseinfo = aggregate_basic_info_engineering(baseinfo_list) # 现在是一个字典 - return {"基础信息": aggregated_baseinfo} - -if __name__ == "__main__": - knowledge_name = "ztb" - output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405" - truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf" - clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json" - res=combine_basic_info(knowledge_name,truncate0,output_folder,clause_path) - print(json.dumps(res,ensure_ascii=False,indent=4)) - - - - diff --git a/flask_app/old_version/多线程分类.py b/flask_app/old_version/多线程分类.py deleted file mode 100644 index c50c288..0000000 --- a/flask_app/old_version/多线程分类.py +++ /dev/null @@ -1,373 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import shutil -from PyPDF2 import PdfReader, PdfWriter - - -def validate_pdf(file_path): - """ 验证PDF文件是否损坏 """ - try: - with open(file_path, "rb") as file: - pdf = PdfReader(file) - return len(pdf.pages) > 0 - except Exception as e: - print(f"Error reading PDF {file_path}: {str(e)}") - return False - - -def truncate_pdf(source_path, target_path, max_pages=15): - """截取PDF文件的前15页并保存""" - try: - with open(source_path, "rb") as infile: - reader = PdfReader(infile) - writer = PdfWriter() - for i in range(min(max_pages, len(reader.pages))): - writer.add_page(reader.pages[i]) - with open(target_path, "wb") as outfile: - writer.write(outfile) - except Exception as e: - print(f"Error processing PDF {source_path}: {str(e)}") - -def copy_file(src, dest): - """ 复制单个文件 """ - os.makedirs(os.path.dirname(dest), exist_ok=True) - shutil.copy2(src, dest) - -def copy_directory(source, destination): - """复制整个目录""" - os.makedirs(destination, exist_ok=True) - for item in os.listdir(source): - src_path = os.path.join(source, item) - dest_path = os.path.join(destination, item) - if os.path.isfile(src_path): - copy_file(src_path, dest_path) - else: - copy_directory(src_path, dest_path) - -def unique_file_name(base_path, file_name): - counter = 1 - name_part, extension = os.path.splitext(file_name) - new_file_name = file_name - # 检查文件是否存在,若存在则修改文件名 - while os.path.exists(os.path.join(base_path, new_file_name)): - new_file_name = f"{name_part}_{counter}{extension}" - counter += 1 - return new_file_name - -def process_pdf_folders(source_dir, target_dir): - """ 处理源目录中的PDF文件,并基于条件选择目标文件夹 """ - for root, dirs, files in os.walk(source_dir, topdown=False): - for file in files: - if file.lower().endswith('.pdf'): - source_file_path = os.path.join(root, file) - if validate_pdf(source_file_path): - relative_path = os.path.relpath(root, source_dir) - target_file_dir = os.path.join(target_dir, relative_path) - target_file_path = os.path.join(target_file_dir, file) - copy_file(source_file_path, target_file_path) - else: - print(f"Deleted corrupt file: {source_file_path}") - # 清除空目录 - if not os.listdir(root): - os.rmdir(root) - - -def classify_folders(source_dir, target_dir, project_index): - """Classifies folders and 
processes files based on specific criteria.""" - temp_dir = os.path.join(source_dir, 'temp') - os.makedirs(temp_dir, exist_ok=True) - - target_project_dir = None - processed_dirs = set() # Set to track processed directories - - for subdir in os.listdir(source_dir): - subdir_path = os.path.join(source_dir, subdir) - if not os.path.isdir(subdir_path): - continue - - files = [f.lower() for f in os.listdir(subdir_path)] - if 'zb.pdf' in files: - target_project_dir, target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index) - processed_dirs.add(subdir_path) # Mark this directory as processed - elif subdir.lower() == "输出文件": - process_evaluation_files(subdir_path, target_project_dir) - processed_dirs.add(subdir_path) # Mark this directory as processed - - # Process remaining folders, skipping already processed ones - process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name) - #删除tmp目录 - # if os.path.exists(temp_dir): - # shutil.rmtree(temp_dir) - if os.path.exists(source_dir): - shutil.rmtree(source_dir) - -def process_tender_files(subdir_path, temp_dir, target_dir, project_index): - """Processes tender files and returns the target project directory and updated index.""" - zb_path = os.path.join(subdir_path, "zb.pdf") - truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf") - truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages - bot_response = file_parse(truncated_zb_path, project_index, 1) - zb_response = extract_answer(bot_response[1], 1) - zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠 - target_zb_name, category = zb_response[2] + ".pdf", zb_response[3] - - new_zb_path = os.path.join(subdir_path, target_zb_name) - os.rename(zb_path, new_zb_path) - - target_category_dir = os.path.join(target_dir, category) - target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1]) - copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹")) - - os.remove(truncated_zb_path) - shutil.rmtree(subdir_path) - return target_project_dir, zb_response[2] - - -def process_evaluation_files(subdir_path, target_project_dir): - """Processes evaluation folders.""" - copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹")) - shutil.rmtree(subdir_path) - - -def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name): - """Processes remaining folders containing bid files.""" - target_tb_dir = os.path.join(target_project_dir, "投标文件夹") - - for subdir in os.listdir(source_dir): - subdir_path = os.path.join(source_dir, subdir) - if not os.path.isdir(subdir_path) or subdir_path in processed_dirs: - continue # Skip processed directories - - target_tbcom_dir = None # Initialize outside the file loop - - for item in os.listdir(subdir_path): - item_src_path = os.path.join(subdir_path, item) - new_name = "truncate_" + item - truncated_tb_path = os.path.join(temp_dir, new_name) - truncate_pdf(item_src_path, truncated_tb_path) - bot_response = file_parse(truncated_tb_path, project_index, 2) - tb_response = extract_answer(bot_response[1], 2) - if not tb_response: - continue # If there's no response, skip this file - - # Initialize target_tbcom_dir only once based on the first processed file - if not target_tbcom_dir: - target_tb_name, _ = tb_response - if(target_tb_name != target_zb_name and target_tb_name != "未知"): - target_tbcom_dir = os.path.join(target_tb_dir, 
target_tb_name) - print(target_tbcom_dir) - os.makedirs(target_tbcom_dir, exist_ok=True) - - tb_section = tb_response[1] + ".pdf" - new_tb_path = os.path.join(subdir_path, tb_section) - # 获取唯一文件名 - new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section)) - os.rename(item_src_path, new_tb_path) - - # Remove temporary truncated file - os.remove(truncated_tb_path) - - # Copy the whole directory at once after all files have been processed and renamed - if target_tbcom_dir: - copy_directory(subdir_path, target_tbcom_dir) - shutil.rmtree(subdir_path) # Optionally remove the original directory after copying - -import re - -def extract_answer(input_string, type): - # 使用正则表达式匹配所需的信息 - # 第一种模式:项目名称、项目编号、招标人、类别 - # 在每个字段值后都添加了\s*以忽略尾随的空格 - pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" - # 第二种模式:投标人、类别 - pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" - - if type == 1: - match = re.search(pattern1, input_string) - if match: - print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}") - return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()] - else: - print("No match found for type 1.") - return [] - elif type == 2: - match = re.search(pattern2, input_string) - if match: - # 检查是否包含“投标函”,如果是则替换部分内容为“商务文件” - part = match.group(2).strip() - if "投标函" in part: - part = "商务文件" - return [match.group(1).strip(), part] - else: - print("No match found for type 2.") - return [] - - -from llama_index.readers.dashscope.base import DashScopeParse -from llama_index.readers.dashscope.utils import ResultType -from llama_index.indices.managed.dashscope import DashScopeCloudIndex -from dashscope import Assistants, Messages, Runs, Threads - - -def send_message(assistant, index,documents,message='百炼是什么?'): - print(f"Query: {message}") - - # create thread. - # create a thread. - thread = Threads.create() - - print(thread) - - # create a message. - message = Messages.create(thread.id, content=message) - # create run - - run = Runs.create(thread.id, assistant_id=assistant.id) - print("run:" + str(run)) - - # # get run statue - run_status = Runs.get(run.id, thread_id=thread.id) - # print(run_status) - - # wait for run completed or requires_action - run_status = Runs.wait(run.id, thread_id=thread.id) - # print(run_status) - - # if prompt input tool result, submit tool result. - - run_status = Runs.get(run.id, thread_id=thread.id) - # print(run_status) - # verify_status_code(run_status) - - # get the thread messages. 
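# A worked example of extract_answer above, on a hypothetical model reply:
#
#   extract_answer("投标人:华东建设集团;类别:投标函及投标函附录。", 2)
#   => ['华东建设集团', '商务文件']
#
# pattern2 captures the bidder name up to the first ';'/';' and the category
# up to the first sentence-ending mark, and any category containing '投标函'
# is normalised to '商务文件' before being returned.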
- msgs = Messages.list(thread.id) - # print(msgs) - # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) - - ans = [] - - print("运行结果:") - for message in msgs['data'][::-1]: - ans.append(message['content'][0]['text']['value']) - print("content: ", message['content'][0]['text']['value']) - print("\n") - deleteFileFromKnowledge(index,documents) - return ans - - -def file_parse(filepath, knowledge_index, type): - parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) - documents = parse.load_data(file_path=filepath) - # 创建一个字典来映射index值到知识库名 - index_to_name = { - 0: "文件分类知识库0", - 1: "文件分类知识库1", - 2: "文件分类知识库2", - 3: "文件分类知识库3" - } - # 使用get方法获取对应的知识库名,如果index不存在,返回"默认文件分类知识库" - cloud_index_name = index_to_name.get(knowledge_index, "0") - - index = DashScopeCloudIndex(cloud_index_name) - index._insert(documents) - retriever = index.as_retriever() - pipeline_id = str(retriever.pipeline_id) - assistant = Assistants.create( - model='qwen-plus', - name='smart helper', - description='智能助手,支持知识库查询和插件调用。', - instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', - tools=[ - { - "type": "code_interpreter" - }, - { - "type": "rag", - "prompt_ra": { - "pipeline_id": pipeline_id, - "parameters": { - "type": "object", - "properties": { - "query_word": { - "type": "str", - "value": "${document1}" - } - - } - } - } - }] - ) - questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ - 货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,服物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ - 请按下列键值对格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" - questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" - if (type == 1): - questions = questions1 - elif (type == 2): - questions = questions2 - return send_message(assistant,index,documents, message=questions) - -def deleteFileFromKnowledge(index,documents): - # 初始化一个列表来存储所有文档的 ID - file_ids = [] - - # 检查documents是否为列表且不为空 - if isinstance(documents, list) and documents: - # 遍历每个文档 - for document in documents: - # 使用属性访问方式获取每个文档的 id_ - # 确保 document 对象有一个名为 id_ 的属性 - file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 - if file_id: - file_ids.append(file_id) # 将 id 添加到列表中 - index.delete_ref_doc(file_ids) - -import concurrent.futures -import logging - -# 配置日志,只输出到控制台 -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') - -def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index): - logging.info(f"Starting to process directory: {subdir_path} with index {project_index}") - - # 为每个子目录创建一个专用的临时目录 - intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}") - os.makedirs(intermediate_directory, exist_ok=True) - logging.info(f"Created intermediate directory: {intermediate_directory}") - - # 这里可以添加具体的目录处理函数,例如: - process_pdf_folders(subdir_path, intermediate_directory) - classify_folders(intermediate_directory, final_directory, project_index % 4) - - # 处理完毕后清理该目录 - shutil.rmtree(intermediate_directory) - logging.info(f"Deleted intermediate directory: {intermediate_directory}") - -def main(base_directory, base_intermediate_directory, final_directory): - os.makedirs(final_directory, exist_ok=True) - logging.info(f"Final directory ensured at: 
{final_directory}") - - project_index = 0 - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: - futures = [] - for subdir in os.listdir(base_directory): - subdir_path = os.path.join(base_directory, subdir) - if os.path.isdir(subdir_path): - logging.info(f"Submitting job for directory: {subdir_path}") - future = executor.submit(process_directory, subdir_path, base_intermediate_directory, final_directory, project_index) - futures.append(future) - project_index += 1 - - for future in concurrent.futures.as_completed(futures): - try: - future.result() # 如果执行没有抛出异常,完成该任务 - except Exception as e: - logging.error(f"Thread resulted in an error: {e}") - -if __name__ == "__main__": - base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' - base_intermediate_directory = 'D:\\tmp' - final_directory = 'D:\\output' - main(base_directory, base_intermediate_directory, final_directory) diff --git a/flask_app/old_version/废标项.py b/flask_app/old_version/废标项.py deleted file mode 100644 index 77bec71..0000000 --- a/flask_app/old_version/废标项.py +++ /dev/null @@ -1,66 +0,0 @@ -import fitz # PyMuPDF -import re - -def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords): - """ - 提取PDF文档中包含特定关键词的段落,包括正文和表格。如果段落中包含特定的后续关键词,也提取其后的段落,直到遇到下一个相似的序列编号。 - :param pdf_path: PDF文件的路径。 - :param keywords: 包含关键词的列表。 - :param follow_up_keywords: 触发连续提取的关键词列表。 - :return: 包含关键词的段落列表。 - """ - doc = fitz.open(pdf_path) - extracted_paragraphs = [] - continue_collecting = False - current_section_pattern = None - - for page in doc: - text_blocks = page.get_text("blocks") - for index, block in enumerate(text_blocks): - text = block[4].strip() # Text content of the block - if text == "": # Skip empty lines - continue - if continue_collecting: - if current_section_pattern and re.match(current_section_pattern, text): - continue_collecting = False - else: - extracted_paragraphs.append(text) - - if any(keyword in text for keyword in keywords): - extracted_paragraphs.append(text) - if any(follow_up in text for follow_up in follow_up_keywords): - continue_collecting = True - section_number = re.match(r'(\d+(\.\d+)*)', text) - if section_number: - current_section_number = section_number.group(1) - base_section_number = current_section_number.rsplit('.', 1)[0] - current_section_pattern = re.compile(rf'^{re.escape(base_section_number)}\.\d+\b') - else: - found_next_number = False - for next_index in range(index + 1, len(text_blocks)): - next_text = text_blocks[next_index][4].strip() - next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text) - if next_section_number: - found_next_number = True - current_section_pattern = re.compile(rf'^{next_section_number.group(1)}\b') - elif found_next_number: - break - doc.close() - return extracted_paragraphs - -# Example usage -doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx' -keywords = ['否决', '无效投标', '被拒绝', '予以拒绝'] -follow_up_keywords = ['情形之一'] -extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords) - -# Writing to file and handling duplicates -output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt" -with open(output_file, 'w', encoding='utf-8') as file: - for content in extracted_contents: - file.write(content + '\n') - -# file_id = upload_file(output_file) -# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。" -# res=qianwen_long(file_id,user_query) -# print("Query Result:", res) \ No newline at end of file diff 
--git a/flask_app/old_version/招标文件解析.py b/flask_app/old_version/招标文件解析.py deleted file mode 100644 index 8209c31..0000000 --- a/flask_app/old_version/招标文件解析.py +++ /dev/null @@ -1,305 +0,0 @@ -# -*- encoding:utf-8 -*- -import json -import logging -import time -from concurrent.futures import ThreadPoolExecutor -from flask_app.main.截取pdf import truncate_pdf_multiple -from flask_app.general.table_content_extraction import extract_tables_main -from flask_app.old_version.文档理解大模型版知识库处理.知识库操作 import addfileToKnowledge, deleteKnowledge -from flask_app.main.提取json工程标版 import convert_clause_to_json -from flask_app.general.json_utils import transform_json_values -from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid -from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice -import concurrent.futures -from flask_app.old_version.基础信息整合 import combine_basic_info -from flask_app.old_version.资格审查模块old import combine_review_standards -from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards -from flask_app.general.format_change import pdf2docx, docx2pdf -from flask_app.general.docx截取docx import copy_docx - -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger - -logger=None - -# 创建全局线程池 -executor = ThreadPoolExecutor() -def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): - logger.info("starting 文件预处理...") - logger.info("output_folder..." + output_folder) - - # 根据文件类型处理文件路径 - if file_type == 1: # docx - docx_path = downloaded_file_path - pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 - elif file_type == 2: # pdf - pdf_path = downloaded_file_path - docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 - else: - logger.error("Unsupported file type provided. 
Preprocessing halted.") - return None - - # # 异步上传知识库 - future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id) - - # 调用截取PDF多次 - truncate_files = truncate_pdf_multiple(pdf_path, output_folder) - - # 处理各个部分 - truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx - invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 - truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json - truncate0 = truncate_files[0] - truncate1 = truncate_files[1] - truncate3 = truncate_files[3] - merged_baseinfo_path=truncate_files[-1] - clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json - - logger.info("文件预处理done") - - # 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象 - return { - 'file_path': downloaded_file_path, - 'output_folder': output_folder, - 'truncate0': truncate0, - 'truncate1': truncate1, - 'truncate3': truncate3, - 'knowledge_future': future_knowledge, # 返回 Future 对象 - 'truncate0_jsonpath': truncate_jsonpath, - 'merged_baseinfo_path':merged_baseinfo_path, - 'clause_path': clause_path, - 'invalid_docpath': invalid_docpath - } - - -def post_processing(data,includes): - # 初始化结果字典,预设'其他'分类为空字典 - result = {"其他": {}} - - # 遍历原始字典的每一个键值对 - for key, value in data.items(): - if key in includes: - # 如果键在includes列表中,直接保留这个键值对 - result[key] = value - else: - # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 - result["其他"][key] = value - - # 如果'其他'分类没有任何内容,可以选择删除这个键 - if not result["其他"]: - del result["其他"] - - return result -# 基本信息 -def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表 - logger.info("starting基础信息...") - basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path) - logger.info("基础信息done") - return basic_res - - -# 形式、响应、资格评审 -def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder): - logger.info("starting资格审查...") - review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, - clause_path,input_file,output_folder) - logger.info("资格审查done") - return review_standards_res - - -# 评分细则 流式 -def fetch_evaluation_standards(truncate1): # 评标办法前附表 - logger.info("starting 商务标和技术标...") - - # 获取评标办法前附表的字典结果 - evaluation_standards_res = combine_evaluation_standards(truncate1) - - # 获取技术标和商务标 - technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} - commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} - - logger.info("商务标和技术标 done") - - # 返回将 "技术标" 和 "商务标" 包含在新的键中 - return { - "technical_standards": technical_standards, - "commercial_standards": commercial_standards - } - -# def fetch_evaluation_standards(truncate1): # 评标办法前附表 -# logger.info("starting商务标技术标...") -# evaluation_standards_res = combine_evaluation_standards(truncate1) -# logger.info("商务标技术标done") -# return evaluation_standards_res - - -# 无效、废标项解析 -def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): - # 废标项要求:千问 - logger.info("starting无效标与废标...") - find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) - logger.info("无效标与废标done...") - return find_invalid_res - - -# 投标文件要求 -def fetch_bidding_documents_requirements(clause_path): - logger.info("starting投标文件要求...") - fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) - logger.info("投标文件要求done...") - return 
{"投标文件要求":fetch_bidding_documents_requirements_json} - -# 开评定标流程 -def fetch_bid_opening(clause_path): - logger.info("starting开评定标流程...") - fetch_bid_opening_json = extract_from_notice(clause_path, 2) - logger.info("开评定标流程done...") - return {"开评定标流程":fetch_bid_opening_json} - -# def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf -# global logger -# logger = get_global_logger(unique_id) -# # Preprocess files and get necessary data paths and knowledge index -# processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) -# if not processed_data: -# return "" -# -# with concurrent.futures.ThreadPoolExecutor() as executor: -# # Submit all tasks to the executor -# futures = { -# 'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'], -# processed_data['truncate0'], output_folder, -# processed_data['clause_path']), -# 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], -# processed_data['truncate3'], -# processed_data['knowledge_name'], processed_data['truncate0_jsonpath'], -# processed_data['clause_path'],processed_data['file_path'],processed_data['output_folder']), -# 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), -# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], -# output_folder, processed_data['truncate0_jsonpath'], -# processed_data['clause_path'], processed_data['truncate3']), -# 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, -# processed_data['clause_path']), -# 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) -# } -# -# comprehensive_responses = [] -# # Collect results in the defined order -# for key in ['base_info', 'qualification_review', 'evaluation_standards', 'invalid_requirements', -# 'bidding_documents_requirements', 'opening_bid']: -# try: -# # Wait for the future to complete and get the result -# result = futures[key].result() -# comprehensive_responses.append(result) -# except Exception as exc: -# logger.error(f"Error processing {key}: {exc}") -# # 合并 JSON 结果 -# combined_final_result = combine_json_results(comprehensive_responses) -# includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"] -# result = post_processing(combined_final_result, includes) -# modified_json = transform_json_values(result) -# -# final_result_path = os.path.join(output_folder, "final_result.json") -# with open(final_result_path, 'w', encoding='utf-8') as file: -# json.dump(modified_json, file, ensure_ascii=False, indent=2) -# logger.info("final_result.json has been saved") -# deleteKnowledge(processed_data['knowledge_index']) -# return final_result_path - - -#分段返回 -def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): - global logger - logger = get_global_logger(unique_id) - - # 预处理文件,获取处理后的数据 - processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) - if not processed_data: - yield json.dumps({}) # 如果处理数据失败,返回空的 JSON - - with concurrent.futures.ThreadPoolExecutor() as executor: - # 立即启动不依赖 knowledge_name 和 index 的任务 - futures = { - 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), - 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], - output_folder, 
processed_data['truncate0_jsonpath'], - processed_data['clause_path'], processed_data['truncate3']), - 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']), - 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) - } - - # 提前处理这些不依赖的任务,按完成顺序返回 - for future in concurrent.futures.as_completed(futures.values()): - key = next(k for k, v in futures.items() if v == future) - try: - result = future.result() - - # 如果是 evaluation_standards,拆分技术标和商务标 - if key == 'evaluation_standards': - technical_standards = result["technical_standards"] - commercial_standards = result["commercial_standards"] - - # 分别返回技术标和商务标 - yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False) - yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False) - - else: - # 处理其他任务的结果 - yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) - - except Exception as exc: - logger.error(f"Error processing {key}: {exc}") - yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) - - # 只有在需要 knowledge_name 和 index 时才等待 future_knowledge 完成 - try: - knowledge_name = "招标解析" + unique_id - index = processed_data['knowledge_future'].result() # 阻塞等待知识库上传任务完成 - - # 提交依赖 knowledge_name 和 index 的任务 - future_dependencies = { - 'base_info': executor.submit(fetch_project_basic_info, knowledge_name, processed_data['truncate0'], - output_folder, processed_data['clause_path']), - 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], - processed_data['truncate3'], knowledge_name, - processed_data['truncate0_jsonpath'], - processed_data['clause_path'], processed_data['file_path'], - processed_data['output_folder']), - } - - # 按完成顺序返回依赖任务的结果 - for future in concurrent.futures.as_completed(future_dependencies.values()): - key = next(k for k, v in future_dependencies.items() if v == future) - try: - result = future.result() - yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) - except Exception as exc: - logger.error(f"Error processing {key}: {exc}") - yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) - - except Exception as e: - logger.error(f"Error uploading to knowledge base: {e}") - yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False) - - # 删除知识索引 - deleteKnowledge(index) - -if __name__ == "__main__": - output_folder = "flask_app/static/output/zytest1" - # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf") - # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf") - # clause_path = convert_clause_to_json(truncate1, output_folder) - # truncate0_jsonpath = os.path.join(output_folder, "truncate_output3.json") - - start_time = time.time() - file_type = 1 #1:docx 2:pdf 3:其他 - input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf" - # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11") - - preprocess_files(output_folder, input_file, file_type, "zytest1") - end_time = time.time() - elapsed_time = end_time - start_time # 计算耗时 - print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/old_version/文件分类普通版.py b/flask_app/old_version/文件分类普通版.py deleted file mode 100644 index dc333ad..0000000 --- a/flask_app/old_version/文件分类普通版.py +++ /dev/null @@ -1,350 +0,0 @@ -# -*- coding: utf-8 -*- 
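# The yield-as-completed streaming in engineering_bid_main above reduces to a
# small self-contained sketch; fetch() and the delays here are hypothetical
# stand-ins for the real tasks. Keying the futures dict by the Future object
# itself avoids the linear next(k for k, v ...) scan the original performs
# for every finished task.
import concurrent.futures
import json
import time

def fetch(key, delay):
    time.sleep(delay)
    return {key: f"{key} done"}

def stream_results(jobs):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(fetch, k, d): k for k, d in jobs.items()}
        for future in concurrent.futures.as_completed(futures):
            key = futures[future]
            try:
                # serialise and push each result the moment it is ready
                yield json.dumps(future.result(), ensure_ascii=False)
            except Exception as exc:
                yield json.dumps({'error': f'{key}: {exc}'}, ensure_ascii=False)

for chunk in stream_results({'evaluation_standards': 0.2, 'invalid_requirements': 0.1}):
    print(chunk)  # 'invalid_requirements' arrives first despite later submission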
-import os -import shutil -from PyPDF2 import PdfReader, PdfWriter - - -def validate_pdf(file_path): - """ 验证PDF文件是否损坏 """ - try: - with open(file_path, "rb") as file: - pdf = PdfReader(file) - return len(pdf.pages) > 0 - except Exception as e: - print(f"Error reading PDF {file_path}: {str(e)}") - return False - - -def truncate_pdf(source_path, target_path, max_pages=15): - """截取PDF文件的前15页并保存""" - try: - with open(source_path, "rb") as infile: - reader = PdfReader(infile) - writer = PdfWriter() - for i in range(min(max_pages, len(reader.pages))): - writer.add_page(reader.pages[i]) - with open(target_path, "wb") as outfile: - writer.write(outfile) - except Exception as e: - print(f"Error processing PDF {source_path}: {str(e)}") - -def copy_file(src, dest): - """ 复制单个文件 """ - os.makedirs(os.path.dirname(dest), exist_ok=True) - shutil.copy2(src, dest) - -def copy_directory(source, destination): - """复制整个目录""" - os.makedirs(destination, exist_ok=True) - for item in os.listdir(source): - src_path = os.path.join(source, item) - dest_path = os.path.join(destination, item) - if os.path.isfile(src_path): - copy_file(src_path, dest_path) - else: - copy_directory(src_path, dest_path) - -def unique_file_name(base_path, file_name): - counter = 1 - name_part, extension = os.path.splitext(file_name) - new_file_name = file_name - # 检查文件是否存在,若存在则修改文件名 - while os.path.exists(os.path.join(base_path, new_file_name)): - new_file_name = f"{name_part}_{counter}{extension}" - counter += 1 - return new_file_name - -def process_pdf_folders(source_dir, target_dir): - """ 处理源目录中的PDF文件,并基于条件选择目标文件夹 """ - for root, dirs, files in os.walk(source_dir, topdown=False): - for file in files: - if file.lower().endswith('.pdf'): - source_file_path = os.path.join(root, file) - if validate_pdf(source_file_path): - relative_path = os.path.relpath(root, source_dir) - target_file_dir = os.path.join(target_dir, relative_path) - target_file_path = os.path.join(target_file_dir, file) - copy_file(source_file_path, target_file_path) - else: - print(f"Deleted corrupt file: {source_file_path}") - # 清除空目录 - if not os.listdir(root): - os.rmdir(root) - - -def classify_folders(source_dir, target_dir, project_index): - """Classifies folders and processes files based on specific criteria.""" - temp_dir = os.path.join(source_dir, 'temp') - os.makedirs(temp_dir, exist_ok=True) - - target_project_dir = None - processed_dirs = set() # Set to track processed directories - - for subdir in os.listdir(source_dir): - subdir_path = os.path.join(source_dir, subdir) - if not os.path.isdir(subdir_path): - continue - - files = [f.lower() for f in os.listdir(subdir_path)] - if 'zb.pdf' in files: - target_project_dir, project_index ,target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index) - processed_dirs.add(subdir_path) # Mark this directory as processed - elif subdir.lower() == "输出文件": - process_evaluation_files(subdir_path, target_project_dir) - processed_dirs.add(subdir_path) # Mark this directory as processed - - # Process remaining folders, skipping already processed ones - process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name) - #删除tmp目录 - # if os.path.exists(temp_dir): - # shutil.rmtree(temp_dir) - if os.path.exists(source_dir): - shutil.rmtree(source_dir) - -def process_tender_files(subdir_path, temp_dir, target_dir, project_index): - """Processes tender files and returns the target project directory and updated index.""" - zb_path = os.path.join(subdir_path, 
"zb.pdf") - truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf") - truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages - bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1) - project_index += 1 - zb_response = extract_answer(bot_response[1], 1) - zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠 - target_zb_name, category = zb_response[2] + ".pdf", zb_response[3] - - new_zb_path = os.path.join(subdir_path, target_zb_name) - os.rename(zb_path, new_zb_path) - - target_category_dir = os.path.join(target_dir, category) - target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1]) - copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹")) - - os.remove(truncated_zb_path) - shutil.rmtree(subdir_path) - return target_project_dir, project_index,zb_response[2] - - -def process_evaluation_files(subdir_path, target_project_dir): - """Processes evaluation folders.""" - copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹")) - shutil.rmtree(subdir_path) - - -def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name): - """Processes remaining folders containing bid files.""" - target_tb_dir = os.path.join(target_project_dir, "投标文件夹") - - for subdir in os.listdir(source_dir): - subdir_path = os.path.join(source_dir, subdir) - if not os.path.isdir(subdir_path) or subdir_path in processed_dirs: - continue # Skip processed directories - - target_tbcom_dir = None # Initialize outside the file loop - - for item in os.listdir(subdir_path): - item_src_path = os.path.join(subdir_path, item) - new_name = "truncate_" + item - truncated_tb_path = os.path.join(temp_dir, new_name) - truncate_pdf(item_src_path, truncated_tb_path) - bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2) - project_index += 1 - tb_response = extract_answer(bot_response[1], 2) - if not tb_response: - continue # If there's no response, skip this file - - # Initialize target_tbcom_dir only once based on the first processed file - if not target_tbcom_dir: - target_tb_name, _ = tb_response - if(target_tb_name != target_zb_name and target_tb_name != "未知"): - target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name) - print(target_tbcom_dir) - os.makedirs(target_tbcom_dir, exist_ok=True) - - tb_section = tb_response[1] + ".pdf" - new_tb_path = os.path.join(subdir_path, tb_section) - # 获取唯一文件名 - new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section)) - os.rename(item_src_path, new_tb_path) - - # Remove temporary truncated file - os.remove(truncated_tb_path) - - # Copy the whole directory at once after all files have been processed and renamed - if target_tbcom_dir: - copy_directory(subdir_path, target_tbcom_dir) - shutil.rmtree(subdir_path) # Optionally remove the original directory after copying - - - -import re - - -import re - -def extract_answer(input_string, type): - # 使用正则表达式匹配所需的信息 - # 第一种模式:项目名称、项目编号、招标人、类别 - # 在每个字段值后都添加了\s*以忽略尾随的空格 - pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" - # 第二种模式:投标人、类别 - pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" - - if type == 1: - match = re.search(pattern1, input_string) - if match: - print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}") - return [match.group(1).strip(), match.group(2).strip(), 
match.group(3).strip(), match.group(4).strip()] - else: - print("No match found for type 1.") - return [] - elif type == 2: - match = re.search(pattern2, input_string) - if match: - # 检查是否包含“投标函”,如果是则替换部分内容为“商务文件” - part = match.group(2).strip() - if "投标函" in part: - part = "商务文件" - return [match.group(1).strip(), part] - else: - print("No match found for type 2.") - return [] - - -from llama_index.readers.dashscope.base import DashScopeParse -from llama_index.readers.dashscope.utils import ResultType -from llama_index.indices.managed.dashscope import DashScopeCloudIndex -from dashscope import Assistants, Messages, Runs, Threads - - -def send_message(assistant, index,documents,message='百炼是什么?'): - print(f"Query: {message}") - - # create thread. - # create a thread. - thread = Threads.create() - - print(thread) - - # create a message. - message = Messages.create(thread.id, content=message) - # create run - - run = Runs.create(thread.id, assistant_id=assistant.id) - print("run:" + str(run)) - - # # get run statue - run_status = Runs.get(run.id, thread_id=thread.id) - # print(run_status) - - # wait for run completed or requires_action - run_status = Runs.wait(run.id, thread_id=thread.id) - # print(run_status) - - # if prompt input tool result, submit tool result. - - run_status = Runs.get(run.id, thread_id=thread.id) - # print(run_status) - # verify_status_code(run_status) - - # get the thread messages. - msgs = Messages.list(thread.id) - # print(msgs) - # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) - - ans = [] - - print("运行结果:") - for message in msgs['data'][::-1]: - ans.append(message['content'][0]['text']['value']) - print("content: ", message['content'][0]['text']['value']) - print("\n") - deleteFileFromKnowledge(index,documents) - return ans - - -def file_parse(filepath, knowledge_index, type): - parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) - documents = parse.load_data(file_path=filepath) - index = DashScopeCloudIndex("文件分类临时知识库") - index._insert(documents) - retriever = index.as_retriever() - pipeline_id = str(retriever.pipeline_id) - assistant = Assistants.create( - model='qwen-max', - name='smart helper', - description='智能助手,支持知识库查询和插件调用。', - instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', - tools=[ - { - "type": "code_interpreter" - }, - { - "type": "rag", - "prompt_ra": { - "pipeline_id": pipeline_id, - "parameters": { - "type": "object", - "properties": { - "query_word": { - "type": "str", - "value": "${document1}" - } - - } - } - } - }] - ) - questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ - 货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ - 请按下列格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" - questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" - if (type == 1): - questions = questions1 - elif (type == 2): - questions = questions2 - return send_message(assistant,index,documents, message=questions) - -def deleteFileFromKnowledge(index,documents): - # 初始化一个列表来存储所有文档的 ID - file_ids = [] - - # 检查documents是否为列表且不为空 - if isinstance(documents, list) and documents: - # 遍历每个文档 - for document in documents: - # 使用属性访问方式获取每个文档的 id_ - # 确保 document 对象有一个名为 id_ 的属性 - file_id = getattr(document, 'id_', None) # 使用 
getattr 防止属性不存在时抛出异常 - if file_id: - file_ids.append(file_id) # 将 id 添加到列表中 - index.delete_ref_doc(file_ids) - -def process_directory(base_directory, intermediate_directory, final_directory): - # 遍历base_directory下的所有子目录 - for subdir in os.listdir(base_directory): - subdir_path = os.path.join(base_directory, subdir) - # 确保是目录 - if os.path.isdir(subdir_path): - # 处理每个项目的目录 - process_pdf_folders(subdir_path, intermediate_directory) - classify_folders(intermediate_directory, final_directory, 0) - - # 清理临时目录以备下一个项目使用 - if os.path.exists(intermediate_directory): - shutil.rmtree(intermediate_directory) - -def main(base_directory, intermediate_directory, final_directory): - # 确保中间目录和最终目录存在 - os.makedirs(intermediate_directory, exist_ok=True) - os.makedirs(final_directory, exist_ok=True) - process_directory(base_directory, intermediate_directory, final_directory) - - -if __name__ == "__main__": - base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' - intermediate_directory = 'D:\\tmp' - final_directory = 'D:\\output' - main(base_directory, intermediate_directory, final_directory) #处理多级目录 - # process_pdf_folders(base_directory, intermediate_directory) ##处理单级目录 - # classify_folders(intermediate_directory, final_directory, 0) diff --git a/flask_app/old_version/文档理解大模型版知识库处理/删除知识库.py b/flask_app/old_version/文档理解大模型版知识库处理/删除知识库.py deleted file mode 100644 index fa48cdd..0000000 --- a/flask_app/old_version/文档理解大模型版知识库处理/删除知识库.py +++ /dev/null @@ -1,55 +0,0 @@ -# -*- coding: utf-8 -*- -# This file is auto-generated, don't edit it. Thanks. -import os -from alibabacloud_bailian20231229.client import Client as bailian20231229Client -from alibabacloud_tea_openapi import models as open_api_models -from alibabacloud_bailian20231229 import models as bailian_20231229_models -from alibabacloud_tea_util import models as util_models -from alibabacloud_tea_util.client import Client as UtilClient - -def create_client() -> bailian20231229Client: - """ - 使用AK&SK初始化账号Client - @return: Client - @throws Exception - """ - config = open_api_models.Config( - access_key_id=os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID'], - access_key_secret=os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET'] - ) - config.endpoint = 'bailian.cn-beijing.aliyuncs.com' - return bailian20231229Client(config) - -def delete_index(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: - delete_index_request = bailian_20231229_models.DeleteIndexRequest( - index_id=index_id - ) - runtime = util_models.RuntimeOptions() - headers = {} - try: - response = client.delete_index_with_options(workspace_id, delete_index_request, headers, runtime) - print("routes Response:", response) - except Exception as error: - print(error.message) - print(error.data.get("Recommend")) - UtilClient.assert_as_string(error.message) - -async def delete_index_async(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: - delete_index_request = bailian_20231229_models.DeleteIndexRequest( - index_id=index_id - ) - runtime = util_models.RuntimeOptions() - headers = {} - try: - response = await client.delete_index_with_options_async(workspace_id, delete_index_request, headers, runtime) - print("routes Response:", response) - except Exception as error: - print(error.message) - print(error.data.get("Recommend")) - UtilClient.assert_as_string(error.message) - -if __name__ == '__main__': - workspace_id = os.environ['DASHSCOPE_WORKSPACE_ID'] - index_id = 'pg5rrsv26x' - client = create_client() - delete_index(client, workspace_id, index_id) diff --git 
a/flask_app/old_version/文档理解大模型版知识库处理/知识库操作.py b/flask_app/old_version/文档理解大模型版知识库处理/知识库操作.py deleted file mode 100644 index 19ee2ac..0000000 --- a/flask_app/old_version/文档理解大模型版知识库处理/知识库操作.py +++ /dev/null @@ -1,58 +0,0 @@ -import os -import uuid -from llama_index.readers.dashscope.base import DashScopeParse -from llama_index.readers.dashscope.utils import ResultType -from llama_index.indices.managed.dashscope import DashScopeCloudIndex -from flask_app.old_version.文档理解大模型版知识库处理.删除知识库 import delete_index, create_client - - -def addfileToKnowledge(filepath,knowledge_name): - parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) - documents = parse.load_data(file_path=filepath) - index = DashScopeCloudIndex.from_documents( - documents, - knowledge_name, - verbose=True, - ) - print("knowledge created successfully!!!") - # index = DashScopeCloudIndex(knowledge_name) - # index._insert(documents) - # return index, documents - return index - -def deleteKnowledge(index): - retriever = index.as_retriever() - index_id = str(retriever.pipeline_id) - workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID') - client = create_client() - delete_index(client,workspace_id,index_id) - print("knowledge old_version successfully!!!") - - - -def deleteFileFromKnowledge(index, documents): - # 初始化一个列表来存储所有文档的 ID - file_ids = [] - # 检查documents是否为列表且不为空 - if isinstance(documents, list) and documents: - # 遍历每个文档 - for document in documents: - # 使用属性访问方式获取每个文档的 id_ - # 确保 document 对象有一个名为 id_ 的属性 - file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 - if file_id: - file_ids.append(file_id) # 将 id 添加到列表中 - print("old_version successfully") - index.delete_ref_doc(file_ids) - - -# 示例用法 -if __name__ == "__main__": - filepath = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf" - unique_id = str(uuid.uuid4()) - knowledge_name="招标解析"+unique_id - # index = addfileToKnowledge(filepath,knowledge_name) - index = DashScopeCloudIndex("招标解析e8cc45f4-cd41-47cf-a5e6-2b7885debfff") - # 删除文件 - # deleteFileFromKnowledge(index, document) - deleteKnowledge(index) diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询.py deleted file mode 100644 index 6a5547e..0000000 --- a/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import List -from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client -from alibabacloud_tea_openapi import models as open_api_models -from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models -from alibabacloud_tea_util.client import Client as UtilClient -from alibabacloud_credentials.client import Client as CredClient - -def query(): - # 使用默认凭证初始化Credentials Client。 - cred=CredClient() - config = open_api_models.Config( - # 通过credentials获取配置中的AccessKey ID - cred.get_credential().access_key_id, - # 通过credentials获取配置中的AccessKey Secret - cred.get_credential().access_key_secret, - ) - # 访问的域名 - config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' - client = docmind_api20220711Client(config) - request = docmind_api20220711_models.QueryDocParserStatusRequest( - # id : 任务提交接口返回的id - id='docmind-20241008-24363df30d274863894f037dbb7244e8' - ) - try: - # 复制代码运行请自行打印 routes 的返回值 - response = client.query_doc_parser_status(request) - # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。获取属性值均以小写开头 - # 获取返回结果。建议先把response.body.data转成json,然后再从json里面取具体需要的值。 - print(response.body.data.status) - except Exception as error: - # 如有需要,请打印 error - 
UtilClient.assert_as_string(error.message) -query() \ No newline at end of file diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交.py deleted file mode 100644 index 2088e74..0000000 --- a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交.py +++ /dev/null @@ -1,40 +0,0 @@ -from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client -from alibabacloud_tea_openapi import models as open_api_models -from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models -from alibabacloud_tea_util.client import Client as UtilClient -from alibabacloud_tea_util import models as util_models -from alibabacloud_credentials.client import Client as CredClient - -def submit_file(): - # 使用默认凭证初始化Credentials Client。 - cred=CredClient() - config = open_api_models.Config( - # 通过credentials获取配置中的AccessKey ID - # access_key_id=cred.get_access_key_id(), - cred.get_credential().access_key_id, - # 通过credentials获取配置中的AccessKey Secret - # access_key_secret=cred.get_access_key_secret() - cred.get_credential().access_key_secret, - ) - # 访问的域名 - config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' - client = docmind_api20220711Client(config) - request = docmind_api20220711_models.SubmitDocParserJobAdvanceRequest( - # file_url_object : 本地文件流 - file_url_object=open("zbtest4.pdf", "rb"), - # file_name :文件名称。名称必须包含文件类型 - file_name='zbtest4.pdf' - # file_name_extension : 文件后缀格式。与文件名二选一 - # file_name_extension='pdf' - ) - runtime = util_models.RuntimeOptions() - try: - # 复制代码运行请自行打印 routes 的返回值 - response = client.submit_doc_parser_job_advance(request, runtime) - # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。如下示例为打印返回的业务id格式 - # 获取属性值均以小写开头, - print(response.body.data.id) - except Exception as error: - # 如有需要,请打印 error - UtilClient.assert_as_string(error.message) -submit_file() \ No newline at end of file diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取.py deleted file mode 100644 index 21b85f8..0000000 --- a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取.py +++ /dev/null @@ -1,35 +0,0 @@ -from typing import List -from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client -from alibabacloud_tea_openapi import models as open_api_models -from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models -from alibabacloud_tea_util.client import Client as UtilClient -from alibabacloud_credentials.client import Client as CredClient - -def query(): - # 使用默认凭证初始化Credentials Client。 - cred=CredClient() - config = open_api_models.Config( - # 通过credentials获取配置中的AccessKey ID - cred.get_credential().access_key_id, - # 通过credentials获取配置中的AccessKey Secret - cred.get_credential().access_key_secret, - ) - # 访问的域名 - config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' - client = docmind_api20220711Client(config) - request = docmind_api20220711_models.GetDocParserResultRequest( - # id : 任务提交接口返回的id - id='docmind-20241008-24363df30d274863894f037dbb7244e8', - layout_step_size=10, - layout_num=0 - ) - try: - # 复制代码运行请自行打印 routes 的返回值 - response = client.get_doc_parser_result(request) - # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。获取属性值均以小写开头 - # 获取返回结果。建议先把response.body.data转成json,然后再从json里面取具体需要的值。 - print(response.body.data) - except Exception as error: - # 如有需要,请打印 error - UtilClient.assert_as_string(error.message) -query() \ No newline at end of file diff --git a/flask_app/old_version/无效标和废标和禁止投标整合.py 
b/flask_app/old_version/无效标和废标和禁止投标整合.py deleted file mode 100644 index 20fdb94..0000000 --- a/flask_app/old_version/无效标和废标和禁止投标整合.py +++ /dev/null @@ -1,440 +0,0 @@ -# -*- coding: utf-8 -*- -import json -import os.path -import time -import re - -from flask_app.general.format_change import pdf2docx -from flask_app.general.通义千问long import upload_file, qianwen_long -from concurrent.futures import ThreadPoolExecutor - -from flask_app.general.table_content_extraction import extract_tables_main -from flask_app.old_version.不得存在及禁止投标情形 import find_forbidden, process_string_list - - -#处理跨页的段落 -def preprocess_paragraphs(paragraphs): - processed = [] # 初始化处理后的段落列表 - index = 0 - while index < len(paragraphs): - current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 - - # 检查当前段落是否为空 - if current_text == '': - # 确保有前一个和后一个段落 - if 0 < index < len(paragraphs) - 1: - prev_text = paragraphs[index - 1].text.strip() # 获取前一个段落的文本 - next_text = paragraphs[index + 1].text.strip() # 获取后一个段落的文本 - - # 检查前一个段落的文本是否不以标点符号结尾 - if not prev_text.endswith((',', ',', '。', '!', '?')): - # 定义列表项的模式 - list_item_pattern = r'^\s*([($]\d+[)$]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)' - # 检查前一个段落是否以列表模式开头,且后一个段落不以列表模式开头 - # is_prev_list = re.match(list_item_pattern, prev_text) - is_next_list = re.match(list_item_pattern, next_text) - if not is_next_list and len(prev_text) > 30: - # 合并前一个和后一个段落的文本 - merged_text = prev_text + next_text # 为了可读性添加空格 - if processed: - processed[-1] = merged_text # 更新处理后的最后一个段落 - else: - processed.append(merged_text) # 如果列表为空,直接添加合并后的文本 - - # 跳过下一个段落,因为它已经被合并 - index += 2 - continue - else: - # 非空段落,添加到处理后的列表中 - processed.append(current_text) - index += 1 - - return processed - -#如果当前段落有序号,则向下匹配直接遇到相同的序号样式 -#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 - -def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): - from collections import OrderedDict - from docx import Document - import re - - if isinstance(keywords, str): - keywords = [keywords] - - doc = Document(doc_path) - extracted_paragraphs = OrderedDict() - continue_collecting = False - current_section_pattern = None - active_key = None - - def match_keywords(text, patterns): - return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) - - def extract_from_text(text, current_index): - nonlocal continue_collecting, current_section_pattern, active_key - if text == "": - return current_index - - if continue_collecting: - if current_section_pattern and re.match(current_section_pattern, text): - continue_collecting = False - active_key = None - else: - if active_key is not None: - extracted_paragraphs[active_key].append(text) - return current_index - - if match_keywords(text, keywords): - active_key = text - extracted_paragraphs[active_key] = [text] - if match_keywords(text, follow_up_keywords): - continue_collecting = True - section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text) # 修改后的正则,支持 '数字 、' 和 '数字.' 
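# ---- Editor's note: illustrative sketch, not part of the original file. The
# `if section_number:` branch below derives the "stop patterns" that end collection:
# any heading of the same depth, the specific next sibling, and the next parent
# section. A self-contained demo of that derivation (the heading "3.4.5" is invented):
import re

def _demo_stop_patterns(current: str = "3.4.5") -> re.Pattern:
    level_count = current.count('.')
    pats = [r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+']  # any heading of equal depth
    parts = current.split('.')
    parts[-1] = str(int(parts[-1]) + 1)                        # 3.4.5 -> 3.4.6 (next sibling)
    pats.append(r'^' + r'\s*\.\s*'.join(parts))
    if len(parts) > 1:
        parent = parts[:-1]
        parent[-1] = str(int(parent[-1]) + 1)                  # 3.4 -> 3.5 (next parent)
        pats.append(r'^' + r'\s*\.\s*'.join(parent))
    return re.compile(r'(' + r')|('.join(pats) + r')')

assert _demo_stop_patterns().match("3.5 评标办法")  # a later parent-level heading stops collection
# ---- end editor's note ----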
- if section_number: - current_section_number = section_number.group(1) - level_count = current_section_number.count('.') - - # Pattern to match current level, e.g., 3.4.5 - pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+' - # Generate patterns for next section at same level and parent level - parts = current_section_number.split('.') - matched_patterns = [pattern] # start with the full pattern - - # Next section at same level - parts[-1] = str(int(parts[-1]) + 1) - next_pattern = r'^' + r'\s*\.\s*'.join(parts) - matched_patterns.append(next_pattern) - - # Parent section (if applicable) - if len(parts) > 1: - parent_section_parts = parts[:-1] - parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) - parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) - matched_patterns.append(parent_pattern) - - # Combine the patterns - combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' - current_section_pattern = re.compile(combined_pattern) - - else: - found_next_number = False - current_section_pattern = None - - while current_index < len(doc.paragraphs) - 1: - current_index += 1 - next_text = doc.paragraphs[current_index].text.strip() - if not found_next_number: - next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) - if next_section_number: - found_next_number = True - if next_section_number.group(1): - section_parts = next_section_number.group(1).split('.') - dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' - elif next_section_number.group(2): - dynamic_pattern = r'^[\(\(]\d+[\)\)]' - current_section_pattern = re.compile(dynamic_pattern) - if current_section_pattern and re.match(current_section_pattern, next_text): - extracted_paragraphs[active_key].append(next_text) - else: - continue_collecting = False - active_key=None - break - - return current_index - - processed_paragraphs = preprocess_paragraphs(doc.paragraphs) - index = 0 - while index < len(processed_paragraphs): - index = extract_from_text(processed_paragraphs[index].strip(), index) - index += 1 - - return extracted_paragraphs - -def preprocess_text_list(text_list): - new_text_list = [] - # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 - split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') - for text in text_list: - # 使用正则表达式检查并拆分元素 - parts = split_pattern.split(text) - new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 - - return new_text_list - -def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 - all_texts1 = [] - all_texts2=[] - # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 - split_pattern = r'(?<=[。!?\!\?])' - - for key, text_list in extracted_contents.items(): - if len(text_list) == 1: - for data in text_list: - # 检查是否包含任何需要排除的字符串 - if any(exclude in data for exclude in excludes): - continue # 如果包含任何排除字符串,跳过这个数据 - # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - data = re.sub(pattern, '', data).strip() - keyword_match = re.search(keywords, data) - if keyword_match: - # 从关键词位置开始查找结束标点符号 - start_pos = keyword_match.start() - # 截取从关键词开始到后面的内容 - substring = data[start_pos:] - # 按定义的结束标点分割 - sentences = re.split(split_pattern, substring, 1) - if len(sentences) > 0 and sentences[0]: - # 只取第一句,保留标点 - cleaned_text = data[:start_pos] + sentences[0] #eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。 - # 
经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。 - else: - cleaned_text = data # 如果没有标点,使用整个字符串 - else: - # 如果没有找到关键词,保留原文本 - cleaned_text = data - # 删除空格 - cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '') - # 如果长度大于8,则添加到结果列表 - if len(cleaned_text_no_spaces) > 8: - all_texts1.append(cleaned_text_no_spaces) - - else: - # print(text_list) - new_text_list=preprocess_text_list(text_list) - # print(new_text_list) - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - # data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 - data = re.sub(pattern, '', new_text_list[0]).replace(' ','').strip() - # 将修改后的第一个元素和剩余的元素连接起来 - new_text_list[0] = data # 更新列表中的第一个元素 - joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 - # 删除空格 - joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '') - all_texts2.append(joined_text_no_spaces) # 将每个列表的内容添加到 all_texts 中 - - return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 -def find_sentences_with_keywords(data, keywords, follow_up_keywords): - """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" - sentences1 = [] # 保存没有后续关键词的情况 - sentences2 = [] # 保存有后续关键词的情况 - - if isinstance(data, dict): - for value in data.values(): - result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) - sentences1.extend(result1) - sentences2.extend(result2) - elif isinstance(data, list): - for item in data: - result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) - sentences1.extend(result1) - sentences2.extend(result2) - elif isinstance(data, str): - # 分割句子,保证句子完整性(按标点符号和序号分割) - # split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割 - split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data) - i = 0 - while i < len(split_sentences): - sentence = split_sentences[i].strip() - if re.search(keywords, sentence, re.IGNORECASE): - follow_up_present = any( - re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) - if follow_up_present: - # 如果存在后续关键词,则从当前位置开始截取 - start_index = i - end_index = start_index - found_next_section = False - for j in range(start_index + 1, len(split_sentences)): - if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): - end_index = j - found_next_section = True - break - - if found_next_section: - full_text = ' '.join(split_sentences[start_index:end_index]).strip() - else: - full_text = ' '.join(split_sentences[start_index:]).strip() - # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - # data=re.sub(pattern,'',full_text) - data = re.sub(pattern, '', full_text).replace(' ','').strip() - sentences2.append(data) # 存储有后续关键词的情况 - i = end_index if found_next_section else len(split_sentences) - else: - # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - # data = re.sub(pattern, '', sentence).replace('\n','').strip() - cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ','').strip() - if len(cleaned_sentence) > 8: - sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况 - i += 1 - else: - i += 1 - - return sentences1, sentences2 # 返回两个列表 - -def extract_sentences_from_json(json_path, keywords,follow_up_keywords): - if not json_path: - return [],[] - with open(json_path, 'r', 
encoding='utf-8') as file: - data = json.load(file) - """从JSON数据中提取包含关键词的句子。""" - return find_sentences_with_keywords(data, keywords,follow_up_keywords) - -#处理无效投标 -def extract_values_if_contains(data, includes): - """ - 递归检查字典中的值是否包含列表 'includes' 中的内容。 - 如果包含,将这些值添加到一个列表中并返回。 - - 参数: - data (dict): 字典或从 JSON 解析得到的数据。 - includes (list): 包含要检查的关键词的列表。 - - 返回: - list: 包含满足条件的值的列表。 - """ - included_values = [] # 初始化结果列表 - - # 定义递归函数来处理嵌套字典 - def recursive_search(current_data): - if isinstance(current_data, dict): - for key, value in current_data.items(): - if isinstance(value, dict): - # 如果值是字典,递归搜索 - recursive_search(value) - elif isinstance(value, str): - # 如果值是字符串,检查是否包含任何 includes 中的关键词 - if any(include in value for include in includes): - included_values.append(value) - elif isinstance(current_data, list): - for item in current_data: - # 如果是列表,递归每个元素 - recursive_search(item) - - # 开始递归搜索 - recursive_search(data) - - return included_values - - -#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。 -#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 -#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", - -#TODO:truncate_json_path为空的时候,单独提取表格数据 -def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): - try: - excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] - follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] - extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 提取正文(除表格) - # print(extracted_contents) - all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 - all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) # 提取表格数据(json_data) - qianwen_txt = all_texts1 + all_tables1 - selected_contents = set() # 使用 set 去重 - - if qianwen_txt: - with open(output_file, 'w', encoding='utf-8') as file: - counter = 1 - for content in qianwen_txt: - file.write("..............." + '\n') - file.write(f"{counter}. 
{content}\n") - counter += 1 - - file_id = upload_file(output_file) - # qianwen_ans = qianwen_long(file_id, user_query) - qianwen_ans = qianwen_long(file_id, user_query) - num_list = process_string_list(qianwen_ans) - print(result_key + "选中的序号:" + str(num_list)) - - for index in num_list: - if 1 <= index <= len(qianwen_txt): - content = qianwen_txt[index - 1] - selected_contents.add(content) - - # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 - selected_contents.update(all_texts2) - selected_contents.update(all_tables2) - - # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 - if selected_contents: - res = {result_key: list(selected_contents)} - else: - res = {result_key: ""} - - return res - except Exception as e: - print(f"handle_query 在处理 {result_key} 时发生异常: {e}") - return {result_key: ""} - -def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification): - tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx - truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir) # 投标人须知前附表docx->json - queries = [ - ( - r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', - "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", - os.path.join(output_dir, "temp1.txt"), - "否决和无效投标情形" - ), - ( - r'废\s*标', - "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", - os.path.join(output_dir, "temp2.txt"), - "废标项" - ) - ] - results = [] - # 使用线程池来并行处理查询 - with ThreadPoolExecutor() as executor: - futures = [] - for keywords, user_query, output_file, result_key in queries: - future = executor.submit( - handle_query, - invalid_docpath, - user_query, - output_file, - result_key, - keywords, - truncate_jsonpath - ) - futures.append((future, result_key)) - time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 - # 按照提交的顺序收集结果 - for future, result_key in futures: - try: - result = future.result() - except Exception as e: - print(f"线程处理 {result_key} 时出错: {e}") - result = {result_key: ""} - results.append(result) - - # 禁止投标(find_forbidden)部分 - try: - # print("starting不得存在的情形...") - forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification) - except Exception as e: - print(f"find_forbidden 处理时出错: {e}") - forbidden_res = {'不得存在的其他情形': ""} - results.append(forbidden_res) - - combined_dict = {} - for d in results: - combined_dict.update(d) - # print("无效标与废标done...") - return {"无效标与废标项": combined_dict} - -if __name__ == '__main__': - start_time = time.time() - tobidders_notice_table="" - clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" - qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" - output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" - # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' - invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' - results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification) - end_time = time.time() - print("Elapsed time:", str(end_time - start_time)) - print("Results:", 
json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/old_version/解析old.py b/flask_app/old_version/解析old.py deleted file mode 100644 index c977194..0000000 --- a/flask_app/old_version/解析old.py +++ /dev/null @@ -1,221 +0,0 @@ -# -*- encoding:utf-8 -*- -import json -import logging -import time -from concurrent.futures import ThreadPoolExecutor -from flask_app.main.截取pdf import truncate_pdf_multiple -from flask_app.general.table_content_extraction import extract_tables_main -from flask_app.main.提取json工程标版 import convert_clause_to_json -from flask_app.general.json_utils import transform_json_values -from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid -from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice -import concurrent.futures -from flask_app.main.基础信息整合快速版 import combine_basic_info -from flask_app.main.资格审查模块 import combine_review_standards -from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards -from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx -from flask_app.general.docx截取docx import copy_docx - -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger - -logger=None - -# 创建全局线程池 -executor = ThreadPoolExecutor() -def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): - logger.info("starting 文件预处理...") - logger.info("output_folder..." + output_folder) - - # 根据文件类型处理文件路径 - if file_type == 1: # docx - docx_path = downloaded_file_path - pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 - elif file_type == 2: # pdf - pdf_path = downloaded_file_path - docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 - elif file_type == 3: #doc - pdf_path=docx2pdf(downloaded_file_path) - docx_path=doc2docx(downloaded_file_path) - else: - logger.error("Unsupported file type provided. 
Preprocessing halted.") - return None - - # 调用截取PDF多次 - truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id) - - # 处理各个部分 - truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx - - invalid_path=truncate_files[5] - invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 - # invalid_docpath=pdf2docx(invalid_path) - truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json - truncate0 = truncate_files[0] - truncate1 = truncate_files[1] - truncate2=truncate_files[2] - truncate3 = truncate_files[3] - merged_baseinfo_path=truncate_files[-1] - clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json - - logger.info("文件预处理done") - - # 返回包含预处理后文件路径的字典 - return { - 'file_path': downloaded_file_path, - 'output_folder': output_folder, - 'invalid_path':invalid_path, - 'truncate0': truncate0, - 'truncate1': truncate1, - 'truncate2':truncate2, - 'truncate3': truncate3, - 'truncate0_jsonpath': truncate_jsonpath, - 'merged_baseinfo_path':merged_baseinfo_path, - 'clause_path': clause_path, - 'invalid_docpath': invalid_docpath - } - - -def post_processing(data,includes): - # 初始化结果字典,预设'其他'分类为空字典 - result = {"其他": {}} - - # 遍历原始字典的每一个键值对 - for key, value in data.items(): - if key in includes: - # 如果键在includes列表中,直接保留这个键值对 - result[key] = value - else: - # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 - result["其他"][key] = value - - # 如果'其他'分类没有任何内容,可以选择删除这个键 - if not result["其他"]: - del result["其他"] - - return result -# 基本信息 -def fetch_project_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path): # 投标人须知前附表 - logger.info("starting基础信息...") - basic_res = combine_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path) - logger.info("基础信息done") - return basic_res - - -# 形式、响应、资格评审 -def fetch_qualification_review(truncate1, truncate3,output_folder, truncate0_jsonpath, clause_path,invalid_path,merged_baseinfo_path): - logger.info("starting资格审查...") - review_standards_res = combine_review_standards(truncate1, truncate3,output_folder, truncate0_jsonpath, - clause_path,invalid_path,merged_baseinfo_path) - logger.info("资格审查done") - return review_standards_res - - -# 评分细则 流式 -def fetch_evaluation_standards(truncate1): # 评标办法前附表 - logger.info("starting 商务标和技术标...") - - # 获取评标办法前附表的字典结果 - evaluation_standards_res = combine_evaluation_standards(truncate1) - - # 获取技术标和商务标 - technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} - commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} - - logger.info("商务标和技术标 done") - - # 返回将 "技术标" 和 "商务标" 包含在新的键中 - return { - "technical_standards": technical_standards, - "commercial_standards": commercial_standards - } - - -# 无效、废标项解析 -def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): - # 废标项要求:千问 - logger.info("starting无效标与废标...") - find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) - logger.info("无效标与废标done...") - return find_invalid_res - - -# 投标文件要求 -def fetch_bidding_documents_requirements(clause_path): - logger.info("starting投标文件要求...") - fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) - logger.info("投标文件要求done...") - return {"投标文件要求":fetch_bidding_documents_requirements_json} - -# 开评定标流程 -def fetch_bid_opening(clause_path): - logger.info("starting开评定标流程...") - fetch_bid_opening_json = extract_from_notice(clause_path, 2) - logger.info("开评定标流程done...") - return 
{"开评定标流程":fetch_bid_opening_json} - -#分段返回 -def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): - global logger - logger = get_global_logger(unique_id) - # 预处理文件,获取处理后的数据 - processed_data = preprocess_files(output_folder, downloaded_file_path, file_type) - if not processed_data: - yield json.dumps({}) # 如果处理数据失败,返回空的 JSON - - with concurrent.futures.ThreadPoolExecutor() as executor: - # 立即启动不依赖 knowledge_name 和 index 的任务 - futures = { - 'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'], - processed_data['truncate0'], - processed_data['truncate2'], processed_data['clause_path']), - 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], - processed_data['truncate3'], output_folder, - processed_data['truncate0_jsonpath'], - processed_data['clause_path'], processed_data['invalid_path'], - processed_data['merged_baseinfo_path']), - 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), - 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], - output_folder, processed_data['truncate0_jsonpath'], - processed_data['clause_path'], processed_data['truncate3']), - 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']), - 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) - } - - # 提前处理这些不依赖的任务,按完成顺序返回 - for future in concurrent.futures.as_completed(futures.values()): - key = next(k for k, v in futures.items() if v == future) - try: - result = future.result() - - # 如果是 evaluation_standards,拆分技术标和商务标 - if key == 'evaluation_standards': - technical_standards = result["technical_standards"] - commercial_standards = result["commercial_standards"] - - # 分别返回技术标和商务标 - yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False) - yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False) - - else: - # 处理其他任务的结果 - yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) - - except Exception as exc: - logger.error(f"Error processing {key}: {exc}") - yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) - -if __name__ == "__main__": - start_time = time.time() - output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1" - file_type = 2 #1:docx 2:pdf 3:其他 - input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf" - print("yes") - for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"): - print(output) - end_time = time.time() - elapsed_time = end_time - start_time # 计算耗时 - print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/old_version/资格审查模块old.py b/flask_app/old_version/资格审查模块old.py deleted file mode 100644 index 4f640f2..0000000 --- a/flask_app/old_version/资格审查模块old.py +++ /dev/null @@ -1,42 +0,0 @@ -import os - -from flask_app.main.提取json工程标版 import convert_clause_to_json -from flask_app.general.json_utils import extract_content_from_json -from flask_app.old_version.形式响应评审old import process_reviews -from flask_app.old_version.资格评审old import process_qualification -from flask_app.general.通义千问long import upload_file, qianwen_long -from concurrent.futures import ThreadPoolExecutor - - -def 
combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表 - # 形式评审、响应评审:千问 - file_id=upload_file(truncate1) #评标办法前附表 - user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" - results = qianwen_long(file_id, user_query_1) - original_dict_data = extract_content_from_json(results) - qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容 - with ThreadPoolExecutor() as executor: - # 创建Future对象 - future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name) - future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath, - clause_path,input_file,output_folder) - - # 等待执行结果 - final_qualify_json = future_qualification.result() - form_response_dict = future_form_response.result() - form_response_dict.update(final_qualify_json) - # return nest_json_under_key(form_response_dict,"资格审查") - return {"资格审查":form_response_dict} - -if __name__ == "__main__": - input_file="D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf" - output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21" - # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf") - truncate2=os.path.join(output_folder,"ztbfile_tobidders_notice.pdf") - knowledge_name="zbtest20" - truncate1=os.path.join(output_folder,"ztbfile_evaluation_method.pdf") - truncate3=os.path.join(output_folder,"ztbfile_qualification.pdf") - clause_path = convert_clause_to_json(truncate2, output_folder) - truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json") - res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder) - print(res) \ No newline at end of file diff --git a/flask_app/old_version/资格评审old.py b/flask_app/old_version/资格评审old.py deleted file mode 100644 index 58a3c84..0000000 --- a/flask_app/old_version/资格评审old.py +++ /dev/null @@ -1,182 +0,0 @@ -# -*- encoding:utf-8 -*- -# 资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典 -import json -import re -from flask_app.general.json_utils import clean_json_string, combine_json_results, add_keys_to_json -from flask_app.general.多线程提问 import multi_threading, read_questions_from_file -from flask_app.general.通义千问long import upload_file - - -def merge_dictionaries_under_common_key(dicts, common_key): - # 初始化一个空字典来保存合并的结果 - merged_dict = {common_key: {}} - - # 遍历列表中的每个字典 - for d in dicts: - if common_key in d: - # 使用字典解包来合并字典 - merged_dict[common_key].update(d[common_key]) - else: - print(f"资格评审: Warning: Dictionary does not contain the key {common_key}") - - return merged_dict - - -def generate_qual_question(matching_keys_list): # 这里假设资质、信誉与人员要求 要不都有、要不都没 - if not matching_keys_list: - return [] - else: - questions = [] - # 将列表转换为单引号包裹的格式,并用逗号和空格分隔 - formatted_keys = ["'{}'".format(key) for key in matching_keys_list] - # 将格式化后的关键词列表连接成字符串 - keys_string = "、".join(formatted_keys) - # 构造完整的问题语句 - question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string}," - "请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。") - question2 = "该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则无需返回该键值对。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。" - questions.append(question1) - 
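# ---- Editor's note: hedged illustration, not original code. For
# matching_keys_list = ['资质条件', '财务状况'], the join above yields
# keys_string == "'资质条件'、'财务状况'", so question1 asks the model for
# {"资格评审": {"资质条件": ..., "财务状况": ...}}. A quick check of the formatting step:
assert "、".join("'{}'".format(k) for k in ['资质条件', '财务状况']) == "'资质条件'、'财务状况'"
# ---- end editor's note ----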
questions.append(question2) - return questions - - -def extract_matching_keys_qual(dict_data): - # 定义包含模式的列表 - include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"), - re.compile(r"符合")] - # 初始化列表,用于存储匹配的键 - matching_keys = [] - non_matching_keys = {} - # 定义排除项 - excludes = ['联合体', '禁止投标', '不存在', '不得存在', '资格', '管理机构', '负责人', '人员'] # 联合体、禁止投标的情况、人员需要额外问 - # 遍历字典中的每个键值对 - for key, value in dict_data.items(): - if "附件" in key and "资质" in key: - return [], [] # 如果同时出现,立即返回空列表 - # 检查值是否符合任何一个包含模式 - if any(pattern.search(value) for pattern in include_patterns): - # 如果值符合包含模式,再检查键是否包含任何排除项 - if not any(ex in key for ex in excludes): - # 如果键不包含排除项,则添加到匹配键列表中 - matching_keys.append(key) - else: - # value中有实质的内容,不需要额外问。 - non_matching_keys[key] = value - - return matching_keys, non_matching_keys # matching:['资质条件', '财务状况'] non_matching_keys:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} - - -# 获取联合体投标的要求,由于它不一定在资格审查表中,故调用rag -def get_consortium_dict(knowledge_name): - qualify_list = [] - consortium_questions = [ - "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)',嵌套键名为你对该要求的总结,而键值需要完全与原文保持一致,不要擅自总结、删减。"] - results1 = multi_threading(consortium_questions, knowledge_name) - for _, response in results1: # _占位,代表ques;response[0]也是ques;response[1]是ans - try: - if response and len(response) > 1: # 检查response存在且有至少两个元素 - qualify_list.append(response[1]) - else: - print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") - except Exception as e: - print(f"资格评审: Error processing response for query index {_}: {e}") - consortium_dict = combine_json_results(qualify_list) - return consortium_dict - - -def get_all_dict(knowledge_name): - qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt' - questions = read_questions_from_file(qualification_review_file_path) - qualification_list = [] - res1 = multi_threading(questions, knowledge_name) - for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans - try: - if response and len(response) > 1: # 检查response存在且有至少两个元素 - qualification_list.append(response[1]) - else: - print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") - except Exception as e: - print(f"资格评审: Error processing response for query index {_}: {e}") - qualification_combined_res = combine_json_results(qualification_list) - return {'资格评审': qualification_combined_res} - - -def process_qualification(qualification_review, truncate3, knowledge_name): - # 资格评审 - matching_keys_list, non_matching_dict = extract_matching_keys_qual( - qualification_review) # matching_keys_list:['资质条件', '财务状况'] non_matching_dict:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} - if not matching_keys_list: - if not non_matching_dict: # 古法提取 - if truncate3 != "": # 提取到资格审查附件的情况 - print("资格评审: type1") - matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"] - ques = generate_qual_question(matching_keys_list) - file_id2 = upload_file(truncate3) - results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long - res_list = [] - if not results2: - print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") - else: - # 打印结果 - for question, response in results2: - res_list.append(clean_json_string(response)) # 都是问资格评审表得出的 - if res_list: - merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') - consortium_dict = get_consortium_dict(knowledge_name) - updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) - return updated_qualify_json - else: - print("资格评审: 无法获取大模型结果,返回空值") - return {"资格评审": ""} - 
else: - print("资格评审: type2") - return get_all_dict(knowledge_name) or {"资格评审": ""} - - else: # 此时要求全部写在评分办法前附表中,不需要额外提取。 - print("资格评审: type3") - new_non_matching_json = {'资格评审': non_matching_dict} - substring = '联合体' - found_key = any(substring in key for key in non_matching_dict.keys()) # 没有联合体投标,则需生成,防止重复 - if not found_key: - consortium_dict = get_consortium_dict(knowledge_name) - final_qualify_json = add_keys_to_json(new_non_matching_json, consortium_dict) - return final_qualify_json - else: - return new_non_matching_json or {"资格评审": ""} - - elif matching_keys_list and truncate3 == "": # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' - print("资格评审: type4") - final_qualification = get_all_dict(knowledge_name) - final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict) - return final_qualify_json or {"资格评审": ""} - else: # 大多数情况 - print("资格评审: type5") - user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’ - file_id2 = upload_file(truncate3) - results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long - res_list = [] - if not results2: - print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") - return {"资格评审": ""} - else: - # 打印结果 - for question, response in results2: - cleaned_res = clean_json_string(response) - res_list.append(cleaned_res) # 都是问资格评审表得出的 - merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') - consortium_dict = get_consortium_dict(knowledge_name) - updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典 - final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict) - return final_qualify_json or {"资格评审": ""} - - -if __name__ == "__main__": - # qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年(2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年(2017 年至今)须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'} - qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证', - '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'} - truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" - knowledge_name = "招标解析word13" - res = process_qualification(qualification_review, truncate3, knowledge_name) - print(json.dumps(res, ensure_ascii=False, indent=4)) - -# 该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。 diff --git a/flask_app/old_version/资格评审前判断.py b/flask_app/old_version/资格评审前判断.py deleted file mode 100644 index eb3dc87..0000000 --- a/flask_app/old_version/资格评审前判断.py +++ /dev/null @@ -1,29 +0,0 @@ -from flask_app.general.读取文件.按页读取pdf import extract_text_by_page - -def check_strings_in_pdf(file_path): - judge_list=['施工机械设备', '企业信息登记'] - # Read text from PDF - text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF - full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text - - # Initialize the questions list - ques_list = [] - - # Check for each string in the judge_list and construct questions accordingly - if judge_list[0] in full_text: - ques_list.append(f"该招标文件对于'{judge_list[0]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[0]}',若存在未知信息,在对应的键值中填'未知'。") - if len(judge_list) > 1 and judge_list[1] in full_text: - ques_list.append(f"该招标文件对于'{judge_list[1]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[1]}',若存在未知信息,在对应的键值中填'未知'。") - - 
if not ques_list: - return None - return ques_list - -# Test cases or example usage -if __name__ == '__main__': - file_path = 'C:/Users/Administrator/Desktop/zbtest18_39-45.pdf' # Replace with your actual PDF file path - judge_list = ['施工机械设备', '企业信息登记'] # List of strings to check in the PDF - - questions = check_strings_in_pdf(file_path) # the function takes a single argument; judge_list is hardcoded inside it - for question in questions or []: # guard against a None return - print(question) diff --git a/flask_app/old_version/通义千问.py b/flask_app/old_version/通义千问.py deleted file mode 100644 index 4a71a37..0000000 --- a/flask_app/old_version/通义千问.py +++ /dev/null @@ -1,48 +0,0 @@ -import json -import random -from http import HTTPStatus -from dashscope import Generation - -def call_with_messages(messages): - response = Generation.call(model="qwen-max", - messages=messages, - seed=random.randint(1, 10000), - temperature=0.5, - top_p=0.5, - top_k=50, - result_format='message') - if response.status_code == HTTPStatus.OK: - content = response.output['choices'][0]['message']['content'] - return content - else: - raise Exception(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}') - -def prepare_question_from_json(json_path, user_query): - with open(json_path, 'r', encoding='utf-8') as file: - json_data = json.load(file) - question = json.dumps(json_data, ensure_ascii=False) + user_query - return question - -#专用于判断是否 -def qianwen_ask(json_path, user_query): - messages = [] - question = prepare_question_from_json(json_path, user_query) - messages.append({'role': 'system', 'content': 'You are a helpful assistant.'}) - messages.append({'role': 'user', 'content': question}) - return call_with_messages(messages) - -#通用问题 -def qianwen_ask2(questions): - messages = [] - messages.append({'role': 'system', 'content': 'You are a helpful assistant.'}) - messages.append({'role': 'user', 'content': questions}) - return call_with_messages(messages) - -if __name__ == '__main__': - json_path = 'judge_exist.json' - prompt = "请你依据以上信息回答,是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 
是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否需要递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在未知或者矛盾信息,请回答'未知'。" - try: - content = qianwen_ask(json_path, prompt) - print(content) - except Exception as e: - print(f"An error occurred: {e}") diff --git a/flask_app/start_up.py b/flask_app/start_up.py index d3c8077..9194eb3 100644 --- a/flask_app/start_up.py +++ b/flask_app/start_up.py @@ -9,7 +9,7 @@ from flask_app.routes.get_deviation import get_deviation_bp from flask_app.routes.little_zbparse import little_zbparse_bp from flask_app.routes.upload import upload_bp from flask_app.routes.test_zbparse import test_zbparse_bp - +from flask_app.general.清除file_id import delete_file_by_ids,read_file_ids class FlaskAppWithLimiter(Flask): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -23,25 +23,6 @@ def create_app(): handler.setFormatter(CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) app.logger.addHandler(handler) app.logger.setLevel(logging.INFO) - # @app.before_request - # def before_request(): - # """ - # 每个请求开始前初始化 logger 和 output_folder, - # 根据请求的端点选择不同的子文件夹。 - # """ - # # 确定当前请求的端点 - # blueprint = request.blueprint - # # 映射端点到子文件夹 - # subfolder_map = { - # 'get_deviation': 'output3', - # 'little_zbparse': 'output2', - # 'upload': 'output1', - # 'test_zbparse': 'test_output' - # } - # # 获取对应的子文件夹,默认为 'output1' - # subfolder = subfolder_map.get(blueprint, 'output1') - # # 创建 logger 和 output_folder - # create_logger(app, subfolder) # 注册蓝图 app.register_blueprint(get_deviation_bp) @@ -58,6 +39,13 @@ def create_app(): monitor = getattr(g, 'monitor', None) if limiter and not monitor.is_timeout: limiter.semaphore.release() + output_folder = getattr(g, 'output_folder', None) + if output_folder: + # 执行与output_folder相关的清理操作(例如删除临时文件) + logger = g.logger + logger.info(f"正在清理输出文件夹: {output_folder}") + file_ids=read_file_ids(output_folder) + delete_file_by_ids(file_ids) return app #TODO:培训要求、总体要求、进度要求、'建设要求'到技术要求中,归类到其他要求中 diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index d0c5dfd..3691297 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -46,7 +46,6 @@ def find_exists(truncate_file, required_keys): start_match = re.search(begin_pattern, first_page_clean) if not start_match: print("未找到开始模式,返回完整第一页") - start_index = 0 first_content = first_page_clean else: start_index = start_match.end() @@ -186,7 +185,11 @@ def generate_template(required_keys, type=1): """ ) else: - specific_instructions = "4. 在提取技术要求或技术、服务要求时,你无需从采购清单或表格中提取技术要求以及参数要求,你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容;若内容全在表格中,键值为空列表[]。" + specific_instructions = textwrap.dedent( + """4. 在提取技术要求或技术、服务要求时,你无需从采购清单或表格中提取技术要求以及参数要求,你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容;若技术要求的内容全在表格中,键值为空列表[]。 + 5. 
在提取'技术要求'时,注意不要提取有关'售后、维护、运维、培训、质保'等要求,它们不属于'技术要求'。 + """ + ) return textwrap.dedent( f"""请你根据该货物类招标文件中的采购要求部分内容,请告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,可以按原文中的序号作划分(若有序号的话),请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增添内容及序号。请不要提取{another_keys_str}中的内容。 From 4b4937780ee1b6bdeca739fb1d889b6ac1304576 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 28 Nov 2024 13:37:26 +0800 Subject: [PATCH 3/8] =?UTF-8?q?11.27=20=E6=B8=85=E7=90=86file=5Fid?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1ed5d90..23b8124 100644 --- a/.gitignore +++ b/.gitignore @@ -42,7 +42,7 @@ flask_app/static/output/ *.rar # 忽略旧版本代码 -flask_app/old_version/ +#flask_app/old_version/ # 忽略操作系统特定的文件 .DS_Store From 493e8abf3134f967e2e47acb662946050ef16546 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 28 Nov 2024 13:38:17 +0800 Subject: [PATCH 4/8] =?UTF-8?q?11.28=20=E6=B8=85=E7=90=86file=5Fid?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../old_version/不得存在及禁止投标情形_old.py | 172 +++++++ flask_app/old_version/判断截取位置_old.py | 266 +++++++++++ flask_app/old_version/回答来源_old.py | 216 +++++++++ flask_app/old_version/基础信息整合_old.py | 166 +++++++ flask_app/old_version/多线程分类_old.py | 373 +++++++++++++++ flask_app/old_version/废标项_old.py | 66 +++ flask_app/old_version/招标文件解析_old.py | 305 ++++++++++++ flask_app/old_version/文件分类普通版_old.py | 350 ++++++++++++++ .../删除知识库_old.py | 55 +++ .../知识库操作_old.py | 58 +++ .../调用文件解析状态查询_old.py | 33 ++ .../调用文档解析异步提交_old.py | 40 ++ .../调用文档解析结果获取_old.py | 35 ++ .../无效标和废标和禁止投标整合_old.py | 440 ++++++++++++++++++ flask_app/old_version/解析old_old.py | 221 +++++++++ flask_app/old_version/资格审查模块old_old.py | 42 ++ flask_app/old_version/资格评审old_old.py | 182 ++++++++ flask_app/old_version/资格评审前判断_old.py | 29 ++ flask_app/old_version/通义千问_old.py | 48 ++ 19 files changed, 3097 insertions(+) create mode 100644 flask_app/old_version/不得存在及禁止投标情形_old.py create mode 100644 flask_app/old_version/判断截取位置_old.py create mode 100644 flask_app/old_version/回答来源_old.py create mode 100644 flask_app/old_version/基础信息整合_old.py create mode 100644 flask_app/old_version/多线程分类_old.py create mode 100644 flask_app/old_version/废标项_old.py create mode 100644 flask_app/old_version/招标文件解析_old.py create mode 100644 flask_app/old_version/文件分类普通版_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py create mode 100644 flask_app/old_version/无效标和废标和禁止投标整合_old.py create mode 100644 flask_app/old_version/解析old_old.py create mode 100644 flask_app/old_version/资格审查模块old_old.py create mode 100644 flask_app/old_version/资格评审old_old.py create mode 100644 flask_app/old_version/资格评审前判断_old.py create mode 100644 flask_app/old_version/通义千问_old.py diff --git a/flask_app/old_version/不得存在及禁止投标情形_old.py b/flask_app/old_version/不得存在及禁止投标情形_old.py new file mode 100644 index 0000000..9a4fe1b --- /dev/null +++ b/flask_app/old_version/不得存在及禁止投标情形_old.py @@ -0,0 +1,172 @@ +import json +import os +import re + +from PyPDF2 import PdfWriter, PdfReader + +from flask_app.general.通义千问long import 
upload_file, qianwen_long +from flask_app.general.通用功能函数 import process_string_list + + +def extract_and_format_from_paths(json_paths, includes, excludes): + """ + 从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。 + + 参数: + json_paths (list): 包含多个 JSON 文件路径的列表。 + includes (list): 包含要检查的关键词的列表。 + excludes (list): 包含要排除的关键词的列表。 + + 返回: + list: 包含所有文件中满足条件的格式化字符串列表。 + """ + all_formatted_results = [] + + # 遍历每个文件路径 + for path in json_paths: + # 检查路径是否为空或不存在 + if not path or not os.path.isfile(path): + print(f"禁止投标情形: Error: The file path '{path}' is invalid or does not exist.") + continue # 跳过无效路径 + try: + with open(path, 'r', encoding='utf-8') as file: + # 加载 JSON 数据 + json_data = json.load(file) + formatted_results = [] + + # 遍历 JSON 数据的每个键值对 + for key, value in json_data.items(): + if isinstance(value, dict): + # 如果值是字典,检查嵌套字典的每个键值对 + for sub_key, sub_value in value.items(): + if any(include in sub_key for include in includes): + # 如果子值包含关键词,格式化并添加到结果列表 + formatted_results.append(f"{sub_value}") + elif isinstance(value, str): # clause + # 检查是否包含任何 include 关键词 + for include in includes: + if include in value: + # 找到 include 之前的内容 + prefix = value.split(include)[0] + # 检查 prefix 是否不包含任何 exclude 关键词 + if not any(exclude in prefix for exclude in excludes): + # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表 + if '\n' in value: + value = value.split('\n', 1)[-1] + formatted_results.append(value) + break # 找到一个符合条件的就跳出循环 + + # 将当前文件的结果添加到总结果列表 + all_formatted_results.extend(formatted_results) + except FileNotFoundError: + print(f"禁止投标情形: Error: The file '{path}' does not exist.") + except json.JSONDecodeError: + print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.") + + return all_formatted_results + +def extract_unique_items_from_texts(texts): + # 更新正则表达式以包括更广泛的序号类型,包括中文序号 + pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*') + intro_pattern = re.compile(r'^.*?[::]') + punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$') + url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') + content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字 + + all_results = [] + seen = set() + + for text in texts: + # 去除文本中的制表符和换行符 + text = text.replace('\t', '').replace('\n', '') + + # 删除引导性的文本(直到冒号,但保留冒号后的内容) + text = intro_pattern.sub('', text) + + # 替换URL为占位符,并保存URL以便后续还原 + urls = [] + def url_replacer(match): + urls.append(match.group(0)) + return f"{{URL{len(urls)}}}" + text = url_pattern.sub(url_replacer, text) + + # 使用数字和括号的模式分割文本 + items = pattern.split(text) + + for item in items: + cleaned_item = item.strip() + if cleaned_item: + # 进一步清理每个条目 + cleaned_item = pattern.sub('', cleaned_item) + cleaned_item = punctuation_pattern.sub('', cleaned_item) + cleaned_item = cleaned_item.strip() + + # 还原URL + for i, url in enumerate(urls, 1): + cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url) + + # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符 + if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item): + seen.add(cleaned_item) + all_results.append(cleaned_item) + + return all_results + +def merge_pdfs(paths, output_filename): + pdf_writer = PdfWriter() + output_path = None + + for path in paths: + pdf_reader = PdfReader(path) + for page in range(len(pdf_reader.pages)): + # 将每页添加到写入对象中 + pdf_writer.add_page(pdf_reader.pages[page]) + + if output_path is None: + # 确定输出文件路径 + output_path = os.path.join(os.path.dirname(path), output_filename) 
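# ---- Editor's note: hypothetical usage, paths invented. output_path is assigned on the
# first loop iteration only, so the merged PDF always lands next to the *first* input
# (posixpath is used here just to keep the demo deterministic across platforms;
# the real function uses os.path):
import posixpath
_inputs = ["C:/bids/part1.pdf", "D:/tmp/part2.pdf"]
assert posixpath.join(posixpath.dirname(_inputs[0]), "merged.pdf") == "C:/bids/merged.pdf"
# ---- end editor's note ----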
+ + # 写入合并的PDF到文件 + if output_path: + with open(output_path, 'wb') as out: + pdf_writer.write(out) + print(f"禁止投标情形: Merged PDF saved to {output_path}") + else: + print("禁止投标情形: No files to merge.") + return output_path + + +def find_forbidden(qualification=""): + try: + if qualification: + file_id = upload_file(qualification) + user_query_forbidden = ( + "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...]," + "请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。" + ) + qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden) + else: + qianwen_forbidden_str = "[]" + + actual_list = process_string_list(qianwen_forbidden_str) # 提取出字符串列表 ["xxx","xx"] + includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"] + excludes = ["招标", "评标", "定标"] + # forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes) + # processed_results = extract_unique_items_from_texts(forbidden_results) + # merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) + merged_forbidden_list = list(dict.fromkeys(actual_list)) + forbidden_dict = {'不得存在的其他情形': merged_forbidden_list} + return forbidden_dict + except Exception as e: + print(f"find_forbidden 在处理时发生异常: {e}") + return {'不得存在的其他情形': ""} + +#TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。 +if __name__ == '__main__': + # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" + # clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f" + doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx' + res = find_forbidden(qualification) + print(json.dumps(res, ensure_ascii=False, indent=4)) \ No newline at end of file diff --git a/flask_app/old_version/判断截取位置_old.py b/flask_app/old_version/判断截取位置_old.py new file mode 100644 index 0000000..c0b1390 --- /dev/null +++ b/flask_app/old_version/判断截取位置_old.py @@ -0,0 +1,266 @@ +# -*- encoding:utf-8 -*- +import json +import re +import os +import sys +from pathlib import Path +from openai import OpenAI +import concurrent.futures +import queue +import time + +def qianwen_long(file_id, user_query): + print("call qianwen-long...") + """ + Uses a previously uploaded file to generate a response based on a user query. + """ + client = OpenAI( + api_key=os.getenv("DASHSCOPE_API_KEY"), + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + ) + + # Generate a response based on the file ID + completion = client.chat.completions.create( + model="qwen-long", + top_p=0.5, + temperature=0.4, + messages=[ + { + 'role': 'system', + 'content': f'fileid://{file_id}' + }, + { + 'role': 'user', + 'content': user_query + } + ], + stream=False + ) + + # Return the response content + return completion.choices[0].message.content + +def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type): + + if llm_type==2: + print(f"qianwen_long! 
question:{question}") + qianwen_res = qianwen_long(file_id,question) + result_queue.put((ans_index,(question,qianwen_res))) + return +def multi_threading(queries, knowledge_name="", file_id="", llm_type=1): + if not queries: + return [] + + print("多线程提问:starting multi_threading...") + result_queue = queue.Queue() + + # 使用 ThreadPoolExecutor 管理线程 + with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor: + # 逐个提交任务,每提交一个任务后休眠1秒 + future_to_query = {} + for index, query in enumerate(queries): + future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type) + future_to_query[future] = index + time.sleep(1) # 每提交一个任务后等待1秒 + + # 收集每个线程的结果 + for future in concurrent.futures.as_completed(future_to_query): + index = future_to_query[future] + try: + future.result() # 捕获异常或确认任务完成 + except Exception as exc: + print(f"Query {index} generated an exception: {exc}") + # 确保在异常情况下也向 result_queue 添加占位符 + result_queue.put((index, None)) + + # 从队列中获取所有结果并按索引排序 + results = [None] * len(queries) + while not result_queue.empty(): + index, result = result_queue.get() + results[index] = result + # 检查是否所有结果都是 None + if all(result is None for result in results): + return [] + # 过滤掉None值 + results = [r for r in results if r is not None] + # 返回一个保证是列表的结构 + return results + +def upload_file(file_path): + """ + Uploads a file to DashScope and returns the file ID. + """ + client = OpenAI( + api_key=os.getenv("DASHSCOPE_API_KEY"), + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + ) + file = client.files.create(file=Path(file_path), purpose="file-extract") + return file.id + +def load_data(json_path): + """ + 从指定的JSON文件中加载数据。 + + Args: + json_path (str): JSON文件的路径。 + + Returns: + dict: 加载的JSON数据字典。 + """ + try: + with open(json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + return data + except FileNotFoundError: + print(f"错误:文件未找到 - {json_path}") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"错误:解析JSON文件时出错 - {e}") + sys.exit(1) + + +def define_target_names(): + """ + 定义目标名称列表。 + + Returns: + list: 目标名称列表。 + """ + return [ + "营业执照", + # "开户信息", + "法定代表人身份证", + # "法定代表人授权人身份证", + "人员证书", + "人员社保资料", + # "劳动合同", + "企业证书", + "企业业绩", + "财务审计报告", + "缴纳税收证明", + "公司缴纳社保证明" + ] + + +def generate_user_query(target, chapters, keywords): + """ + 根据目标、章节和关键词生成用户查询模板。 + + Args: + target (str): 目标名称。 + chapters (list): 相关章节列表。 + keywords (list): 相关关键词列表。 + + Returns: + str: 生成的用户查询字符串。 + """ + template3 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},并确认插入的位置。我已在原文中打上若干待插入位置的标记,形如'[$$第5个待插入位置$$]',它的标记与它上面的小节内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$第5个待插入位置$$]'中的'5',而不是页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[17, 19],若插入位置不明确,那么返回[-1]。 +""" + template2 = f"""你是一个专业撰写投标文件的专家,这是投标文件模板,作为投标人,你需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},或者根据文中'备注'或'附'内的信息确认插入的位置。目前原文中各章节末尾已打上待插入位置的标记,标记格式为'[$$当前章节名:第x个待插入位置$$]'形如'[$$四、投标保证金:第4个待插入位置$$]',每个标记与它紧跟着的上面的这一章内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$四、投标保证金:第4个待插入位置$$]'中的'4',而不是当前页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[4, 5],若插入位置不明确,那么返回[-1]。 + """ + template1 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in 
keywords])},并确认插入的位置。我已在原文中各章节末尾打上待插入位置的标记,标记格式为'[$$当前章节名:第x个待插入位置$$]'形如'[$$四、投标保证金:第4个待插入位置$$]',每个标记与它紧跟着的上面的这一章内容关联。你需要返回给我{target}应该插入位置的标记数字序号,即'[$$四、投标保证金:第4个待插入位置$$]'中的'4',而不是当前页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[4, 5],若插入位置不明确,那么返回[-1]。 + """ + + return template1 + + +def generate_user_queries(target_names, data_dict): + """ + 为每个目标生成对应的用户查询。 + + Args: + target_names (list): 目标名称列表。 + data_dict (dict): 数据字典。 + + Returns: + list: 包含目标和查询的字典列表。 + """ + user_queries = [] + for target in target_names: + if target in data_dict: + chapters = data_dict[target].get("章节", []) + keywords = data_dict[target].get("关键字", []) + query = generate_user_query(target, chapters, keywords) + user_queries.append({ + "target": target, + "query": query + }) + else: + print(f"警告:'{target}'未在数据字典中找到相关信息。") + return user_queries + + +def process_string_list(string_list): + """ + 处理字符串列表,提取方括号内的内容并转换为实际列表。 + + Args: + string_list (str): 包含方括号的字符串。 + + Returns: + list: 解析后的列表内容。 + """ + match = re.search(r'\[(.*?)\]', string_list) + if match: + content_inside = match.group(1).strip() + if content_inside: + items = [item.strip() for item in content_inside.split(',')] + if all(item.isdigit() for item in items): + formatted_list = [int(item) for item in items] + else: + formatted_list = items + return formatted_list + return [] + + +def main(): + # 定义JSON文件路径 + # json_path = "flask_app/general/static/插入位置.json" + json_path = "/flask_app/general/static/插入位置.json" + # 加载数据 + data_dict = load_data(json_path) + + # 定义目标名称 + target_names = define_target_names() + + # 生成用户查询列表 + user_query_list = generate_user_queries(target_names, data_dict) + + if not user_query_list: + print("没有生成任何用户查询。") + sys.exit(0) + + # 提取查询 + queries = [item['query'] for item in user_query_list] + + # 定义文件路径 + format_part = "C:\\Users\\Administrator\\Desktop\\bid_format.pdf" + # format_part="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹\\bid_format.docx" + # 检查文件是否存在 + if not os.path.isfile(format_part): + print(f"错误:文件未找到 - {format_part}") + sys.exit(1) + + # 上传文件并获取file_id + file_id = upload_file(format_part) + + if not file_id: + print("错误:文件上传失败。") + sys.exit(1) + + # 使用多线程并行处理查询 + results = multi_threading(queries, "", file_id,2) + + if not results: + print("错误:未收到任何处理结果。") + sys.exit(1) + + # 清理返回结果 + baseinfo_list = [process_string_list(res) for _, res in results] + + # 输出结果 + for i, info in enumerate(baseinfo_list): + print(f'{target_names[i]}:{info}') + +if __name__ == "__main__": + main() diff --git a/flask_app/old_version/回答来源_old.py b/flask_app/old_version/回答来源_old.py new file mode 100644 index 0000000..27bb7fe --- /dev/null +++ b/flask_app/old_version/回答来源_old.py @@ -0,0 +1,216 @@ +#基于多线程提问,现已废弃 +# assistant_id +import queue +import concurrent.futures +from dashscope import Assistants, Messages, Runs, Threads +from llama_index.indices.managed.dashscope import DashScopeCloudRetriever +from flask_app.general.json_utils import extract_content_from_json +prompt = """ +# 角色 +你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 + +## 技能 +### 技能 1:文档解析与摘要 +- 深入理解并分析${document1}的内容,提取关键信息。 +- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 + +### 技能 2:信息检索与关联 +- 在${document1}中高效检索特定信息或关键词。 +- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 + +## 限制 +- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。 +- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 +- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 + +请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。 +""" +prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}' + + + + +#正文和文档名之间的内容 +def extract_content_between_tags(text): + results = [] + + # 
使用“【正文】”来分割文本 + parts = text.split('【正文】')[1:] # 跳过第一个分割结果,因为它前面不会有内容 + + for index, part in enumerate(parts): + # 查找“【文档名】”标签的位置 + doc_name_index = part.find('【文档名】') + # 查找 'file_ids' 标签的位置 + file_ids_index = part.find("'file_ids'") + + # 根据是否找到“【文档名】”来决定提取内容的截止点 + if doc_name_index != -1: + end_index = doc_name_index + elif file_ids_index != -1: + end_index = file_ids_index + else: + end_index = len(part) + + # 提取内容 + content = part[:end_index].strip() + results.append(content) + + # 如果存在 file_ids,处理最后一部分特别提取 file_ids 前的内容 + if "'file_ids'" in parts[-1]: + file_ids_index = parts[-1].find("'file_ids'") + if file_ids_index != -1: + last_content = parts[-1][:file_ids_index].strip() + results[-1] = last_content # 更新最后一部分的内容,确保只到 file_ids + + return results + +def find_references_in_extracted(formatted_ans, extracted_references): + results = {} # 用来存储匹配结果的字典 + + # 递归函数,用于处理多层嵌套的字典 + def recurse_through_dict(current_dict, path=[]): + for key, value in current_dict.items(): + # 检查值是否还是字典,如果是,进行递归 + if isinstance(value, dict): + recurse_through_dict(value, path + [key]) + else: + # 特定值处理:如果值为'未知',直接设置索引为-1 + if value == '未知': + results['.'.join(path + [key])] = -1 + else: + # 进行匹配检查 + found = False + for index, reference in enumerate(extracted_references): + if str(value) in reference: # 转换为字符串,确保兼容性 + results['.'.join(path + [key])] = index # 使用点表示法记录路径 + found = True + break + if not found: + results['.'.join(path + [key])] = None + + # 从根字典开始递归 + recurse_through_dict(formatted_ans) + return results + +def send_message(assistant, message='百炼是什么?'): + ans = [] + print(f"Query: {message}") + + # create thread. + thread = Threads.create() + print(thread) + + # create a message. + message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + print(run) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + print(run_status) + reference_txt = str(run_status) + extracted_references = extract_content_between_tags(reference_txt) #引用的文章来源list + # get the thread messages. + msgs = Messages.list(thread.id) + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + return ans,extracted_references + +def rag_assistant(knowledge_name): + retriever = DashScopeCloudRetriever(knowledge_name) + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + temperature='0.3', + instructions=prom, + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + } + } + } + }] + ) + return assistant + + +def pure_assistant(): + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,能基于用户的要求精准简洁地回答用户的提问', + instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问', + tools=[ + { + "type": "code_interpreter" + }, + ] + ) + return assistant + +def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True): + if use_rag: + assistant = rag_assistant(knowledge_name) + else: + assistant = pure_assistant() + ans,extracted_references = send_message(assistant, message=question) + for index, reference in enumerate(extracted_references, start=0): + print(f"{index}. 
{reference}") + formatted_ans=extract_content_from_json(ans[1]) + print(formatted_ans) + + results = find_references_in_extracted(formatted_ans, extracted_references) + for key, index in results.items(): + print(f"{key}: Found at index {index}") + result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) + +def multi_threading(queries, knowledge_name, use_rag=True): + result_queue = queue.Queue() + + # 使用 ThreadPoolExecutor 管理线程 + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + # 使用字典保存每个提交的任务的Future对象,以便按顺序访问结果 + future_to_query = {executor.submit(llm_call, query, knowledge_name, result_queue, index, use_rag): index for + index, query in enumerate(queries)} + + # 收集每个线程的结果 + for future in concurrent.futures.as_completed(future_to_query): + index = future_to_query[future] + # 由于 llm_call 函数本身会处理结果,这里只需要确保任务执行完成 + try: + future.result() # 可以用来捕获异常或确认任务完成 + except Exception as exc: + print(f"Query {index} generated an exception: {exc}") + + # 从队列中获取所有结果并按索引排序 + results = [None] * len(queries) + while not result_queue.empty(): + index, result = result_queue.get() + results[index] = result + return results + +if __name__ == "__main__": + # 读取问题列表 + questions = ["该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] + knowledge_name = "招标解析5word" + results = multi_threading(questions, knowledge_name, use_rag=True) + # 打印结果 + for question, response in results: + print(f"Question: {question}") + print(f"Response: {response}") diff --git a/flask_app/old_version/基础信息整合_old.py b/flask_app/old_version/基础信息整合_old.py new file mode 100644 index 0000000..e8be244 --- /dev/null +++ b/flask_app/old_version/基础信息整合_old.py @@ -0,0 +1,166 @@ +import json + +from flask_app.general.json_utils import clean_json_string, rename_outer_key +from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge +from flask_app.general.多线程提问 import read_questions_from_file, multi_threading +from flask_app.general.通义千问long import upload_file +from flask_app.general.通用功能函数 import judge_consortium_bidding + +def aggregate_basic_info_engineering(baseinfo_list): + """ + 将基础信息列表中的数据进行合并和分类。 + + 参数: + - baseinfo_list (list): 包含多个基础信息的列表。 + + 返回: + - dict: 合并和分类后的基础信息字典。 + """ + key_groups = { + "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"], + "项目信息": ["项目名称", "招标编号", "项目概况", "招标范围", "招标控制价", "投标竞争下浮率"], + "关键时间/内容": [ + "投标文件递交截止日期", + "投标文件递交方式", + "开标时间", + "开标地点", + "投标人要求澄清招标文件的截止时间", + "投标有效期", + "评标结果公示媒介" + ], + "保证金相关": ["质量保证金", "退还投标保证金"], + "其他信息": [ + "重新招标、不再招标和终止招标", + "投标费用承担", + "招标代理服务费", + "是否退还投标文件", + ] + } + + combined_data = {} + relevant_keys_detected = set() + + # 合并所有基础信息并收集相关键 + for baseinfo in baseinfo_list: + combined_data.update(baseinfo) + relevant_keys_detected.update(baseinfo.keys()) + + # 动态调整键组 + dynamic_key_handling(key_groups, relevant_keys_detected) + + # 创建一个副本以存储未分类的项目 + unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]} + + # 按键组分类并嵌套 + for group_name, keys in key_groups.items(): + group_data = {key: combined_data.get(key, "未提供") for key in keys} + combined_data[group_name] = group_data + # 从 unclassified_items 中移除已分类的键 + for key in keys: + unclassified_items.pop(key, None) + + # 将剩余未分类的键值对添加到 "其他信息" 组 + combined_data["其他信息"].update(unclassified_items) + + # 移除顶层的未分类键值对 + for key in list(combined_data.keys()): + if key not in 
key_groups: + del combined_data[key] + + return combined_data + +def dynamic_key_handling(key_groups, detected_keys): + # 检查和调整键组配置 + for key in detected_keys: + # 处理“保证金相关”组,插到"质量保证金"前 + if "保证金" in key: + group = key_groups["保证金相关"] + insert_before = "质量保证金" + if insert_before in group: + index = group.index(insert_before) + if key not in group: # 避免重复插入 + group.insert(index, key) + else: + group.append(key) # 如果没有找到特定键,则追加到末尾 + elif "联合体" in key: + key_groups["项目信息"].append(key) + elif "分包" in key: + key_groups["项目信息"].append(key) + elif "踏勘现场" in key: + key_groups["其他信息"].append(key) + elif "投标预备会" in key: + key_groups["其他信息"].append(key) + elif "偏离" in key: + key_groups["其他信息"].append(key) + +def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path): + """ + 综合和处理基础信息,生成最终的基础信息字典。 + + 参数: + - knowledge_name (str): 知识名称。 + - truncate0 (str): 文件路径。 + - output_folder (str): 输出文件夹路径。 + - clause_path (str): 条款路径。 + + 返回: + - dict: 综合后的基础信息。 + """ + baseinfo_list = [] + baseinfo_file_path = 'flask_app/static/提示词/基本信息工程标.txt' + # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标.txt' + questions = read_questions_from_file(baseinfo_file_path) + res1 = multi_threading(questions, knowledge_name) + + for index, response in res1: + try: + if response and len(response) > 1: + baseinfo_list.append(clean_json_string(response[1])) + else: + print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.") + except Exception as e: + print(f"基础信息整合: Error processing response for query index {index}: {e}") + + # 判断是否分包、是否需要递交投标保证金等 + chosen_numbers, merged = judge_whether_main(truncate0, output_folder) + baseinfo_list.append(merged) + + judge_file_path = 'flask_app/static/提示词/是否相关问题.txt' + # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt' + judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) + judge_consortium = judge_consortium_bidding(baseinfo_list) # 通过招标公告判断是否接受联合体投标 + + if judge_consortium: + judge_consortium_question = ( + "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息," + "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\"" + ) + judge_questions.append(judge_consortium_question) + + file_id = upload_file(truncate0) + res2 = multi_threading(judge_questions, "", file_id, 2) # 调用千问-long + + if not res2: + print("基础信息整合: multi_threading error!") + else: + for question, response in res2: + baseinfo_list.append(clean_json_string(response)) + + rebidding_situation = extract_from_notice(clause_path, 3) # "重新招标, 不再招标和终止招标"需从投标人须知正文提取 + update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标") + baseinfo_list.append(update_json) + aggregated_baseinfo = aggregate_basic_info_engineering(baseinfo_list) # 现在是一个字典 + return {"基础信息": aggregated_baseinfo} + +if __name__ == "__main__": + knowledge_name = "ztb" + output_folder="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405" + truncate0="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf" + clause_path="C:\\Users\Administrator\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json" + res=combine_basic_info(knowledge_name,truncate0,output_folder,clause_path) + print(json.dumps(res,ensure_ascii=False,indent=4)) + + + + diff --git a/flask_app/old_version/多线程分类_old.py b/flask_app/old_version/多线程分类_old.py new file mode 100644 index 0000000..c50c288 --- /dev/null +++ b/flask_app/old_version/多线程分类_old.py @@ -0,0 +1,373 @@ +# -*- coding: utf-8 
-*- +import os +import shutil +from PyPDF2 import PdfReader, PdfWriter + + +def validate_pdf(file_path): + """ 验证PDF文件是否损坏 """ + try: + with open(file_path, "rb") as file: + pdf = PdfReader(file) + return len(pdf.pages) > 0 + except Exception as e: + print(f"Error reading PDF {file_path}: {str(e)}") + return False + + +def truncate_pdf(source_path, target_path, max_pages=15): + """截取PDF文件的前15页并保存""" + try: + with open(source_path, "rb") as infile: + reader = PdfReader(infile) + writer = PdfWriter() + for i in range(min(max_pages, len(reader.pages))): + writer.add_page(reader.pages[i]) + with open(target_path, "wb") as outfile: + writer.write(outfile) + except Exception as e: + print(f"Error processing PDF {source_path}: {str(e)}") + +def copy_file(src, dest): + """ 复制单个文件 """ + os.makedirs(os.path.dirname(dest), exist_ok=True) + shutil.copy2(src, dest) + +def copy_directory(source, destination): + """复制整个目录""" + os.makedirs(destination, exist_ok=True) + for item in os.listdir(source): + src_path = os.path.join(source, item) + dest_path = os.path.join(destination, item) + if os.path.isfile(src_path): + copy_file(src_path, dest_path) + else: + copy_directory(src_path, dest_path) + +def unique_file_name(base_path, file_name): + counter = 1 + name_part, extension = os.path.splitext(file_name) + new_file_name = file_name + # 检查文件是否存在,若存在则修改文件名 + while os.path.exists(os.path.join(base_path, new_file_name)): + new_file_name = f"{name_part}_{counter}{extension}" + counter += 1 + return new_file_name + +def process_pdf_folders(source_dir, target_dir): + """ 处理源目录中的PDF文件,并基于条件选择目标文件夹 """ + for root, dirs, files in os.walk(source_dir, topdown=False): + for file in files: + if file.lower().endswith('.pdf'): + source_file_path = os.path.join(root, file) + if validate_pdf(source_file_path): + relative_path = os.path.relpath(root, source_dir) + target_file_dir = os.path.join(target_dir, relative_path) + target_file_path = os.path.join(target_file_dir, file) + copy_file(source_file_path, target_file_path) + else: + print(f"Deleted corrupt file: {source_file_path}") + # 清除空目录 + if not os.listdir(root): + os.rmdir(root) + + +def classify_folders(source_dir, target_dir, project_index): + """Classifies folders and processes files based on specific criteria.""" + temp_dir = os.path.join(source_dir, 'temp') + os.makedirs(temp_dir, exist_ok=True) + + target_project_dir = None + processed_dirs = set() # Set to track processed directories + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path): + continue + + files = [f.lower() for f in os.listdir(subdir_path)] + if 'zb.pdf' in files: + target_project_dir, target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index) + processed_dirs.add(subdir_path) # Mark this directory as processed + elif subdir.lower() == "输出文件": + process_evaluation_files(subdir_path, target_project_dir) + processed_dirs.add(subdir_path) # Mark this directory as processed + + # Process remaining folders, skipping already processed ones + process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name) + #删除tmp目录 + # if os.path.exists(temp_dir): + # shutil.rmtree(temp_dir) + if os.path.exists(source_dir): + shutil.rmtree(source_dir) + +def process_tender_files(subdir_path, temp_dir, target_dir, project_index): + """Processes tender files and returns the target project directory and updated index.""" + zb_path = os.path.join(subdir_path, "zb.pdf") + 
truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf") + truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages + bot_response = file_parse(truncated_zb_path, project_index, 1) + zb_response = extract_answer(bot_response[1], 1) + zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠 + target_zb_name, category = zb_response[2] + ".pdf", zb_response[3] + + new_zb_path = os.path.join(subdir_path, target_zb_name) + os.rename(zb_path, new_zb_path) + + target_category_dir = os.path.join(target_dir, category) + target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1]) + copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹")) + + os.remove(truncated_zb_path) + shutil.rmtree(subdir_path) + return target_project_dir, zb_response[2] + + +def process_evaluation_files(subdir_path, target_project_dir): + """Processes evaluation folders.""" + copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹")) + shutil.rmtree(subdir_path) + + +def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name): + """Processes remaining folders containing bid files.""" + target_tb_dir = os.path.join(target_project_dir, "投标文件夹") + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path) or subdir_path in processed_dirs: + continue # Skip processed directories + + target_tbcom_dir = None # Initialize outside the file loop + + for item in os.listdir(subdir_path): + item_src_path = os.path.join(subdir_path, item) + new_name = "truncate_" + item + truncated_tb_path = os.path.join(temp_dir, new_name) + truncate_pdf(item_src_path, truncated_tb_path) + bot_response = file_parse(truncated_tb_path, project_index, 2) + tb_response = extract_answer(bot_response[1], 2) + if not tb_response: + continue # If there's no response, skip this file + + # Initialize target_tbcom_dir only once based on the first processed file + if not target_tbcom_dir: + target_tb_name, _ = tb_response + if(target_tb_name != target_zb_name and target_tb_name != "未知"): + target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name) + print(target_tbcom_dir) + os.makedirs(target_tbcom_dir, exist_ok=True) + + tb_section = tb_response[1] + ".pdf" + new_tb_path = os.path.join(subdir_path, tb_section) + # 获取唯一文件名 + new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section)) + os.rename(item_src_path, new_tb_path) + + # Remove temporary truncated file + os.remove(truncated_tb_path) + + # Copy the whole directory at once after all files have been processed and renamed + if target_tbcom_dir: + copy_directory(subdir_path, target_tbcom_dir) + shutil.rmtree(subdir_path) # Optionally remove the original directory after copying + +import re + +def extract_answer(input_string, type): + # 使用正则表达式匹配所需的信息 + # 第一种模式:项目名称、项目编号、招标人、类别 + # 在每个字段值后都添加了\s*以忽略尾随的空格 + pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + # 第二种模式:投标人、类别 + pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + + if type == 1: + match = re.search(pattern1, input_string) + if match: + print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}") + return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()] + else: + print("No match found for type 1.") + return [] + elif type 
== 2: + match = re.search(pattern2, input_string) + if match: + # 检查是否包含“投标函”,如果是则替换部分内容为“商务文件” + part = match.group(2).strip() + if "投标函" in part: + part = "商务文件" + return [match.group(1).strip(), part] + else: + print("No match found for type 2.") + return [] + + +from llama_index.readers.dashscope.base import DashScopeParse +from llama_index.readers.dashscope.utils import ResultType +from llama_index.indices.managed.dashscope import DashScopeCloudIndex +from dashscope import Assistants, Messages, Runs, Threads + + +def send_message(assistant, index,documents,message='百炼是什么?'): + print(f"Query: {message}") + + # create thread. + # create a thread. + thread = Threads.create() + + print(thread) + + # create a message. + message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + print("run:" + str(run)) + + # # get run statue + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + # print(run_status) + + # if prompt input tool result, submit tool result. + + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + # verify_status_code(run_status) + + # get the thread messages. + msgs = Messages.list(thread.id) + # print(msgs) + # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) + + ans = [] + + print("运行结果:") + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + print("content: ", message['content'][0]['text']['value']) + print("\n") + deleteFileFromKnowledge(index,documents) + return ans + + +def file_parse(filepath, knowledge_index, type): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + # 创建一个字典来映射index值到知识库名 + index_to_name = { + 0: "文件分类知识库0", + 1: "文件分类知识库1", + 2: "文件分类知识库2", + 3: "文件分类知识库3" + } + # 使用get方法获取对应的知识库名,如果index不存在,返回"默认文件分类知识库" + cloud_index_name = index_to_name.get(knowledge_index, "0") + + index = DashScopeCloudIndex(cloud_index_name) + index._insert(documents) + retriever = index.as_retriever() + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-plus', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + + } + } + } + }] + ) + questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ + 货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,服物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ + 请按下列键值对格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" + questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" + if (type == 1): + questions = questions1 + elif (type == 2): + questions = questions2 + return send_message(assistant,index,documents, message=questions) + +def deleteFileFromKnowledge(index,documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + for 
document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + index.delete_ref_doc(file_ids) + +import concurrent.futures +import logging + +# 配置日志,只输出到控制台 +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index): + logging.info(f"Starting to process directory: {subdir_path} with index {project_index}") + + # 为每个子目录创建一个专用的临时目录 + intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}") + os.makedirs(intermediate_directory, exist_ok=True) + logging.info(f"Created intermediate directory: {intermediate_directory}") + + # 这里可以添加具体的目录处理函数,例如: + process_pdf_folders(subdir_path, intermediate_directory) + classify_folders(intermediate_directory, final_directory, project_index % 4) + + # 处理完毕后清理该目录 + shutil.rmtree(intermediate_directory) + logging.info(f"Deleted intermediate directory: {intermediate_directory}") + +def main(base_directory, base_intermediate_directory, final_directory): + os.makedirs(final_directory, exist_ok=True) + logging.info(f"Final directory ensured at: {final_directory}") + + project_index = 0 + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for subdir in os.listdir(base_directory): + subdir_path = os.path.join(base_directory, subdir) + if os.path.isdir(subdir_path): + logging.info(f"Submitting job for directory: {subdir_path}") + future = executor.submit(process_directory, subdir_path, base_intermediate_directory, final_directory, project_index) + futures.append(future) + project_index += 1 + + for future in concurrent.futures.as_completed(futures): + try: + future.result() # 如果执行没有抛出异常,完成该任务 + except Exception as e: + logging.error(f"Thread resulted in an error: {e}") + +if __name__ == "__main__": + base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' + base_intermediate_directory = 'D:\\tmp' + final_directory = 'D:\\output' + main(base_directory, base_intermediate_directory, final_directory) diff --git a/flask_app/old_version/废标项_old.py b/flask_app/old_version/废标项_old.py new file mode 100644 index 0000000..77bec71 --- /dev/null +++ b/flask_app/old_version/废标项_old.py @@ -0,0 +1,66 @@ +import fitz # PyMuPDF +import re + +def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords): + """ + 提取PDF文档中包含特定关键词的段落,包括正文和表格。如果段落中包含特定的后续关键词,也提取其后的段落,直到遇到下一个相似的序列编号。 + :param pdf_path: PDF文件的路径。 + :param keywords: 包含关键词的列表。 + :param follow_up_keywords: 触发连续提取的关键词列表。 + :return: 包含关键词的段落列表。 + """ + doc = fitz.open(pdf_path) + extracted_paragraphs = [] + continue_collecting = False + current_section_pattern = None + + for page in doc: + text_blocks = page.get_text("blocks") + for index, block in enumerate(text_blocks): + text = block[4].strip() # Text content of the block + if text == "": # Skip empty lines + continue + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + else: + extracted_paragraphs.append(text) + + if any(keyword in text for keyword in keywords): + extracted_paragraphs.append(text) + if any(follow_up in text for follow_up in follow_up_keywords): + continue_collecting = True + section_number = re.match(r'(\d+(\.\d+)*)', text) + if section_number: + current_section_number = section_number.group(1) + base_section_number = 
current_section_number.rsplit('.', 1)[0] + current_section_pattern = re.compile(rf'^{re.escape(base_section_number)}\.\d+\b') + else: + found_next_number = False + for next_index in range(index + 1, len(text_blocks)): + next_text = text_blocks[next_index][4].strip() + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text) + if next_section_number: + found_next_number = True + current_section_pattern = re.compile(rf'^{next_section_number.group(1)}\b') + elif found_next_number: + break + doc.close() + return extracted_paragraphs + +# Example usage +doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx' +keywords = ['否决', '无效投标', '被拒绝', '予以拒绝'] +follow_up_keywords = ['情形之一'] +extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords) + +# Writing to file and handling duplicates +output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt" +with open(output_file, 'w', encoding='utf-8') as file: + for content in extracted_contents: + file.write(content + '\n') + +# file_id = upload_file(output_file) +# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。" +# res=qianwen_long(file_id,user_query) +# print("Query Result:", res) \ No newline at end of file diff --git a/flask_app/old_version/招标文件解析_old.py b/flask_app/old_version/招标文件解析_old.py new file mode 100644 index 0000000..7bcf816 --- /dev/null +++ b/flask_app/old_version/招标文件解析_old.py @@ -0,0 +1,305 @@ +# -*- encoding:utf-8 -*- +import json +import logging +import time +from concurrent.futures import ThreadPoolExecutor +from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.general.table_content_extraction import extract_tables_main +from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge +from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.general.json_utils import transform_json_values +from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid +from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +import concurrent.futures +from flask_app.old_version.基础信息整合_old import combine_basic_info +from flask_app.old_version.资格审查模块old_old import combine_review_standards +from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards +from flask_app.general.format_change import pdf2docx, docx2pdf +from flask_app.general.docx截取docx import copy_docx + +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + +logger=None + +# 创建全局线程池 +executor = ThreadPoolExecutor() +def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): + logger.info("starting 文件预处理...") + logger.info("output_folder..." + output_folder) + + # 根据文件类型处理文件路径 + if file_type == 1: # docx + docx_path = downloaded_file_path + pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + elif file_type == 2: # pdf + pdf_path = downloaded_file_path + docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 + else: + logger.error("Unsupported file type provided. 
Preprocessing halted.") + return None + + # # 异步上传知识库 + future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id) + + # 调用截取PDF多次 + truncate_files = truncate_pdf_multiple(pdf_path, output_folder) + + # 处理各个部分 + truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx + invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 + truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json + truncate0 = truncate_files[0] + truncate1 = truncate_files[1] + truncate3 = truncate_files[3] + merged_baseinfo_path=truncate_files[-1] + clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json + + logger.info("文件预处理done") + + # 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象 + return { + 'file_path': downloaded_file_path, + 'output_folder': output_folder, + 'truncate0': truncate0, + 'truncate1': truncate1, + 'truncate3': truncate3, + 'knowledge_future': future_knowledge, # 返回 Future 对象 + 'truncate0_jsonpath': truncate_jsonpath, + 'merged_baseinfo_path':merged_baseinfo_path, + 'clause_path': clause_path, + 'invalid_docpath': invalid_docpath + } + + +def post_processing(data,includes): + # 初始化结果字典,预设'其他'分类为空字典 + result = {"其他": {}} + + # 遍历原始字典的每一个键值对 + for key, value in data.items(): + if key in includes: + # 如果键在includes列表中,直接保留这个键值对 + result[key] = value + else: + # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 + result["其他"][key] = value + + # 如果'其他'分类没有任何内容,可以选择删除这个键 + if not result["其他"]: + del result["其他"] + + return result +# 基本信息 +def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表 + logger.info("starting基础信息...") + basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path) + logger.info("基础信息done") + return basic_res + + +# 形式、响应、资格评审 +def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder): + logger.info("starting资格审查...") + review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, + clause_path,input_file,output_folder) + logger.info("资格审查done") + return review_standards_res + + +# 评分细则 流式 +def fetch_evaluation_standards(truncate1): # 评标办法前附表 + logger.info("starting 商务标和技术标...") + + # 获取评标办法前附表的字典结果 + evaluation_standards_res = combine_evaluation_standards(truncate1) + + # 获取技术标和商务标 + technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} + commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} + + logger.info("商务标和技术标 done") + + # 返回将 "技术标" 和 "商务标" 包含在新的键中 + return { + "technical_standards": technical_standards, + "commercial_standards": commercial_standards + } + +# def fetch_evaluation_standards(truncate1): # 评标办法前附表 +# logger.info("starting商务标技术标...") +# evaluation_standards_res = combine_evaluation_standards(truncate1) +# logger.info("商务标技术标done") +# return evaluation_standards_res + + +# 无效、废标项解析 +def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): + # 废标项要求:千问 + logger.info("starting无效标与废标...") + find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) + logger.info("无效标与废标done...") + return find_invalid_res + + +# 投标文件要求 +def fetch_bidding_documents_requirements(clause_path): + logger.info("starting投标文件要求...") + fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + logger.info("投标文件要求done...") + return 
{"投标文件要求":fetch_bidding_documents_requirements_json} + +# 开评定标流程 +def fetch_bid_opening(clause_path): + logger.info("starting开评定标流程...") + fetch_bid_opening_json = extract_from_notice(clause_path, 2) + logger.info("开评定标流程done...") + return {"开评定标流程":fetch_bid_opening_json} + +# def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf +# global logger +# logger = get_global_logger(unique_id) +# # Preprocess files and get necessary data paths and knowledge index +# processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) +# if not processed_data: +# return "" +# +# with concurrent.futures.ThreadPoolExecutor() as executor: +# # Submit all tasks to the executor +# futures = { +# 'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'], +# processed_data['truncate0'], output_folder, +# processed_data['clause_path']), +# 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], +# processed_data['truncate3'], +# processed_data['knowledge_name'], processed_data['truncate0_jsonpath'], +# processed_data['clause_path'],processed_data['file_path'],processed_data['output_folder']), +# 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), +# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], +# output_folder, processed_data['truncate0_jsonpath'], +# processed_data['clause_path'], processed_data['truncate3']), +# 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, +# processed_data['clause_path']), +# 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) +# } +# +# comprehensive_responses = [] +# # Collect results in the defined order +# for key in ['base_info', 'qualification_review', 'evaluation_standards', 'invalid_requirements', +# 'bidding_documents_requirements', 'opening_bid']: +# try: +# # Wait for the future to complete and get the result +# result = futures[key].result() +# comprehensive_responses.append(result) +# except Exception as exc: +# logger.error(f"Error processing {key}: {exc}") +# # 合并 JSON 结果 +# combined_final_result = combine_json_results(comprehensive_responses) +# includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"] +# result = post_processing(combined_final_result, includes) +# modified_json = transform_json_values(result) +# +# final_result_path = os.path.join(output_folder, "final_result.json") +# with open(final_result_path, 'w', encoding='utf-8') as file: +# json.dump(modified_json, file, ensure_ascii=False, indent=2) +# logger.info("final_result.json has been saved") +# deleteKnowledge(processed_data['knowledge_index']) +# return final_result_path + + +#分段返回 +def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): + global logger + logger = get_global_logger(unique_id) + + # 预处理文件,获取处理后的数据 + processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) + if not processed_data: + yield json.dumps({}) # 如果处理数据失败,返回空的 JSON + + with concurrent.futures.ThreadPoolExecutor() as executor: + # 立即启动不依赖 knowledge_name 和 index 的任务 + futures = { + 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), + 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], + output_folder, 
processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['truncate3']), + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) + } + + # 提前处理这些不依赖的任务,按完成顺序返回 + for future in concurrent.futures.as_completed(futures.values()): + key = next(k for k, v in futures.items() if v == future) + try: + result = future.result() + + # 如果是 evaluation_standards,拆分技术标和商务标 + if key == 'evaluation_standards': + technical_standards = result["technical_standards"] + commercial_standards = result["commercial_standards"] + + # 分别返回技术标和商务标 + yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False) + yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False) + + else: + # 处理其他任务的结果 + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + + # 只有在需要 knowledge_name 和 index 时才等待 future_knowledge 完成 + try: + knowledge_name = "招标解析" + unique_id + index = processed_data['knowledge_future'].result() # 阻塞等待知识库上传任务完成 + + # 提交依赖 knowledge_name 和 index 的任务 + future_dependencies = { + 'base_info': executor.submit(fetch_project_basic_info, knowledge_name, processed_data['truncate0'], + output_folder, processed_data['clause_path']), + 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], + processed_data['truncate3'], knowledge_name, + processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['file_path'], + processed_data['output_folder']), + } + + # 按完成顺序返回依赖任务的结果 + for future in concurrent.futures.as_completed(future_dependencies.values()): + key = next(k for k, v in future_dependencies.items() if v == future) + try: + result = future.result() + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + + except Exception as e: + logger.error(f"Error uploading to knowledge base: {e}") + yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False) + + # 删除知识索引 + deleteKnowledge(index) + +if __name__ == "__main__": + output_folder = "flask_app/static/output/zytest1" + # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf") + # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf") + # clause_path = convert_clause_to_json(truncate1, output_folder) + # truncate0_jsonpath = os.path.join(output_folder, "truncate_output3.json") + + start_time = time.time() + file_type = 1 #1:docx 2:pdf 3:其他 + input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf" + # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11") + + preprocess_files(output_folder, input_file, file_type, "zytest1") + end_time = time.time() + elapsed_time = end_time - start_time # 计算耗时 + print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/old_version/文件分类普通版_old.py b/flask_app/old_version/文件分类普通版_old.py new file mode 100644 index 0000000..dc333ad --- /dev/null +++ b/flask_app/old_version/文件分类普通版_old.py @@ -0,0 +1,350 @@ +# -*- coding: 
utf-8 -*- +import os +import shutil +from PyPDF2 import PdfReader, PdfWriter + + +def validate_pdf(file_path): + """ 验证PDF文件是否损坏 """ + try: + with open(file_path, "rb") as file: + pdf = PdfReader(file) + return len(pdf.pages) > 0 + except Exception as e: + print(f"Error reading PDF {file_path}: {str(e)}") + return False + + +def truncate_pdf(source_path, target_path, max_pages=15): + """截取PDF文件的前15页并保存""" + try: + with open(source_path, "rb") as infile: + reader = PdfReader(infile) + writer = PdfWriter() + for i in range(min(max_pages, len(reader.pages))): + writer.add_page(reader.pages[i]) + with open(target_path, "wb") as outfile: + writer.write(outfile) + except Exception as e: + print(f"Error processing PDF {source_path}: {str(e)}") + +def copy_file(src, dest): + """ 复制单个文件 """ + os.makedirs(os.path.dirname(dest), exist_ok=True) + shutil.copy2(src, dest) + +def copy_directory(source, destination): + """复制整个目录""" + os.makedirs(destination, exist_ok=True) + for item in os.listdir(source): + src_path = os.path.join(source, item) + dest_path = os.path.join(destination, item) + if os.path.isfile(src_path): + copy_file(src_path, dest_path) + else: + copy_directory(src_path, dest_path) + +def unique_file_name(base_path, file_name): + counter = 1 + name_part, extension = os.path.splitext(file_name) + new_file_name = file_name + # 检查文件是否存在,若存在则修改文件名 + while os.path.exists(os.path.join(base_path, new_file_name)): + new_file_name = f"{name_part}_{counter}{extension}" + counter += 1 + return new_file_name + +def process_pdf_folders(source_dir, target_dir): + """ 处理源目录中的PDF文件,并基于条件选择目标文件夹 """ + for root, dirs, files in os.walk(source_dir, topdown=False): + for file in files: + if file.lower().endswith('.pdf'): + source_file_path = os.path.join(root, file) + if validate_pdf(source_file_path): + relative_path = os.path.relpath(root, source_dir) + target_file_dir = os.path.join(target_dir, relative_path) + target_file_path = os.path.join(target_file_dir, file) + copy_file(source_file_path, target_file_path) + else: + print(f"Deleted corrupt file: {source_file_path}") + # 清除空目录 + if not os.listdir(root): + os.rmdir(root) + + +def classify_folders(source_dir, target_dir, project_index): + """Classifies folders and processes files based on specific criteria.""" + temp_dir = os.path.join(source_dir, 'temp') + os.makedirs(temp_dir, exist_ok=True) + + target_project_dir = None + processed_dirs = set() # Set to track processed directories + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path): + continue + + files = [f.lower() for f in os.listdir(subdir_path)] + if 'zb.pdf' in files: + target_project_dir, project_index ,target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index) + processed_dirs.add(subdir_path) # Mark this directory as processed + elif subdir.lower() == "输出文件": + process_evaluation_files(subdir_path, target_project_dir) + processed_dirs.add(subdir_path) # Mark this directory as processed + + # Process remaining folders, skipping already processed ones + process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name) + #删除tmp目录 + # if os.path.exists(temp_dir): + # shutil.rmtree(temp_dir) + if os.path.exists(source_dir): + shutil.rmtree(source_dir) + +def process_tender_files(subdir_path, temp_dir, target_dir, project_index): + """Processes tender files and returns the target project directory and updated index.""" + zb_path = 
os.path.join(subdir_path, "zb.pdf") + truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf") + truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages + bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1) + project_index += 1 + zb_response = extract_answer(bot_response[1], 1) + zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠 + target_zb_name, category = zb_response[2] + ".pdf", zb_response[3] + + new_zb_path = os.path.join(subdir_path, target_zb_name) + os.rename(zb_path, new_zb_path) + + target_category_dir = os.path.join(target_dir, category) + target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1]) + copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹")) + + os.remove(truncated_zb_path) + shutil.rmtree(subdir_path) + return target_project_dir, project_index,zb_response[2] + + +def process_evaluation_files(subdir_path, target_project_dir): + """Processes evaluation folders.""" + copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹")) + shutil.rmtree(subdir_path) + + +def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name): + """Processes remaining folders containing bid files.""" + target_tb_dir = os.path.join(target_project_dir, "投标文件夹") + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path) or subdir_path in processed_dirs: + continue # Skip processed directories + + target_tbcom_dir = None # Initialize outside the file loop + + for item in os.listdir(subdir_path): + item_src_path = os.path.join(subdir_path, item) + new_name = "truncate_" + item + truncated_tb_path = os.path.join(temp_dir, new_name) + truncate_pdf(item_src_path, truncated_tb_path) + bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2) + project_index += 1 + tb_response = extract_answer(bot_response[1], 2) + if not tb_response: + continue # If there's no response, skip this file + + # Initialize target_tbcom_dir only once based on the first processed file + if not target_tbcom_dir: + target_tb_name, _ = tb_response + if(target_tb_name != target_zb_name and target_tb_name != "未知"): + target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name) + print(target_tbcom_dir) + os.makedirs(target_tbcom_dir, exist_ok=True) + + tb_section = tb_response[1] + ".pdf" + new_tb_path = os.path.join(subdir_path, tb_section) + # 获取唯一文件名 + new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section)) + os.rename(item_src_path, new_tb_path) + + # Remove temporary truncated file + os.remove(truncated_tb_path) + + # Copy the whole directory at once after all files have been processed and renamed + if target_tbcom_dir: + copy_directory(subdir_path, target_tbcom_dir) + shutil.rmtree(subdir_path) # Optionally remove the original directory after copying + + + +import re + + +import re + +def extract_answer(input_string, type): + # 使用正则表达式匹配所需的信息 + # 第一种模式:项目名称、项目编号、招标人、类别 + # 在每个字段值后都添加了\s*以忽略尾随的空格 + pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + # 第二种模式:投标人、类别 + pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + + if type == 1: + match = re.search(pattern1, input_string) + if match: + print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}") + return [match.group(1).strip(), 
match.group(2).strip(), match.group(3).strip(), match.group(4).strip()] + else: + print("No match found for type 1.") + return [] + elif type == 2: + match = re.search(pattern2, input_string) + if match: + # 检查是否包含“投标函”,如果是则替换部分内容为“商务文件” + part = match.group(2).strip() + if "投标函" in part: + part = "商务文件" + return [match.group(1).strip(), part] + else: + print("No match found for type 2.") + return [] + + +from llama_index.readers.dashscope.base import DashScopeParse +from llama_index.readers.dashscope.utils import ResultType +from llama_index.indices.managed.dashscope import DashScopeCloudIndex +from dashscope import Assistants, Messages, Runs, Threads + + +def send_message(assistant, index,documents,message='百炼是什么?'): + print(f"Query: {message}") + + # create thread. + # create a thread. + thread = Threads.create() + + print(thread) + + # create a message. + message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + print("run:" + str(run)) + + # # get run statue + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + # print(run_status) + + # if prompt input tool result, submit tool result. + + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + # verify_status_code(run_status) + + # get the thread messages. + msgs = Messages.list(thread.id) + # print(msgs) + # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) + + ans = [] + + print("运行结果:") + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + print("content: ", message['content'][0]['text']['value']) + print("\n") + deleteFileFromKnowledge(index,documents) + return ans + + +def file_parse(filepath, knowledge_index, type): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + index = DashScopeCloudIndex("文件分类临时知识库") + index._insert(documents) + retriever = index.as_retriever() + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + + } + } + } + }] + ) + questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ + 货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ + 请按下列格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" + questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" + if (type == 1): + questions = questions1 + elif (type == 2): + questions = questions2 + return send_message(assistant,index,documents, message=questions) + +def deleteFileFromKnowledge(index,documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + for document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = 
getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + index.delete_ref_doc(file_ids) + +def process_directory(base_directory, intermediate_directory, final_directory): + # 遍历base_directory下的所有子目录 + for subdir in os.listdir(base_directory): + subdir_path = os.path.join(base_directory, subdir) + # 确保是目录 + if os.path.isdir(subdir_path): + # 处理每个项目的目录 + process_pdf_folders(subdir_path, intermediate_directory) + classify_folders(intermediate_directory, final_directory, 0) + + # 清理临时目录以备下一个项目使用 + if os.path.exists(intermediate_directory): + shutil.rmtree(intermediate_directory) + +def main(base_directory, intermediate_directory, final_directory): + # 确保中间目录和最终目录存在 + os.makedirs(intermediate_directory, exist_ok=True) + os.makedirs(final_directory, exist_ok=True) + process_directory(base_directory, intermediate_directory, final_directory) + + +if __name__ == "__main__": + base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' + intermediate_directory = 'D:\\tmp' + final_directory = 'D:\\output' + main(base_directory, intermediate_directory, final_directory) #处理多级目录 + # process_pdf_folders(base_directory, intermediate_directory) ##处理单级目录 + # classify_folders(intermediate_directory, final_directory, 0) diff --git a/flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py b/flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py new file mode 100644 index 0000000..fa48cdd --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# This file is auto-generated, don't edit it. Thanks. +import os +from alibabacloud_bailian20231229.client import Client as bailian20231229Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_bailian20231229 import models as bailian_20231229_models +from alibabacloud_tea_util import models as util_models +from alibabacloud_tea_util.client import Client as UtilClient + +def create_client() -> bailian20231229Client: + """ + 使用AK&SK初始化账号Client + @return: Client + @throws Exception + """ + config = open_api_models.Config( + access_key_id=os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID'], + access_key_secret=os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET'] + ) + config.endpoint = 'bailian.cn-beijing.aliyuncs.com' + return bailian20231229Client(config) + +def delete_index(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: + delete_index_request = bailian_20231229_models.DeleteIndexRequest( + index_id=index_id + ) + runtime = util_models.RuntimeOptions() + headers = {} + try: + response = client.delete_index_with_options(workspace_id, delete_index_request, headers, runtime) + print("routes Response:", response) + except Exception as error: + print(error.message) + print(error.data.get("Recommend")) + UtilClient.assert_as_string(error.message) + +async def delete_index_async(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: + delete_index_request = bailian_20231229_models.DeleteIndexRequest( + index_id=index_id + ) + runtime = util_models.RuntimeOptions() + headers = {} + try: + response = await client.delete_index_with_options_async(workspace_id, delete_index_request, headers, runtime) + print("routes Response:", response) + except Exception as error: + print(error.message) + print(error.data.get("Recommend")) + UtilClient.assert_as_string(error.message) + +if __name__ == '__main__': + workspace_id = os.environ['DASHSCOPE_WORKSPACE_ID'] + index_id = 'pg5rrsv26x' + client = create_client() + 
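+    # 同步删除如下;若在异步环境中,也可改用上面定义的 delete_index_async,
+    # 一个最小的调用示意(假设入口支持 asyncio,参数沿用本例,仅作演示):
+    # import asyncio
+    # asyncio.run(delete_index_async(client, workspace_id, index_id))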
delete_index(client, workspace_id, index_id)
diff --git a/flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py b/flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py
new file mode 100644
index 0000000..f0c1b2d
--- /dev/null
+++ b/flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py
@@ -0,0 +1,58 @@
+import os
+import uuid
+from llama_index.readers.dashscope.base import DashScopeParse
+from llama_index.readers.dashscope.utils import ResultType
+from llama_index.indices.managed.dashscope import DashScopeCloudIndex
+from flask_app.old_version.文档理解大模型版知识库处理.删除知识库_old import delete_index, create_client
+
+
+def addfileToKnowledge(filepath, knowledge_name):
+    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
+    documents = parse.load_data(file_path=filepath)
+    index = DashScopeCloudIndex.from_documents(
+        documents,
+        knowledge_name,
+        verbose=True,
+    )
+    print("knowledge created successfully!!!")
+    # index = DashScopeCloudIndex(knowledge_name)
+    # index._insert(documents)
+    # return index, documents
+    return index
+
+def deleteKnowledge(index):
+    retriever = index.as_retriever()
+    index_id = str(retriever.pipeline_id)
+    workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID')
+    client = create_client()
+    delete_index(client, workspace_id, index_id)
+    print("knowledge deleted successfully!!!")
+
+
+def deleteFileFromKnowledge(index, documents):
+    # 初始化一个列表来存储所有文档的 ID
+    file_ids = []
+    # 检查documents是否为列表且不为空
+    if isinstance(documents, list) and documents:
+        # 遍历每个文档
+        for document in documents:
+            # 使用属性访问方式获取每个文档的 id_,getattr 防止属性不存在时抛出异常
+            file_id = getattr(document, 'id_', None)
+            if file_id:
+                file_ids.append(file_id)  # 将 id 添加到列表中
+    print("deleted successfully")
+    index.delete_ref_doc(file_ids)
+
+
+# 示例用法
+if __name__ == "__main__":
+    filepath = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf"
+    unique_id = str(uuid.uuid4())
+    knowledge_name = "招标解析" + unique_id
+    # index = addfileToKnowledge(filepath, knowledge_name)
+    index = DashScopeCloudIndex("招标解析e8cc45f4-cd41-47cf-a5e6-2b7885debfff")
+    # 删除文件
+    # deleteFileFromKnowledge(index, documents)
+    deleteKnowledge(index)
diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py
new file mode 100644
index 0000000..6a5547e
--- /dev/null
+++ b/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py
@@ -0,0 +1,33 @@
+from typing import List
+from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
+from alibabacloud_tea_openapi import models as open_api_models
+from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
+from alibabacloud_tea_util.client import Client as UtilClient
+from alibabacloud_credentials.client import Client as CredClient
+
+def query():
+    # 使用默认凭证初始化Credentials Client。
+    cred = CredClient()
+    config = open_api_models.Config(
+        # 通过credentials获取配置中的AccessKey ID
+        cred.get_credential().access_key_id,
+        # 通过credentials获取配置中的AccessKey Secret
+        cred.get_credential().access_key_secret,
+    )
+    # 访问的域名
+    config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com'
+    client = docmind_api20220711Client(config)
+    request = docmind_api20220711_models.QueryDocParserStatusRequest(
+        # id : 任务提交接口返回的id
+        id='docmind-20241008-24363df30d274863894f037dbb7244e8'
+    )
+    try:
+        # 复制代码运行请自行打印 routes 的返回值
+        response = client.query_doc_parser_status(request)
+        # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。获取属性值均以小写开头
+        # 获取返回结果。建议先把response.body.data转成json,然后再从json里面取具体需要的值。
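+        # 文档解析是异步任务,status 一般会从处理中过渡到成功或失败;
+        # 实际使用可参考下面的轮询示意(纯示例:间隔秒数与状态取值均为假设,请以官方文档为准):
+        # import time
+        # while response.body.data.status == 'processing':
+        #     time.sleep(5)
+        #     response = client.query_doc_parser_status(request)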
print(response.body.data.status) + except Exception as error: + # 如有需要,请打印 error + UtilClient.assert_as_string(error.message) +query() \ No newline at end of file diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py new file mode 100644 index 0000000..2088e74 --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py @@ -0,0 +1,40 @@ +from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models +from alibabacloud_tea_util.client import Client as UtilClient +from alibabacloud_tea_util import models as util_models +from alibabacloud_credentials.client import Client as CredClient + +def submit_file(): + # 使用默认凭证初始化Credentials Client。 + cred=CredClient() + config = open_api_models.Config( + # 通过credentials获取配置中的AccessKey ID + # access_key_id=cred.get_access_key_id(), + cred.get_credential().access_key_id, + # 通过credentials获取配置中的AccessKey Secret + # access_key_secret=cred.get_access_key_secret() + cred.get_credential().access_key_secret, + ) + # 访问的域名 + config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' + client = docmind_api20220711Client(config) + request = docmind_api20220711_models.SubmitDocParserJobAdvanceRequest( + # file_url_object : 本地文件流 + file_url_object=open("zbtest4.pdf", "rb"), + # file_name :文件名称。名称必须包含文件类型 + file_name='zbtest4.pdf' + # file_name_extension : 文件后缀格式。与文件名二选一 + # file_name_extension='pdf' + ) + runtime = util_models.RuntimeOptions() + try: + # 复制代码运行请自行打印 routes 的返回值 + response = client.submit_doc_parser_job_advance(request, runtime) + # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。如下示例为打印返回的业务id格式 + # 获取属性值均以小写开头, + print(response.body.data.id) + except Exception as error: + # 如有需要,请打印 error + UtilClient.assert_as_string(error.message) +submit_file() \ No newline at end of file diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py new file mode 100644 index 0000000..21b85f8 --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py @@ -0,0 +1,35 @@ +from typing import List +from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models +from alibabacloud_tea_util.client import Client as UtilClient +from alibabacloud_credentials.client import Client as CredClient + +def query(): + # 使用默认凭证初始化Credentials Client。 + cred=CredClient() + config = open_api_models.Config( + # 通过credentials获取配置中的AccessKey ID + cred.get_credential().access_key_id, + # 通过credentials获取配置中的AccessKey Secret + cred.get_credential().access_key_secret, + ) + # 访问的域名 + config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' + client = docmind_api20220711Client(config) + request = docmind_api20220711_models.GetDocParserResultRequest( + # id : 任务提交接口返回的id + id='docmind-20241008-24363df30d274863894f037dbb7244e8', + layout_step_size=10, + layout_num=0 + ) + try: + # 复制代码运行请自行打印 routes 的返回值 + response = client.get_doc_parser_result(request) + # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。获取属性值均以小写开头 + # 获取返回结果。建议先把response.body.data转成json,然后再从json里面取具体需要的值。 + print(response.body.data) + except Exception as error: + # 如有需要,请打印 error + UtilClient.assert_as_string(error.message) +query() 
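+# 补充示意:layout_num 与 layout_step_size 用于分页拉取解析结果;
+# 如需继续取后续内容,可按步长推进起始位置后再次调用(字段语义为假设,仅作示例):
+# request.layout_num += request.layout_step_size
+# response = client.get_doc_parser_result(request)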
\ No newline at end of file diff --git a/flask_app/old_version/无效标和废标和禁止投标整合_old.py b/flask_app/old_version/无效标和废标和禁止投标整合_old.py new file mode 100644 index 0000000..67bf587 --- /dev/null +++ b/flask_app/old_version/无效标和废标和禁止投标整合_old.py @@ -0,0 +1,440 @@ +# -*- coding: utf-8 -*- +import json +import os.path +import time +import re + +from flask_app.general.format_change import pdf2docx +from flask_app.general.通义千问long import upload_file, qianwen_long +from concurrent.futures import ThreadPoolExecutor + +from flask_app.general.table_content_extraction import extract_tables_main +from flask_app.old_version.不得存在及禁止投标情形_old import find_forbidden, process_string_list + + +#处理跨页的段落 +def preprocess_paragraphs(paragraphs): + processed = [] # 初始化处理后的段落列表 + index = 0 + while index < len(paragraphs): + current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 + + # 检查当前段落是否为空 + if current_text == '': + # 确保有前一个和后一个段落 + if 0 < index < len(paragraphs) - 1: + prev_text = paragraphs[index - 1].text.strip() # 获取前一个段落的文本 + next_text = paragraphs[index + 1].text.strip() # 获取后一个段落的文本 + + # 检查前一个段落的文本是否不以标点符号结尾 + if not prev_text.endswith((',', ',', '。', '!', '?')): + # 定义列表项的模式 + list_item_pattern = r'^\s*([($]\d+[)$]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)' + # 检查前一个段落是否以列表模式开头,且后一个段落不以列表模式开头 + # is_prev_list = re.match(list_item_pattern, prev_text) + is_next_list = re.match(list_item_pattern, next_text) + if not is_next_list and len(prev_text) > 30: + # 合并前一个和后一个段落的文本 + merged_text = prev_text + next_text # 为了可读性添加空格 + if processed: + processed[-1] = merged_text # 更新处理后的最后一个段落 + else: + processed.append(merged_text) # 如果列表为空,直接添加合并后的文本 + + # 跳过下一个段落,因为它已经被合并 + index += 2 + continue + else: + # 非空段落,添加到处理后的列表中 + processed.append(current_text) + index += 1 + + return processed + +#如果当前段落有序号,则向下匹配直接遇到相同的序号样式 +#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 + +def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): + from collections import OrderedDict + from docx import Document + import re + + if isinstance(keywords, str): + keywords = [keywords] + + doc = Document(doc_path) + extracted_paragraphs = OrderedDict() + continue_collecting = False + current_section_pattern = None + active_key = None + + def match_keywords(text, patterns): + return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) + + def extract_from_text(text, current_index): + nonlocal continue_collecting, current_section_pattern, active_key + if text == "": + return current_index + + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + active_key = None + else: + if active_key is not None: + extracted_paragraphs[active_key].append(text) + return current_index + + if match_keywords(text, keywords): + active_key = text + extracted_paragraphs[active_key] = [text] + if match_keywords(text, follow_up_keywords): + continue_collecting = True + section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text) # 修改后的正则,支持 '数字 、' 和 '数字.' 
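+                # The branch below derives stop conditions from the current heading
+                # number: a same-level "shape" pattern, the next sibling section and
+                # the next parent section; collecting stops once any of them matches
+                # a later paragraph. For a hypothetical current_section_number of
+                # '3.4.5' the three patterns would be:
+                #   shape pattern -> r'^\d+\s*[..]\s*\d+\s*[..]\s*\d+'
+                #   next sibling  -> r'^3\s*\.\s*4\s*\.\s*6'
+                #   next parent   -> r'^3\s*\.\s*5'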
+ if section_number: + current_section_number = section_number.group(1) + level_count = current_section_number.count('.') + + # Pattern to match current level, e.g., 3.4.5 + pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+' + # Generate patterns for next section at same level and parent level + parts = current_section_number.split('.') + matched_patterns = [pattern] # start with the full pattern + + # Next section at same level + parts[-1] = str(int(parts[-1]) + 1) + next_pattern = r'^' + r'\s*\.\s*'.join(parts) + matched_patterns.append(next_pattern) + + # Parent section (if applicable) + if len(parts) > 1: + parent_section_parts = parts[:-1] + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) + parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) + matched_patterns.append(parent_pattern) + + # Combine the patterns + combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' + current_section_pattern = re.compile(combined_pattern) + + else: + found_next_number = False + current_section_pattern = None + + while current_index < len(doc.paragraphs) - 1: + current_index += 1 + next_text = doc.paragraphs[current_index].text.strip() + if not found_next_number: + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) + if next_section_number: + found_next_number = True + if next_section_number.group(1): + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + elif next_section_number.group(2): + dynamic_pattern = r'^[\(\(]\d+[\)\)]' + current_section_pattern = re.compile(dynamic_pattern) + if current_section_pattern and re.match(current_section_pattern, next_text): + extracted_paragraphs[active_key].append(next_text) + else: + continue_collecting = False + active_key=None + break + + return current_index + + processed_paragraphs = preprocess_paragraphs(doc.paragraphs) + index = 0 + while index < len(processed_paragraphs): + index = extract_from_text(processed_paragraphs[index].strip(), index) + index += 1 + + return extracted_paragraphs + +def preprocess_text_list(text_list): + new_text_list = [] + # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 + split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') + for text in text_list: + # 使用正则表达式检查并拆分元素 + parts = split_pattern.split(text) + new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 + + return new_text_list + +def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 + all_texts1 = [] + all_texts2=[] + # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 + split_pattern = r'(?<=[。!?\!\?])' + + for key, text_list in extracted_contents.items(): + if len(text_list) == 1: + for data in text_list: + # 检查是否包含任何需要排除的字符串 + if any(exclude in data for exclude in excludes): + continue # 如果包含任何排除字符串,跳过这个数据 + # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + data = re.sub(pattern, '', data).strip() + keyword_match = re.search(keywords, data) + if keyword_match: + # 从关键词位置开始查找结束标点符号 + start_pos = keyword_match.start() + # 截取从关键词开始到后面的内容 + substring = data[start_pos:] + # 按定义的结束标点分割 + sentences = re.split(split_pattern, substring, 1) + if len(sentences) > 0 and sentences[0]: + # 只取第一句,保留标点 + cleaned_text = data[:start_pos] + sentences[0] #eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。 + # 
经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。 + else: + cleaned_text = data # 如果没有标点,使用整个字符串 + else: + # 如果没有找到关键词,保留原文本 + cleaned_text = data + # 删除空格 + cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '') + # 如果长度大于8,则添加到结果列表 + if len(cleaned_text_no_spaces) > 8: + all_texts1.append(cleaned_text_no_spaces) + + else: + # print(text_list) + new_text_list=preprocess_text_list(text_list) + # print(new_text_list) + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 + data = re.sub(pattern, '', new_text_list[0]).replace(' ','').strip() + # 将修改后的第一个元素和剩余的元素连接起来 + new_text_list[0] = data # 更新列表中的第一个元素 + joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 + # 删除空格 + joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '') + all_texts2.append(joined_text_no_spaces) # 将每个列表的内容添加到 all_texts 中 + + return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 +def find_sentences_with_keywords(data, keywords, follow_up_keywords): + """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" + sentences1 = [] # 保存没有后续关键词的情况 + sentences2 = [] # 保存有后续关键词的情况 + + if isinstance(data, dict): + for value in data.values(): + result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, list): + for item in data: + result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, str): + # 分割句子,保证句子完整性(按标点符号和序号分割) + # split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割 + split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data) + i = 0 + while i < len(split_sentences): + sentence = split_sentences[i].strip() + if re.search(keywords, sentence, re.IGNORECASE): + follow_up_present = any( + re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) + if follow_up_present: + # 如果存在后续关键词,则从当前位置开始截取 + start_index = i + end_index = start_index + found_next_section = False + for j in range(start_index + 1, len(split_sentences)): + if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): + end_index = j + found_next_section = True + break + + if found_next_section: + full_text = ' '.join(split_sentences[start_index:end_index]).strip() + else: + full_text = ' '.join(split_sentences[start_index:]).strip() + # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data=re.sub(pattern,'',full_text) + data = re.sub(pattern, '', full_text).replace(' ','').strip() + sentences2.append(data) # 存储有后续关键词的情况 + i = end_index if found_next_section else len(split_sentences) + else: + # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data = re.sub(pattern, '', sentence).replace('\n','').strip() + cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ','').strip() + if len(cleaned_sentence) > 8: + sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况 + i += 1 + else: + i += 1 + + return sentences1, sentences2 # 返回两个列表 + +def extract_sentences_from_json(json_path, keywords,follow_up_keywords): + if not json_path: + return [],[] + with open(json_path, 'r', 
encoding='utf-8') as file: + data = json.load(file) + """从JSON数据中提取包含关键词的句子。""" + return find_sentences_with_keywords(data, keywords,follow_up_keywords) + +#处理无效投标 +def extract_values_if_contains(data, includes): + """ + 递归检查字典中的值是否包含列表 'includes' 中的内容。 + 如果包含,将这些值添加到一个列表中并返回。 + + 参数: + data (dict): 字典或从 JSON 解析得到的数据。 + includes (list): 包含要检查的关键词的列表。 + + 返回: + list: 包含满足条件的值的列表。 + """ + included_values = [] # 初始化结果列表 + + # 定义递归函数来处理嵌套字典 + def recursive_search(current_data): + if isinstance(current_data, dict): + for key, value in current_data.items(): + if isinstance(value, dict): + # 如果值是字典,递归搜索 + recursive_search(value) + elif isinstance(value, str): + # 如果值是字符串,检查是否包含任何 includes 中的关键词 + if any(include in value for include in includes): + included_values.append(value) + elif isinstance(current_data, list): + for item in current_data: + # 如果是列表,递归每个元素 + recursive_search(item) + + # 开始递归搜索 + recursive_search(data) + + return included_values + + +#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。 +#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 +#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", + +#TODO:truncate_json_path为空的时候,单独提取表格数据 +def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): + try: + excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] + follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] + extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 提取正文(除表格) + # print(extracted_contents) + all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 + all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) # 提取表格数据(json_data) + qianwen_txt = all_texts1 + all_tables1 + selected_contents = set() # 使用 set 去重 + + if qianwen_txt: + with open(output_file, 'w', encoding='utf-8') as file: + counter = 1 + for content in qianwen_txt: + file.write("..............." + '\n') + file.write(f"{counter}. 
{content}\n") + counter += 1 + + file_id = upload_file(output_file) + # qianwen_ans = qianwen_long(file_id, user_query) + qianwen_ans = qianwen_long(file_id, user_query) + num_list = process_string_list(qianwen_ans) + print(result_key + "选中的序号:" + str(num_list)) + + for index in num_list: + if 1 <= index <= len(qianwen_txt): + content = qianwen_txt[index - 1] + selected_contents.add(content) + + # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 + selected_contents.update(all_texts2) + selected_contents.update(all_tables2) + + # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 + if selected_contents: + res = {result_key: list(selected_contents)} + else: + res = {result_key: ""} + + return res + except Exception as e: + print(f"handle_query 在处理 {result_key} 时发生异常: {e}") + return {result_key: ""} + +def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification): + tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx + truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir) # 投标人须知前附表docx->json + queries = [ + ( + r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp1.txt"), + "否决和无效投标情形" + ), + ( + r'废\s*标', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp2.txt"), + "废标项" + ) + ] + results = [] + # 使用线程池来并行处理查询 + with ThreadPoolExecutor() as executor: + futures = [] + for keywords, user_query, output_file, result_key in queries: + future = executor.submit( + handle_query, + invalid_docpath, + user_query, + output_file, + result_key, + keywords, + truncate_jsonpath + ) + futures.append((future, result_key)) + time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 + # 按照提交的顺序收集结果 + for future, result_key in futures: + try: + result = future.result() + except Exception as e: + print(f"线程处理 {result_key} 时出错: {e}") + result = {result_key: ""} + results.append(result) + + # 禁止投标(find_forbidden)部分 + try: + # print("starting不得存在的情形...") + forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification) + except Exception as e: + print(f"find_forbidden 处理时出错: {e}") + forbidden_res = {'不得存在的其他情形': ""} + results.append(forbidden_res) + + combined_dict = {} + for d in results: + combined_dict.update(d) + # print("无效标与废标done...") + return {"无效标与废标项": combined_dict} + +if __name__ == '__main__': + start_time = time.time() + tobidders_notice_table="" + clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" + # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' + invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' + results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification) + end_time = time.time() + print("Elapsed time:", str(end_time - start_time)) + print("Results:", 
json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/old_version/解析old_old.py b/flask_app/old_version/解析old_old.py new file mode 100644 index 0000000..c977194 --- /dev/null +++ b/flask_app/old_version/解析old_old.py @@ -0,0 +1,221 @@ +# -*- encoding:utf-8 -*- +import json +import logging +import time +from concurrent.futures import ThreadPoolExecutor +from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.general.table_content_extraction import extract_tables_main +from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.general.json_utils import transform_json_values +from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid +from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +import concurrent.futures +from flask_app.main.基础信息整合快速版 import combine_basic_info +from flask_app.main.资格审查模块 import combine_review_standards +from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards +from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx +from flask_app.general.docx截取docx import copy_docx + +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + +logger=None + +# 创建全局线程池 +executor = ThreadPoolExecutor() +def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): + logger.info("starting 文件预处理...") + logger.info("output_folder..." + output_folder) + + # 根据文件类型处理文件路径 + if file_type == 1: # docx + docx_path = downloaded_file_path + pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + elif file_type == 2: # pdf + pdf_path = downloaded_file_path + docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 + elif file_type == 3: #doc + pdf_path=docx2pdf(downloaded_file_path) + docx_path=doc2docx(downloaded_file_path) + else: + logger.error("Unsupported file type provided. 
Preprocessing halted.") + return None + + # 调用截取PDF多次 + truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id) + + # 处理各个部分 + truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx + + invalid_path=truncate_files[5] + invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 + # invalid_docpath=pdf2docx(invalid_path) + truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json + truncate0 = truncate_files[0] + truncate1 = truncate_files[1] + truncate2=truncate_files[2] + truncate3 = truncate_files[3] + merged_baseinfo_path=truncate_files[-1] + clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json + + logger.info("文件预处理done") + + # 返回包含预处理后文件路径的字典 + return { + 'file_path': downloaded_file_path, + 'output_folder': output_folder, + 'invalid_path':invalid_path, + 'truncate0': truncate0, + 'truncate1': truncate1, + 'truncate2':truncate2, + 'truncate3': truncate3, + 'truncate0_jsonpath': truncate_jsonpath, + 'merged_baseinfo_path':merged_baseinfo_path, + 'clause_path': clause_path, + 'invalid_docpath': invalid_docpath + } + + +def post_processing(data,includes): + # 初始化结果字典,预设'其他'分类为空字典 + result = {"其他": {}} + + # 遍历原始字典的每一个键值对 + for key, value in data.items(): + if key in includes: + # 如果键在includes列表中,直接保留这个键值对 + result[key] = value + else: + # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 + result["其他"][key] = value + + # 如果'其他'分类没有任何内容,可以选择删除这个键 + if not result["其他"]: + del result["其他"] + + return result +# 基本信息 +def fetch_project_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path): # 投标人须知前附表 + logger.info("starting基础信息...") + basic_res = combine_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path) + logger.info("基础信息done") + return basic_res + + +# 形式、响应、资格评审 +def fetch_qualification_review(truncate1, truncate3,output_folder, truncate0_jsonpath, clause_path,invalid_path,merged_baseinfo_path): + logger.info("starting资格审查...") + review_standards_res = combine_review_standards(truncate1, truncate3,output_folder, truncate0_jsonpath, + clause_path,invalid_path,merged_baseinfo_path) + logger.info("资格审查done") + return review_standards_res + + +# 评分细则 流式 +def fetch_evaluation_standards(truncate1): # 评标办法前附表 + logger.info("starting 商务标和技术标...") + + # 获取评标办法前附表的字典结果 + evaluation_standards_res = combine_evaluation_standards(truncate1) + + # 获取技术标和商务标 + technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} + commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} + + logger.info("商务标和技术标 done") + + # 返回将 "技术标" 和 "商务标" 包含在新的键中 + return { + "technical_standards": technical_standards, + "commercial_standards": commercial_standards + } + + +# 无效、废标项解析 +def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): + # 废标项要求:千问 + logger.info("starting无效标与废标...") + find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) + logger.info("无效标与废标done...") + return find_invalid_res + + +# 投标文件要求 +def fetch_bidding_documents_requirements(clause_path): + logger.info("starting投标文件要求...") + fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + logger.info("投标文件要求done...") + return {"投标文件要求":fetch_bidding_documents_requirements_json} + +# 开评定标流程 +def fetch_bid_opening(clause_path): + logger.info("starting开评定标流程...") + fetch_bid_opening_json = extract_from_notice(clause_path, 2) + logger.info("开评定标流程done...") + return 
{"开评定标流程":fetch_bid_opening_json}
+
+#分段返回
+def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id):
+    global logger
+    logger = get_global_logger(unique_id)
+    # 预处理文件,获取处理后的数据
+    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
+    if not processed_data:
+        yield json.dumps({}); return  # 如果处理数据失败,返回空的 JSON 并结束生成器,避免后续访问 processed_data 报错
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # 立即启动不依赖 knowledge_name 和 index 的任务
+        futures = {
+            'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'],
+                                         processed_data['truncate0'],
+                                         processed_data['truncate2'], processed_data['clause_path']),
+            'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
+                                                    processed_data['truncate3'], output_folder,
+                                                    processed_data['truncate0_jsonpath'],
+                                                    processed_data['clause_path'], processed_data['invalid_path'],
+                                                    processed_data['merged_baseinfo_path']),
+            'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
+            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
+                                                    output_folder, processed_data['truncate0_jsonpath'],
+                                                    processed_data['clause_path'], processed_data['truncate3']),
+            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']),
+            'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
+        }
+
+        # 提前处理这些不依赖的任务,按完成顺序返回
+        for future in concurrent.futures.as_completed(futures.values()):
+            key = next(k for k, v in futures.items() if v == future)
+            try:
+                result = future.result()
+
+                # 如果是 evaluation_standards,拆分技术标和商务标
+                if key == 'evaluation_standards':
+                    technical_standards = result["technical_standards"]
+                    commercial_standards = result["commercial_standards"]
+
+                    # 分别返回技术标和商务标
+                    yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False)
+                    yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False)
+
+                else:
+                    # 处理其他任务的结果
+                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
+
+            except Exception as exc:
+                logger.error(f"Error processing {key}: {exc}")
+                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
+
+if __name__ == "__main__":
+    start_time = time.time()
+    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
+    file_type = 2  #1:docx 2:pdf 3:其他
+    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
+    print("yes")
+    for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
+        print(output)
+    end_time = time.time()
+    elapsed_time = end_time - start_time  # 计算耗时
+    print(f"Function execution took {elapsed_time} seconds.")
diff --git a/flask_app/old_version/资格审查模块old_old.py b/flask_app/old_version/资格审查模块old_old.py
new file mode 100644
index 0000000..6f93927
--- /dev/null
+++ b/flask_app/old_version/资格审查模块old_old.py
@@ -0,0 +1,42 @@
+import os
+
+from flask_app.main.提取json工程标版 import convert_clause_to_json
+from flask_app.general.json_utils import extract_content_from_json
+from flask_app.old_version.形式响应评审old import process_reviews
+from flask_app.old_version.资格评审old_old import process_qualification
+from flask_app.general.通义千问long import upload_file, qianwen_long
+from concurrent.futures import ThreadPoolExecutor
+
+
+def 
combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表 + # 形式评审、响应评审:千问 + file_id=upload_file(truncate1) #评标办法前附表 + user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" + results = qianwen_long(file_id, user_query_1) + original_dict_data = extract_content_from_json(results) + qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容 + with ThreadPoolExecutor() as executor: + # 创建Future对象 + future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name) + future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath, + clause_path,input_file,output_folder) + + # 等待执行结果 + final_qualify_json = future_qualification.result() + form_response_dict = future_form_response.result() + form_response_dict.update(final_qualify_json) + # return nest_json_under_key(form_response_dict,"资格审查") + return {"资格审查":form_response_dict} + +if __name__ == "__main__": + input_file="D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf" + output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21" + # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf") + truncate2=os.path.join(output_folder,"ztbfile_tobidders_notice.pdf") + knowledge_name="zbtest20" + truncate1=os.path.join(output_folder,"ztbfile_evaluation_method.pdf") + truncate3=os.path.join(output_folder,"ztbfile_qualification.pdf") + clause_path = convert_clause_to_json(truncate2, output_folder) + truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json") + res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder) + print(res) \ No newline at end of file diff --git a/flask_app/old_version/资格评审old_old.py b/flask_app/old_version/资格评审old_old.py new file mode 100644 index 0000000..58a3c84 --- /dev/null +++ b/flask_app/old_version/资格评审old_old.py @@ -0,0 +1,182 @@ +# -*- encoding:utf-8 -*- +# 资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典 +import json +import re +from flask_app.general.json_utils import clean_json_string, combine_json_results, add_keys_to_json +from flask_app.general.多线程提问 import multi_threading, read_questions_from_file +from flask_app.general.通义千问long import upload_file + + +def merge_dictionaries_under_common_key(dicts, common_key): + # 初始化一个空字典来保存合并的结果 + merged_dict = {common_key: {}} + + # 遍历列表中的每个字典 + for d in dicts: + if common_key in d: + # 使用字典解包来合并字典 + merged_dict[common_key].update(d[common_key]) + else: + print(f"资格评审: Warning: Dictionary does not contain the key {common_key}") + + return merged_dict + + +def generate_qual_question(matching_keys_list): # 这里假设资质、信誉与人员要求 要不都有、要不都没 + if not matching_keys_list: + return [] + else: + questions = [] + # 将列表转换为单引号包裹的格式,并用逗号和空格分隔 + formatted_keys = ["'{}'".format(key) for key in matching_keys_list] + # 将格式化后的关键词列表连接成字符串 + keys_string = "、".join(formatted_keys) + # 构造完整的问题语句 + question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string}," + "请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。") + question2 = "该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则无需返回该键值对。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。" + 
questions.append(question1) + questions.append(question2) + return questions + + +def extract_matching_keys_qual(dict_data): + # 定义包含模式的列表 + include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"), + re.compile(r"符合")] + # 初始化列表,用于存储匹配的键 + matching_keys = [] + non_matching_keys = {} + # 定义排除项 + excludes = ['联合体', '禁止投标', '不存在', '不得存在', '资格', '管理机构', '负责人', '人员'] # 联合体、禁止投标的情况、人员需要额外问 + # 遍历字典中的每个键值对 + for key, value in dict_data.items(): + if "附件" in key and "资质" in key: + return [], [] # 如果同时出现,立即返回空列表 + # 检查值是否符合任何一个包含模式 + if any(pattern.search(value) for pattern in include_patterns): + # 如果值符合包含模式,再检查键是否包含任何排除项 + if not any(ex in key for ex in excludes): + # 如果键不包含排除项,则添加到匹配键列表中 + matching_keys.append(key) + else: + # value中有实质的内容,不需要额外问。 + non_matching_keys[key] = value + + return matching_keys, non_matching_keys # matching:['资质条件', '财务状况'] non_matching_keys:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} + + +# 获取联合体投标的要求,由于它不一定在资格审查表中,故调用rag +def get_consortium_dict(knowledge_name): + qualify_list = [] + consortium_questions = [ + "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)',嵌套键名为你对该要求的总结,而键值需要完全与原文保持一致,不要擅自总结、删减。"] + results1 = multi_threading(consortium_questions, knowledge_name) + for _, response in results1: # _占位,代表ques;response[0]也是ques;response[1]是ans + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素 + qualify_list.append(response[1]) + else: + print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"资格评审: Error processing response for query index {_}: {e}") + consortium_dict = combine_json_results(qualify_list) + return consortium_dict + + +def get_all_dict(knowledge_name): + qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt' + questions = read_questions_from_file(qualification_review_file_path) + qualification_list = [] + res1 = multi_threading(questions, knowledge_name) + for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素 + qualification_list.append(response[1]) + else: + print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"资格评审: Error processing response for query index {_}: {e}") + qualification_combined_res = combine_json_results(qualification_list) + return {'资格评审': qualification_combined_res} + + +def process_qualification(qualification_review, truncate3, knowledge_name): + # 资格评审 + matching_keys_list, non_matching_dict = extract_matching_keys_qual( + qualification_review) # matching_keys_list:['资质条件', '财务状况'] non_matching_dict:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} + if not matching_keys_list: + if not non_matching_dict: # 古法提取 + if truncate3 != "": # 提取到资格审查附件的情况 + print("资格评审: type1") + matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"] + ques = generate_qual_question(matching_keys_list) + file_id2 = upload_file(truncate3) + results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long + res_list = [] + if not results2: + print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") + else: + # 打印结果 + for question, response in results2: + res_list.append(clean_json_string(response)) # 都是问资格评审表得出的 + if res_list: + merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') + consortium_dict = get_consortium_dict(knowledge_name) + updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) + return updated_qualify_json + else: + print("资格评审: 
无法获取大模型结果,返回空值") + return {"资格评审": ""} + else: + print("资格评审: type2") + return get_all_dict(knowledge_name) or {"资格评审": ""} + + else: # 此时要求全部写在评分办法前附表中,不需要额外提取。 + print("资格评审: type3") + new_non_matching_json = {'资格评审': non_matching_dict} + substring = '联合体' + found_key = any(substring in key for key in non_matching_dict.keys()) # 没有联合体投标,则需生成,防止重复 + if not found_key: + consortium_dict = get_consortium_dict(knowledge_name) + final_qualify_json = add_keys_to_json(new_non_matching_json, consortium_dict) + return final_qualify_json + else: + return new_non_matching_json or {"资格评审": ""} + + elif matching_keys_list and truncate3 == "": # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' + print("资格评审: type4") + final_qualification = get_all_dict(knowledge_name) + final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict) + return final_qualify_json or {"资格评审": ""} + else: # 大多数情况 + print("资格评审: type5") + user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’ + file_id2 = upload_file(truncate3) + results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long + res_list = [] + if not results2: + print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") + return {"资格评审": ""} + else: + # 打印结果 + for question, response in results2: + cleaned_res = clean_json_string(response) + res_list.append(cleaned_res) # 都是问资格评审表得出的 + merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') + consortium_dict = get_consortium_dict(knowledge_name) + updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典 + final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict) + return final_qualify_json or {"资格评审": ""} + + +if __name__ == "__main__": + # qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年(2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年(2017 年至今)须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'} + qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证', + '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'} + truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" + knowledge_name = "招标解析word13" + res = process_qualification(qualification_review, truncate3, knowledge_name) + print(json.dumps(res, ensure_ascii=False, indent=4)) + +# 该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。 diff --git a/flask_app/old_version/资格评审前判断_old.py b/flask_app/old_version/资格评审前判断_old.py new file mode 100644 index 0000000..eb3dc87 --- /dev/null +++ b/flask_app/old_version/资格评审前判断_old.py @@ -0,0 +1,29 @@ +from flask_app.general.读取文件.按页读取pdf import extract_text_by_page + +def check_strings_in_pdf(file_path): + judge_list=['施工机械设备', '企业信息登记'] + # Read text from PDF + text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF + full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text + + # Initialize the questions list + ques_list = [] + + # Check for each string in the judge_list and construct questions accordingly + if judge_list[0] in full_text: + ques_list.append(f"该招标文件对于'{judge_list[0]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[0]}',若存在未知信息,在对应的键值中填'未知'。") + if len(judge_list) > 1 and judge_list[1] in full_text: + 
ques_list.append(f"该招标文件对于'{judge_list[1]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[1]}',若存在未知信息,在对应的键值中填'未知'。")
+
+    if not ques_list:
+        return None
+    return ques_list
+
+# Test cases or example usage
+if __name__ == '__main__':
+    file_path = 'C:/Users/Administrator/Desktop/zbtest18_39-45.pdf'  # Replace with your actual PDF file path
+    # judge_list 已在 check_strings_in_pdf 内部定义,这里无需重复传入
+    questions = check_strings_in_pdf(file_path)
+    for question in (questions or []):  # 未匹配到任何关键词时 check_strings_in_pdf 返回 None
+        print(question)
diff --git a/flask_app/old_version/通义千问_old.py b/flask_app/old_version/通义千问_old.py
new file mode 100644
index 0000000..4a71a37
--- /dev/null
+++ b/flask_app/old_version/通义千问_old.py
@@ -0,0 +1,48 @@
+import json
+import random
+from http import HTTPStatus
+from dashscope import Generation
+
+def call_with_messages(messages):
+    response = Generation.call(model="qwen-max",
+                               messages=messages,
+                               seed=random.randint(1, 10000),
+                               temperature=0.5,
+                               top_p=0.5,
+                               top_k=50,
+                               result_format='message')
+    if response.status_code == HTTPStatus.OK:
+        content = response.output['choices'][0]['message']['content']
+        return content
+    else:
+        raise Exception(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}')
+
+def prepare_question_from_json(json_path, user_query):
+    with open(json_path, 'r', encoding='utf-8') as file:
+        json_data = json.load(file)
+    question = json.dumps(json_data, ensure_ascii=False) + user_query
+    return question
+
+#专用于判断是否
+def qianwen_ask(json_path, user_query):
+    messages = []
+    question = prepare_question_from_json(json_path, user_query)
+    messages.append({'role': 'system', 'content': 'You are a helpful assistant.'})
+    messages.append({'role': 'user', 'content': question})
+    return call_with_messages(messages)
+
+#通用问题
+def qianwen_ask2(questions):
+    messages = []
+    messages.append({'role': 'system', 'content': 'You are a helpful assistant.'})
+    messages.append({'role': 'user', 'content': questions})
+    return call_with_messages(messages)
+
+if __name__ == '__main__':
+    json_path = 'judge_exist.json'
+    prompt = "请你依据以上信息回答,是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 
是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否需要递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在未知或者矛盾信息,请回答'未知'。" + try: + content = qianwen_ask(json_path, prompt) + print(content) + except Exception as e: + print(f"An error occurred: {e}") From 00a4218e06fc4d1de0a67e07cf3b68a8d8d4ef0d Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 28 Nov 2024 17:27:44 +0800 Subject: [PATCH 5/8] =?UTF-8?q?11.28=20=E6=8A=80=E6=9C=AF=E5=8F=82?= =?UTF-8?q?=E6=95=B0=E6=8F=90=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/file2markdown.py | 2 +- flask_app/main/商务评分技术评分整合.py | 15 +- flask_app/货物标/技术参数要求提取.py | 242 ++++++++++++------------- flask_app/货物标/提取采购需求main.py | 3 +- flask_app/货物标/评分标准提取main.py | 9 +- 5 files changed, 133 insertions(+), 138 deletions(-) diff --git a/flask_app/general/file2markdown.py b/flask_app/general/file2markdown.py index 271fc6c..ba858d8 100644 --- a/flask_app/general/file2markdown.py +++ b/flask_app/general/file2markdown.py @@ -72,6 +72,6 @@ def convert_pdf_to_markdown(file_path): if __name__ == "__main__": - file_path=r"C:\Users\Administrator\Desktop\货物标\output3\ztbfile_qualification1.pdf" + file_path=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf" res=convert_pdf_to_markdown(file_path) print(res) \ No newline at end of file diff --git a/flask_app/main/商务评分技术评分整合.py b/flask_app/main/商务评分技术评分整合.py index 1a112ae..6a785d3 100644 --- a/flask_app/main/商务评分技术评分整合.py +++ b/flask_app/main/商务评分技术评分整合.py @@ -113,9 +113,16 @@ def combine_evaluation_standards(evaluation_method): # user_query_2 = ( # "根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容(因素)不是这3个,则返回文档中给定的评分内容(因素)以及它的评分要求。请以json格式返回结果,不要回答有关形式、资格、响应性评审标准的内容") user_query_2 = ( - """ -根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,可能存在嵌套关系,但最内层键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。 -以下为需要考虑的注意事项:1.不要回答有关资格审查的内容,也不要从评标办法正文中提取回答 2.若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如"技术评分":"本项目无技术评分项" 3. 如果该招标活动有多个包,则最外层键名为对应的包名,否则不需要 4.你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体。以下为示例输出,仅供格式参考: + """根据该文档中的评标办法表格,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,可能存在嵌套关系,但最内层键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。 + +要求与指南: +1. 请首先定位评分细则的表格,不要回答有关资格审查的内容,也不要从评标办法正文中提取回答 +2. 若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如"技术评分":"本项目无技术评分项" +3. 如果该招标活动有多个包,则最外层键名为对应的包名,否则最外层键名为各大评分项 +4. 你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体 +5. 
'评分'的键值不能是一个范围数字,如'0-5分',应该是一个具体数字,如'5分',或者是一个定性的指标如'合格制' + +以下为示例输出,仅供格式参考: { "一包": { "技术评分": { @@ -181,7 +188,7 @@ def combine_evaluation_standards(evaluation_method): return update_json #商务标技术标整合 if __name__ == "__main__": # evaluation_method="C:\\Users\\Administrator\\Desktop\\招标文件\\招标01_evaluation_method.pdf" - evaluation_method= r"D:\flask_project\flask_app\static\output\output1\9a4e3bc3-8367-4529-87db-af0ea53dc348\ztbfile_evaluation_method.pdf" + evaluation_method= r"C:\Users\Administrator\Desktop\招标文件\招标04_evaluation_method.pdf" evaluation_standards_res=combine_evaluation_standards(evaluation_method) # 从结果中提取"商务标"和"技术标" technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index b247f07..2f8bad5 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -290,151 +290,131 @@ def combine_and_update_results(original_data, updates): return original_data -# user_query1 = """ -# 请你首先定位该采购文件中的采购清单或采购需求部分,请告诉我需要采购的货物,如果有采购清单,请直接根据清单上的货物(或系统)名称给出结果,注意不要返回'说明'或'规格'或'技术参数'列中的内容;若没有采购清单,你要从表格中或文中摘取需要采购的系统和货物,采购需求中可能包含层次关系,例如采购的某系统中可能包含几种货物,那么你需要用嵌套键值对表示这种关系,且不要遗漏该系统中包含的货物,你的输出请以json格式返回,最外层键名为'采购需求',嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位。以下为需要考虑的特殊情况:如果采购清单中同一层级(或同一系统)下存在同名货物且它们的采购要求有所不同,请你以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,例如若采购清单中有两种型号的'交换机',那么你应返回两个键名,'交换机-1'和'交换机-2';如有未知内容,在对应键值处填'未知'。以下为考虑了特殊情况的示例输出: -# { -# "采购需求": { -# "交换机-1":{}, -# "交换机-2":{}, -# "门禁管理系统": {}, -# "交通监控视频子系统": { -# "高清视频抓拍像机":{}, -# "补光灯":{} -# }, -# "LED全彩显示屏": {} -# } -# } -# """ - #文件内容以markdown格式组织,其中表格部分(若有)以html语法组织, def get_technical_requirements(file_path,invalid_path,processed_filepath): # docx_file_path=pdf2docx(file_path) - file_id=upload_file(file_path) #docx + file_id=upload_file(file_path) #目前传入的为docx文档 first_query_template="该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'" #防止截取失败 judge_res=qianwen_long(file_id,first_query_template) prompt_template1 = ''' - 任务:解析采购文件,提取采购需求,并以JSON格式返回。 +任务:解析采购文件,提取采购需求,并以JSON格式返回。 - 要求与指南: - 1. 精准定位:请运用文档理解能力,找到文件中的采购需求部分,若有采购清单,请直接根据采购清单上的货物(或系统)名称给出结果。 - 2. 采购目标:采购目标通常有硬件(如设备、货物)和软件(如系统软件、应用APP),一次采购活动可能同时包含这两种类型。对于工程类的施工、建设采购需求,无需提取。 - 3. 非清单形式处理:若未出现采购清单,则从表格或文字中摘取采购信息。 - 4. 系统归属:一些采购活动可能将采购目标划分为若干系统和货物,每个系统可能包含若干货物,则将这些货物名称作为该系统的二级键;系统可以只包含总体'系统功能'而无货物。 - 5. 软件需求:对于软件应用或系统软件需求,仅需列出系统模块构成(若有),并作为系统键值的一部分,无需在模块下再细分功能。 - 6. 系统功能:若采购的某系统提及总体系统功能,则在系统值中添加'系统功能'二级键,不展开具体内容。 - 7. 完整性:确保不遗漏系统内的货物,也不添加未提及的内容。 +要求与指南: +1. 精准定位:请运用文档理解能力,找到文件中的采购需求部分,若有采购清单,请直接根据采购清单上的货物(或系统)名称给出结果;若未出现采购清单,则从表格或文字中摘取采购信息 +2. 采购目标:采购种类通常有硬件(如设备、货物)和软件(如系统软件、应用APP),一次采购活动可以同时包含这两种类型。 +3. 系统归属:一些采购活动可能将采购目标划分为若干系统和货物,每个系统可能包含若干货物,则将这些货物名称作为该系统的二级键;系统可以只包含总体'系统功能'而无货物。 +4. 软件需求:对于软件应用或系统软件需求,仅需列出系统模块构成(若有),并作为系统键值的一部分,无需在模块下再细分功能。 +5. 系统功能:若采购的某系统提及总体系统功能,则在系统值中添加'系统功能'二级键,不展开具体内容。 +6. 
完整性:确保不遗漏系统内的货物,也不添加未提及的内容,若采购清单之外有额外的货物采购要求,且该货物暂未提取至JSON回答中,请将这些货物名称也包含进来。 - 输出格式: - 1.JSON格式,最外层键名为'采购需求'。 - 2.层次关系用嵌套键值对表示。 - 3.嵌套键名为系统或货物或模块名称,与原文保持一致。 - 4.最内层键值应为空列表[]。 - 5.不包含'说明'、'规格'、'技术参数'等列内容,仅返回采购的货物或系统或模块名称。 +输出格式: +1.JSON格式,最外层键名为'采购需求'。 +2.层次关系用嵌套键值对表示。 +3.嵌套键名为系统或货物或模块名称,与原文保持一致。 +4.最内层键值应为空列表[]。 +5.不包含'说明'、'规格'、'技术参数'等列内容,仅返回采购的货物或系统或模块名称。 - 特殊情况处理: - 同一层级(如同一系统中)下同名但采购要求不同的货物,以'货物名-编号'区分,编号从1递增。例如若同层级下存在两种型号的交换机,那么命名分别是'交换机-1'和'交换机-2',以规避重复键名;否则无需在名称后添加编号。 +特殊情况处理: +同一层级(如同一系统中)下同名但采购要求不同的货物,以'货物名-编号'区分,编号从1递增。例如若同层级下存在两种型号的交换机,那么命名分别是'交换机-1'和'交换机-2',以规避重复键名;否则无需在名称后添加编号。 - {{ - "采购需求": {{ - "交换机-1": [], - "交换机-2": [], - "门禁管理系统": {{ - "系统功能":[] - }}, - "交通监控视频子系统": {{ - "系统功能": [], - "高清视频抓拍像机": [], - "补光灯": [] - }}, - "LED全彩显示屏": [] - // 其他系统和货物 - }} - }} - 示例输出2,系统软件采购: - {{ - "采购需求": {{ - "信息管理系统": {{ - "通用模块":[], - "用户管理":[] - }}, - "信息检索系统": {{ - "系统功能":[], - "权限管理模块":[] - }}, - "XX小程序":[], - "数据分析中心":[] - }} - }} +{{ + "采购需求": {{ + "交换机-1": [], + "交换机-2": [], + "门禁管理系统": {{ + "系统功能":[] + }}, + "交通监控视频子系统": {{ + "系统功能": [], + "高清视频抓拍像机": [], + "补光灯": [] + }}, + "LED全彩显示屏": [] + // 其他系统和货物 + }} +}} +示例输出2,系统软件采购: +{{ + "采购需求": {{ + "信息管理系统": {{ + "通用模块":[], + "用户管理":[] + }}, + "信息检索系统": {{ + "系统功能":[], + "权限管理模块":[] + }}, + "XX小程序":[], + "数据分析中心":[] + }} +}} - 注意事项: - 1.严格按照上述要求执行,确保输出准确性和规范性。 - 2.如有任何疑问或不确定内容,请保留原文描述,必要时使用'未知'标注。 - ''' +注意事项: +1.严格按照上述要求执行,确保输出准确性和规范性。 +2.如有任何疑问或不确定内容,请保留原文描述,必要时使用'未知'标注。 +''' prompt_template2 = ''' - 任务:解析采购文件,提取采购需求,并以JSON格式返回。 +任务:你负责解析采购文件,提取采购需求,并以JSON格式返回,不要遗漏该项目需要采购的货物(或系统)。 - 要求与指南: - 1. 精准定位:请运用文档理解能力,找到文件中的采购需求部分,若有采购清单,请直接根据采购清单上的货物(或系统)名称给出结果。 - 2. 采购目标:采购目标通常有硬件(如设备、货物)和软件(如系统软件、应用APP),一次采购活动可能同时包含这两种类型。对于工程类的施工、建设采购需求,无需提取。 - 3. 非清单形式处理:若未出现采购清单,则从表格或文字中摘取采购信息。 - 4. 系统归属:一些采购活动可能将采购目标划分为若干系统和货物,每个系统可能包含若干货物,则将这些货物名称作为该系统的二级键;系统可以只包含总体'系统功能'而无货物。 - 5. 软件需求:对于软件应用或系统软件采购,若有多个系统且序号分明,请不要遗漏,最多仅需列出系统模块构成(若有),并作为该系统键值的一部分,无需在模块下再细分功能。 - 6. 系统功能:若采购的某系统提及总体系统功能,则在系统值中添加'系统功能'二级键,不展开具体内容。 - 7. 完整性:确保不遗漏系统内的货物,也不添加未提及的内容。 +要求与指南: +1. 精准定位:请运用文档理解能力,定位文件中的采购需求部分。若有采购清单,请直接根据采购清单上的货物(或系统)名称给出结果,注意你无需提取诸如'说明'、'规格'、'技术参数'、'描述'等列的内容,即你不需要给出详细的采购要求,仅返回采购的货物或系统或模块名称;若没有采购清单,则从表格或文本中摘取采购信息。 +2. 采购目标:采购种类通常有硬件(如设备、货物)和软件(如系统软件、应用APP),一次采购活动可以同时包含这两种类型。 +3. 系统归属:一些采购活动可能将采购目标划分为若干系统和货物,每个系统可能包含若干货物,则将这些货物名称作为该系统的二级键,注意这种包含关系是通过表格结构或文档标题层次来得出的;系统可以只包含总体'系统功能'而无货物。 +5. 软件需求:对于软件应用或系统软件采购,若有多个系统且序号分明,请不要遗漏,最多仅需列出系统模块构成(若有),并作为该系统键值的一部分,无需在模块下再细分功能。 +5. 系统功能:若采购的某系统提及总体系统功能,则在系统值中添加'系统功能'二级键,不展开具体内容。 +6. 
完整性:确保不遗漏系统内的货物,也不添加未提及的内容。若'采购清单'中未提取的货物(或系统)名称在形如'主要设备功能指标'的标题下有详细参数指标要求,请将该货物名也添加至返回中。 - 输出格式: - 1.JSON格式,最外层键名为'采购需求'。 - 2.层次关系用嵌套键值对表示。 - 3.嵌套键名为系统或货物或模块名称,与原文保持一致。 - 4.最内层键值应为空列表[]。 - 5.不包含'说明'、'规格'、'技术参数'等列内容,仅返回采购的货物或系统或模块名称。 +输出格式: +1.JSON格式,最外层键名为'采购需求'。 +2.层次关系用嵌套键值对表示。 +3.嵌套键名为系统或货物或模块名称,与原文保持一致。 +4.最内层键值应为空列表[]。 - 特殊情况处理: - 若同一层级(如同一系统中)下存在同名但采购要求不同的货物,以'货物名-编号'区分,编号从1递增,例如若同层级下存在两种型号的交换机,那么命名分别是'交换机-1'和'交换机-2',以规避重复键名;否则无需在名称后添加编号。 +特殊情况处理: +若同一层级(如同一系统中)下存在同名但采购要求不同的货物,请以'货物名-编号'区分,编号从1递增,例如若同层级下存在两种型号的交换机,那么命名分别是'交换机-1'和'交换机-2',以规避重复键名;否则无需在名称后添加编号。 - 示例输出1,普通系统、货物类采购: - {{ - "采购需求": {{ - "交换机-1": [], - "交换机-2": [], - "门禁管理系统": {{ - "系统功能":[] - }}, - "交通监控视频子系统": {{ - "系统功能": [], - "交换机":[], - "高清视频抓拍像机": [], - "补光灯": [] - }}, - "LED全彩显示屏": [] - // 其他系统和货物 - }} - }} - 示例输出2,系统软件采购: - {{ - "采购需求": {{ - "信息管理系统": {{ - "通用模块":[], - "用户管理":[] - }}, - "信息检索系统": {{ - "系统功能":[], - "权限管理模块":[] - }}, - "XX管理系统":[], - "XX管理系统":[] - }} - }} +示例输出1,普通系统、货物类采购: +{{ + "采购需求": {{ + "交换机-1": [], + "交换机-2": [], + "门禁管理系统": {{ + "系统功能":[] + }}, + "交通监控视频子系统": {{ + "系统功能": [], + "交换机":[], + "高清视频抓拍像机": [], + "补光灯": [] + }}, + "LED全彩显示屏": [] + // 其他系统和货物 + }} +}} +示例输出2,软件系统类采购: +{{ + "采购需求": {{ + "信息管理系统": {{ + "通用模块":[], + "用户管理":[] + }}, + "信息检索系统": {{ + "系统功能":[], + "权限管理模块":[] + }}, + "XX管理系统":[], + //其他系统 + }} +}} - 文件内容:{full_text} +文件内容:{full_text} - 注意事项: - 1.严格按照上述要求执行,确保输出准确性和规范性。 - 2.如有任何疑问或不确定内容,请保留原文描述,必要时使用'未知'标注。 - ''' +注意事项: +1.严格按照上述要求执行,确保输出准确性和规范性。 +''' if '否' in judge_res: + print("no!调用invalid_path") file_id=upload_file(invalid_path) - print("调用invalid_path") model_res=qianwen_long(file_id,prompt_template1) print(model_res) else: @@ -444,7 +424,6 @@ def get_technical_requirements(file_path,invalid_path,processed_filepath): model_res=doubao_model(user_query) # model_res = qianwen_long(file_id,prompt_template1) print(model_res) - cleaned_res = clean_json_string(model_res) #转字典 processed_data=truncate_system_keys(cleaned_res['采购需求']) key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data) # 提取需要采购的货物清单 key_list:交通监控视频子系统.高清视频抓拍像机 ... 
grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' ,输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'} @@ -571,16 +550,17 @@ def test_all_files_in_folder(input_folder, output_folder): if __name__ == "__main__": start_time=time.time() # truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx" - truncate_file=r"C:\Users\Administrator\Desktop\fsdownload\5950ad84-30c8-4643-b6de-b13ef5be7a5c\ztbfile.pdf" + truncate_docfile=r"C:\Users\Administrator\Desktop\货物标\output1\招标文件(107国道)_procurement.docx" + truncate_file=r'C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf' # invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf" # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx" # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp" # file_id = upload_file(truncate_file) - invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a110ed59-00e8-47ec-873a-bd4579a6e628\\ztbfile.pdf" + invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile.pdf" # file_id=upload_file(truncate_file) - # processed_filepath = pdf2txt(truncate_file) - processed_filepath=r"C:\Users\Administrator\Desktop\fsdownload\5950ad84-30c8-4643-b6de-b13ef5be7a5c\tmp\extract1.txt" - res=get_technical_requirements(truncate_file,invalid_path,processed_filepath) + processed_filepath = pdf2txt(truncate_file) + # processed_filepath=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\金水河沿线排涝泵站提档升级项目.txt" + res=get_technical_requirements(truncate_docfile,invalid_path,processed_filepath) json_string = json.dumps(res, ensure_ascii=False, indent=4) print(json_string) # # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1" diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index baa0f33..e8f5bf2 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -64,7 +64,8 @@ def fetch_procurement_reqs(procurement_path, invalid_path): # TODO:技术要求可以在技术参数之后执行,把完整的技术参数输入,问大模型,除了上述内容还有哪些,这样的话把技术标和其他的区分开。 -# TODO: 094有问题 +# TODO:0362 + if __name__ == "__main__": start_time = time.time() output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output" diff --git a/flask_app/货物标/评分标准提取main.py b/flask_app/货物标/评分标准提取main.py index c422e8d..b6aead4 100644 --- a/flask_app/货物标/评分标准提取main.py +++ b/flask_app/货物标/评分标准提取main.py @@ -225,7 +225,14 @@ def combine_evaluation_standards(truncate_file): user_query = ( """ 根据该文档中的评分标准表格中的内容,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,最外层键名分别是'技术评分','商务评分','投标报价评分',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。 -以下为需要考虑的注意事项:1.不要回答有关资格审查、符合性审查的内容,也不要从评标办法正文中(表格外)提取回答 2.若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如"技术评分":"本项目无技术评分项" 3. 如果该招标活动有多个包,则最外层键名为对应的包名,否则不需要 4.你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体。以下为示例输出,仅供格式参考: +要求与指南: +1.请首先定位评分细则的表格,不要回答有关资格审查、符合性审查的内容,也不要从评标办法正文中(表格外)提取回答 +2.若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如"技术评分":"本项目无技术评分项" +3. 如果该招标活动有多个包,则最外层键名为对应的包名,否则不需要 +4.你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体。 +5. 
'评分'的键值不能是一个范围数字,如'0-5分',应该是一个具体数字,如'5分',或者是一个定性的指标如'合格制' + +以下为示例输出,仅供格式参考: { "一包": { "技术评分": { From dd18c0d47b8c179d53787bc160f3e520c2e01cca Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Fri, 29 Nov 2024 15:02:07 +0800 Subject: [PATCH 6/8] =?UTF-8?q?11.29=20=E4=BC=98=E5=8C=96=E6=8F=90?= =?UTF-8?q?=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/接口_技术偏离表.py | 60 ++++++------ flask_app/货物标/商务服务其他要求提取.py | 117 ++++++++++------------- 2 files changed, 79 insertions(+), 98 deletions(-) diff --git a/flask_app/general/接口_技术偏离表.py b/flask_app/general/接口_技术偏离表.py index 42e682f..c3efbe3 100644 --- a/flask_app/general/接口_技术偏离表.py +++ b/flask_app/general/接口_技术偏离表.py @@ -174,38 +174,40 @@ def extract_business_deviation(procurement): business_requirements_string = json.dumps(new_data, ensure_ascii=False, indent=4) # print(business_requirements_string) - prompt_template1 = """请帮我从以下文本中摘取商务要求部分,并将信息重新组织,外键名为'商务要求',键值为字符串列表,其中每个字符串为一条商务要求,去除开头的序号(若有)。 -#角色 -你是一个专业的招投标业务专家,擅长从招标文件中总结商务要求的部分,并逐条列出,作为编写商务要求偏离表的前置准备。 + prompt_template1 = """请帮我从以下文本中摘取商务要求部分,并将信息重新组织,外键名为'商务要求',键值为字符串列表,其中每个字符串为一条商务要求,保留三角▲、五角星★(若有),但是去除开头的序号(若有)。 + #角色 + 你是一个专业的招投标业务专家,擅长从招标文件中总结商务要求的部分,并逐条列出,作为编写商务要求偏离表的前置准备。 -#要求与指南: -1. 每条内容需要有实际的含义、要求,不能光有标题性质的表述如'售后服务期限(质保期)及要求'。 -2. 你的回答内容需从所给文本中整理,尽量不改变原文的表达,对于嵌套键值对,若键值本身符合'商务要求',可直接将其返回;若键值本身语义表达不完整,可将键值对拼接之后作为一条商务要求,拼接符号可以是冒号,即':'。 -3. 若无商务要求,键值为空列表,即[] + #要求与指南: + 1. 每条内容需要有实际的含义、要求,不能光有标题性质的表述如'售后服务期限(质保期)及要求'。 + 2. 你的回答内容需从所给文本中整理,尽量不改变原文的表达,请勿擅自添加三角▲、五角星★;若输入文本中存在嵌套键值对格式,且键值本身符合'商务要求',可直接将其添加至'商务要求'的键值中;若键值本身语义表达不完整,可将键值对拼接之后作为一条商务要求,拼接符号可以是冒号,即':'。 + 3. 输入文本中出现三角▲或五角星★开头的地方,请格外重视,不可漏提,若其后面的内容是标题性质的表述、不具备实际的要求,请你保留三角▲或五角星★,根据语义添加至紧跟着的字符串开头中。 + 3. 
若无商务要求,键值为空列表,即[] -### 示例输入如下: -{{ - "招标要求1": "整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行,并对线上发现违规操作进行记录,通过督察平台推送督办单给线下监督员小程序进行检查。" - "招标要求2": {{ - "合同履行期限": "交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。", - "交货地点": "采购人指定地点", - "报价方式": "本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。", - "其他要求": "无。" - }}, - "招标要求3": "路口必须在各方向埋设双管。" -}} -### 对应的参考输出如下: -{{ - "商务要求":[ - "整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行,并对线上发现违规操作进行记录,通过督察平台推送督办单给线下监督员小程序进行检查。", - "交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。", - "交货地点:采购人指定地点", - "本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。" - ] -}} -文本内容:{full_text} -""" + ### 示例输入如下: + {{ + "招标要求1": ["▲(1)整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行,并对线上发现违规操作进行记录,通过督察平台推送督办单给线下监督员小程序进行检查。"] + "招标要求2": {{ + "合同履行期限": ["★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。"], + "交货地点": ["采购人指定地点"], + "报价方式": ["(1)本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。","(2)因投标人自身原因造成漏报、少报皆由其自行承担责任,采购人不再补偿。"], + "其他要求": ["无。"] + }} + }} + ### 对应的参考输出如下: + {{ + "商务要求":[ + "▲整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行,并对线上发现违规操作进行记录,通过督察平台推送督办单给线下监督员小程序进行检查。", + "★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。", + "交货地点:采购人指定地点", + "本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。", + "因投标人自身原因造成漏报、少报皆由其自行承担责任,采购人不再补偿。" + ] + }} + + 文本内容:{full_text} + """ user_query1 = prompt_template1.format(full_text=business_requirements_string) model_res1 = doubao_model(user_query1) # print(model_res) diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index 3691297..f0a90c4 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -77,30 +77,46 @@ def find_exists(truncate_file, required_keys): matched_requirements = [] punctuation = r"[,。?!、;:,.?!]*" for req in required_keys: - # required_keys 中的元素本身已包含 \s*,直接作为正则模式 if re.search(req, relevant_text): - if req == "服\s*务\s*要\s*求": - # 提取所有包含"服务要求"的行 + # 替换特定的关键词 + if req in [r"总\s*体\s*要\s*求", r"建\s*设\s*要\s*求"]: + # 匹配到“总体要求”或“建设要求”时,添加“技术要求”而非原关键词 + tech_req = r"技\s*术\s*要\s*求" + if tech_req not in matched_requirements: + matched_requirements.append(tech_req) + elif req in [r"培\s*训\s*要\s*求",r"质\s*保\s*要\s*求",r"售\s*后\s*要\s*求"]: + # 匹配到“培训要求”时,添加“服务要求”而非原关键词 + service_req = r"服\s*务\s*要\s*求" + if service_req not in matched_requirements: + matched_requirements.append(service_req) + elif req in [r"进\s*度\s*要\s*求",r"工\s*期\s*要\s*求"]: + busi_req = r"商\s*务\s*要\s*求" + if busi_req not in matched_requirements: + matched_requirements.append(busi_req) + elif req == r"服\s*务\s*要\s*求": + # 处理“服务要求”时的特殊逻辑 lines = [line for line in relevant_text.split('\n') if re.search(req, line)] - # 检查是否存在'技术'紧跟在'服务要求'前面(中间只有标点,标点是可选的) - pattern = "技\s*术" + punctuation + req + pattern = r"技\s*术" + punctuation + req if any(re.search(pattern, line) for line in lines): - # 如果存在'技术'紧跟'服务要求',添加"技术、服务要求" - if "技\s*术\s*、\s*服\s*务\s*要\s*求" not in matched_requirements: - matched_requirements.append("技\s*术\s*、\s*服\s*务\s*要\s*求") + combined_req = r"技\s*术\s*、\s*服\s*务\s*要\s*求" + if combined_req not in matched_requirements: + matched_requirements.append(combined_req) else: - # 如果不存在'技术'紧跟'服务要求',正常添加"服务要求" - matched_requirements.append(req) + if req not in matched_requirements: + matched_requirements.append(req) else: - matched_requirements.append(req) - + # 对于其他匹配的关键词,直接添加 + if req not in matched_requirements: + matched_requirements.append(req) # 去除 \s*,仅返回原始关键词 clean_requirements = [re.sub(r'\\s\*', '', req) for req in matched_requirements] # 
判断互斥关系:如果有"技术、服务要求",删除"技术要求"和"服务要求" if "技术、服务要求" in clean_requirements: clean_requirements = [req for req in clean_requirements if req not in ["技术要求", "服务要求"]] - + # 确保最终返回的列表仅包含指定的五项 + # allowed_requirements = {"技术要求", "服务要求", "商务要求", "其他要求", "技术、服务要求"} + # final_requirements = [req for req in clean_requirements if req in allowed_requirements] return clean_requirements @@ -125,10 +141,7 @@ def generate_template(required_keys, type=1): "技术要求": ["相关技术要求1", "相关技术要求2"], "服务要求": ["服务要求1", "服务要求2", "服务要求3"], "商务要求": ["商务要求1", "商务要求2"], - "其他要求": { - "子因素名1": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2..."], - "子因素名2": ["关于项目采购的其他要求3..."] - }, + "其他要求": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2...", "关于项目采购的其他要求3..."], "技术、服务要求": ["相关技术、服务要求内容1", "相关技术、服务要求内容2", "相关技术、服务要求内容3"] } example_content2 = { @@ -143,7 +156,10 @@ def generate_template(required_keys, type=1): "子因素名1": ["商务要求1"], "子因素名2": ["商务要求2"] }, - "其他要求": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2...", "关于项目采购的其他要求3..."], + "其他要求": { + "子因素名1": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2..."], + "子因素名2": ["关于项目采购的其他要求3..."] + }, "技术、服务要求": { "子因素名1": ["相关技术、服务要求内容1"], "子因素名2": ["相关技术、服务要求内容2", "相关技术、服务要求内容3"] @@ -180,29 +196,31 @@ def generate_template(required_keys, type=1): def generate_prompt_instruction(keys_str, outer_keys_str, another_keys_str, type): if type == 1: specific_instructions = textwrap.dedent( - """4. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外,还有带三角▲或五角星★的描述内容(如工期要求、质保要求等商务要求),请将该部分内容提取出来,添加在外层键名为'商务要求'的键值部分。 - 5. 在提取'服务要求'的时候,通常包含'售后、维护、培训'等要求,若原文中有这些要求,请一并提取置于'服务要求'的键值中,。 + """4. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外,还有带三角▲或五角星★的描述内容(如工期要求、进度要求、品牌要求等商务要求),也请将该部分内容提取出来,添加在外层键名为'商务要求'的键值部分;若存在标题包含'工期要求'、'进度要求'等和商务要求有关的关键字,也请将该标题下的内容提取,添加在外层键名为'商务要求'的键值部分。请不要遗漏这部分的'商务要求'。 + 5. 在提取'服务要求'的时候,若原文(包含正文和表格)中存在'安装要求'、'售后要求'、'维护要求'、'培训要求'等服务相关的要求说明,请添加至'服务要求'的键值部分,不要遗漏这部分的'服务要求'。 """ ) else: specific_instructions = textwrap.dedent( - """4. 在提取技术要求或技术、服务要求时,你无需从采购清单或表格中提取技术要求以及参数要求,你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容;若技术要求的内容全在表格中,键值为空列表[]。 - 5. 在提取'技术要求'时,注意不要提取有关'售后、维护、运维、培训、质保'等要求,它们不属于'技术要求'。 + """4. 在提取技术要求或技术、服务要求时,你无需从采购清单或表格中提取技术要求以及参数要求,你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容,从正文中提取;若技术要求或技术服务要求的内容全在表格中,键值为空列表[]。 + 5. 在提取'技术要求'时,注意不要提取有关'安装、售后、维护、运维、培训、质保'等要求,它们不属于'技术要求',但是若大标题中包含'总体要求''建设要求'等和技术要求相关的关键字,请添加到'技术要求'的键值部分。 """ ) return textwrap.dedent( - f"""请你根据该货物类招标文件中的采购要求部分内容,请告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,可以按原文中的序号作划分(若有序号的话),请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增添内容及序号。请不要提取{another_keys_str}中的内容。 + f"""请你根据该货物类招标文件中的采购要求部分内容,请告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,可以按原文中的序号作划分(若有序号的话),请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增添内容及序号。注意:1. 若相应要求下存在子标题表示子要求因素,可以将它忽略而不是将它与下文具体要求进行多行合并,或者作为该要求下的嵌套键名,总之字符串列表中只提取具体的要求。2. 请不要提取{another_keys_str}中的内容。 要求与指南: 1. JSON 的结构要求: - - 默认情况无需嵌套键值对,键值为字符串列表;若存在嵌套结构(即有明确标题表示各子要求),则嵌套键名是原文中该要求下相应子标题,最多一层嵌套。 + - 默认情况无需嵌套键值对,键值为字符串列表;若存在嵌套结构(即有明确标题表示各子要求),则嵌套键名是各子要求,嵌套键值为字符串列表,为该子要求下的若干要求,最多一层嵌套。 - 每个外层键对应的值可以是: a. 一个字符串列表,表示具体的一条条要求。若只有一条要求,也用字符串列表表示。 b. 一个对象(字典),其键为子因素名,值为字符串列表。 + c. 若文档中无符合的相关要求,键值为空列表[] - 最多只允许一层嵌套。 2. 请优先且准确定位正文部分包含以下关键字的标题:{outer_keys_str},在其之后提取'XX要求'相关内容,尽量避免在无关地方提取内容。 - 3. 注意请不要返回Markdown语法,必要时使用冒号':'将相关信息拼接在一起。若文档中无符合的相关要求,键值为空列表[] + 3. 注意请不要返回Markdown表格语法,必要时使用冒号':'将相关信息拼接在一起 {specific_instructions} + 6. 
-def merge_requirements(input_dict):
-    # 初始化一个临时字典,用于存储标准化后的键
-    temp_dict = {}
-    # 初始化最终字典,只包含指定的四个键
-    final_keys = ['技术要求', '商务要求', '服务要求', '其他要求']
-    final_dict = {key: "" for key in final_keys}
-
-    # 如果输入字典中有'其他要求',保留其内容
-    if '其他要求' in temp_dict and temp_dict['其他要求'].strip():
-        final_dict['其他要求'] = temp_dict['其他要求'].strip()
-
-    # 处理'技术要求', '商务要求', '服务要求'
-    for key in ['技术要求', '商务要求', '服务要求']:
-        if key in temp_dict:
-            final_dict[key] = temp_dict[key].strip()
-
-    # 收集需要合并到'其他要求'的内容
-    merge_keys = ['总体要求', '进度要求', '培训要求']
-    merged_contents = []
-    for key in merge_keys:
-        if key in temp_dict and temp_dict[key].strip():
-            merged_contents.append(temp_dict[key].strip())
-
-    # 如果有需要合并的内容
-    if merged_contents:
-        merged_text = " ".join(merged_contents)
-        if final_dict['其他要求']:
-            final_dict['其他要求'] += " " + merged_text
-        else:
-            final_dict['其他要求'] = merged_text
-
-    # 移除多余的空格
-    for key in final_dict:
-        final_dict[key] = final_dict[key].strip()
-
-    return final_dict
-
-
-# ,"总\s*体\s*要\s*求","进\s*度\s*要\s*求","培\s*训\s*要\s*求"
 def get_business_requirements(procurement_path,procurement_docpath):
     file_id = upload_file(procurement_docpath)
     print(file_id)
-    required_keys = ["技\s*术\s*要\s*求", "商\s*务\s*要\s*求", "服\s*务\s*要\s*求", "其\s*他\s*要\s*求"]
+    required_keys = [r"技\s*术\s*要\s*求", r"商\s*务\s*要\s*求", r"服\s*务\s*要\s*求", r"其\s*他\s*要\s*求", r"总\s*体\s*要\s*求", r"建\s*设\s*要\s*求", r"进\s*度\s*要\s*求", r"工\s*期\s*要\s*求", r"质\s*保\s*要\s*求", r"培\s*训\s*要\s*求", r"售\s*后\s*要\s*求"]
     contained_keys = find_exists(procurement_path, required_keys)
     print(contained_keys)
     if not contained_keys:
@@ -301,8 +280,8 @@ def get_business_requirements(procurement_path,procurement_docpath):
 # TODO:改为先判断,再摘取
 if __name__ == "__main__":
     # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
-    truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\交警支队机动车查验监管系统项目采购_procurement.pdf"
-    docx_path=r'C:\Users\Administrator\Desktop\货物标\output1\交警支队机动车查验监管系统项目采购_procurement.docx'
+    truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf"
+    docx_path = r'C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.docx'
     # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
     # file_id = upload_file(truncate_file)
     processed_filepath = pdf2txt(truncate_file)
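
Review note: the widened required_keys list repeats the "insert \s* between every character" pattern by hand. Assuming the keys stay plain keyword strings, the patterns could be derived with a small helper (hypothetical, not part of this patch):

    def spaced_pattern(keyword: str) -> str:
        # "技术要求" -> r"技\s*术\s*要\s*求": tolerate whitespace between characters.
        return r"\s*".join(keyword)

    required_keys = [spaced_pattern(k) for k in (
        "技术要求", "商务要求", "服务要求", "其他要求", "总体要求", "建设要求",
        "进度要求", "工期要求", "质保要求", "培训要求", "售后要求",
    )]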
"技术、服务要求"),都将被保留 procurement_reqs.update(business_requirements) # 如果需要确保所有默认键存在,可以取消下面的注释 From bcffbc914386785bbaf0560d233a774ee79ddb4d Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Fri, 29 Nov 2024 15:13:54 +0800 Subject: [PATCH 8/8] =?UTF-8?q?11.29=20=E4=BC=98=E5=8C=96=E6=8F=90?= =?UTF-8?q?=E7=A4=BA=E8=AF=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/货物标/提取采购需求main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index c846c0c..d4f05c6 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -27,7 +27,7 @@ def fetch_procurement_reqs(procurement_path, invalid_path): return DEFAULT_PROCUREMENT_REQS.copy() try: - # processed_filepath = convert_pdf_to_markdown(procurement_path) # 转markdown格式 + # processe:d_filepath = convert_pdf_to_markdown(procurement_path) # 转markdown格式 processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: