From 493e8abf3134f967e2e47acb662946050ef16546 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 28 Nov 2024 13:38:17 +0800 Subject: [PATCH] =?UTF-8?q?11.28=20=E6=B8=85=E7=90=86file=5Fid?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../old_version/不得存在及禁止投标情形_old.py | 172 +++++++ flask_app/old_version/判断截取位置_old.py | 266 +++++++++++ flask_app/old_version/回答来源_old.py | 216 +++++++++ flask_app/old_version/基础信息整合_old.py | 166 +++++++ flask_app/old_version/多线程分类_old.py | 373 +++++++++++++++ flask_app/old_version/废标项_old.py | 66 +++ flask_app/old_version/招标文件解析_old.py | 305 ++++++++++++ flask_app/old_version/文件分类普通版_old.py | 350 ++++++++++++++ .../删除知识库_old.py | 55 +++ .../知识库操作_old.py | 58 +++ .../调用文件解析状态查询_old.py | 33 ++ .../调用文档解析异步提交_old.py | 40 ++ .../调用文档解析结果获取_old.py | 35 ++ .../无效标和废标和禁止投标整合_old.py | 440 ++++++++++++++++++ flask_app/old_version/解析old_old.py | 221 +++++++++ flask_app/old_version/资格审查模块old_old.py | 42 ++ flask_app/old_version/资格评审old_old.py | 182 ++++++++ flask_app/old_version/资格评审前判断_old.py | 29 ++ flask_app/old_version/通义千问_old.py | 48 ++ 19 files changed, 3097 insertions(+) create mode 100644 flask_app/old_version/不得存在及禁止投标情形_old.py create mode 100644 flask_app/old_version/判断截取位置_old.py create mode 100644 flask_app/old_version/回答来源_old.py create mode 100644 flask_app/old_version/基础信息整合_old.py create mode 100644 flask_app/old_version/多线程分类_old.py create mode 100644 flask_app/old_version/废标项_old.py create mode 100644 flask_app/old_version/招标文件解析_old.py create mode 100644 flask_app/old_version/文件分类普通版_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py create mode 100644 flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py create mode 100644 
flask_app/old_version/无效标和废标和禁止投标整合_old.py create mode 100644 flask_app/old_version/解析old_old.py create mode 100644 flask_app/old_version/资格审查模块old_old.py create mode 100644 flask_app/old_version/资格评审old_old.py create mode 100644 flask_app/old_version/资格评审前判断_old.py create mode 100644 flask_app/old_version/通义千问_old.py diff --git a/flask_app/old_version/不得存在及禁止投标情形_old.py b/flask_app/old_version/不得存在及禁止投标情形_old.py new file mode 100644 index 0000000..9a4fe1b --- /dev/null +++ b/flask_app/old_version/不得存在及禁止投标情形_old.py @@ -0,0 +1,172 @@ +import json +import os +import re + +from PyPDF2 import PdfWriter, PdfReader + +from flask_app.general.通义千问long import upload_file, qianwen_long +from flask_app.general.通用功能函数 import process_string_list + + +def extract_and_format_from_paths(json_paths, includes, excludes): + """ + 从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。 + + 参数: + json_paths (list): 包含多个 JSON 文件路径的列表。 + includes (list): 包含要检查的关键词的列表。 + excludes (list): 包含要排除的关键词的列表。 + + 返回: + list: 包含所有文件中满足条件的格式化字符串列表。 + """ + all_formatted_results = [] + + # 遍历每个文件路径 + for path in json_paths: + # 检查路径是否为空或不存在 + if not path or not os.path.isfile(path): + print(f"禁止投标情形: Error: The file path '{path}' is invalid or does not exist.") + continue # 跳过无效路径 + try: + with open(path, 'r', encoding='utf-8') as file: + # 加载 JSON 数据 + json_data = json.load(file) + formatted_results = [] + + # 遍历 JSON 数据的每个键值对 + for key, value in json_data.items(): + if isinstance(value, dict): + # 如果值是字典,检查嵌套字典的每个键值对 + for sub_key, sub_value in value.items(): + if any(include in sub_key for include in includes): + # 如果子值包含关键词,格式化并添加到结果列表 + formatted_results.append(f"{sub_value}") + elif isinstance(value, str): # clause + # 检查是否包含任何 include 关键词 + for include in includes: + if include in value: + # 找到 include 之前的内容 + prefix = value.split(include)[0] + # 检查 prefix 是否不包含任何 exclude 关键词 + if not any(exclude in prefix for exclude in excludes): + # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表 + if '\n' in value: + value = 
def extract_unique_items_from_texts(texts):
    """
    Split clause texts into individual list items and return them de-duplicated.

    For every text the function removes tabs/newlines, masks URLs, drops the
    introductory phrase (everything up to and including the first colon),
    splits on enumeration markers such as ``1.``, ``(1)``, ``1)`` or circled
    numbers, strips trailing punctuation, restores the URLs and keeps each
    item the first time it is seen.

    Args:
        texts (iterable of str): raw clause texts.

    Returns:
        list of str: unique items in first-seen order. An item is kept only
        if it is longer than 3 characters and contains at least two
        consecutive letters, digits or CJK characters.
    """
    # Enumeration markers, including circled numbers.
    pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*')
    intro_pattern = re.compile(r'^.*?[::]')
    punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$')
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # At least two consecutive letters, digits or CJK characters.
    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')

    all_results = []
    seen = set()

    for text in texts:
        # Remove tabs and line breaks.
        text = text.replace('\t', '').replace('\n', '')

        # Mask URLs BEFORE stripping the intro: otherwise the ':' of
        # "http://" can be taken for the end of the introductory phrase and
        # the URL is destroyed. The placeholders contain no colon and no
        # enumeration marker, so the following steps leave them intact.
        urls = []

        def url_replacer(match):
            urls.append(match.group(0))
            return f"{{URL{len(urls)}}}"

        text = url_pattern.sub(url_replacer, text)

        # Drop the introductory phrase (up to and including the first colon).
        text = intro_pattern.sub('', text)

        # Split on the enumeration markers.
        for item in pattern.split(text):
            cleaned_item = item.strip()
            if not cleaned_item:
                continue

            # Remove leftover markers and trailing punctuation.
            cleaned_item = pattern.sub('', cleaned_item)
            cleaned_item = punctuation_pattern.sub('', cleaned_item)
            cleaned_item = cleaned_item.strip()

            # Put the original URLs back.
            for i, url in enumerate(urls, 1):
                cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)

            # Keep only new items with enough real content.
            if (
                cleaned_item
                and cleaned_item not in seen
                and len(cleaned_item) > 3
                and content_check_pattern.search(cleaned_item)
            ):
                seen.add(cleaned_item)
                all_results.append(cleaned_item)

    return all_results
PdfReader(path) + for page in range(len(pdf_reader.pages)): + # 将每页添加到写入对象中 + pdf_writer.add_page(pdf_reader.pages[page]) + + if output_path is None: + # 确定输出文件路径 + output_path = os.path.join(os.path.dirname(path), output_filename) + + # 写入合并的PDF到文件 + if output_path: + with open(output_path, 'wb') as out: + pdf_writer.write(out) + print(f"禁止投标情形: Merged PDF saved to {output_path}") + else: + print("禁止投标情形: No files to merge.") + return output_path + + +def find_forbidden(qualification=""): + try: + if qualification: + file_id = upload_file(qualification) + user_query_forbidden = ( + "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...]," + "请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。" + ) + qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden) + else: + qianwen_forbidden_str = "[]" + + actual_list = process_string_list(qianwen_forbidden_str) # 提取出字符串列表 ["xxx","xx"] + includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"] + excludes = ["招标", "评标", "定标"] + # forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes) + # processed_results = extract_unique_items_from_texts(forbidden_results) + # merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) + merged_forbidden_list = list(dict.fromkeys(actual_list)) + forbidden_dict = {'不得存在的其他情形': merged_forbidden_list} + return forbidden_dict + except Exception as e: + print(f"find_forbidden 在处理时发生异常: {e}") + return {'不得存在的其他情形': ""} + +#TODO:不得存在的情况文中有很多内容,货物标中采用了全文搜索的逻辑。 +if __name__ == '__main__': + # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" + # clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" + output_dir = 
"C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f" + doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx' + res = find_forbidden(qualification) + print(json.dumps(res, ensure_ascii=False, indent=4)) \ No newline at end of file diff --git a/flask_app/old_version/判断截取位置_old.py b/flask_app/old_version/判断截取位置_old.py new file mode 100644 index 0000000..c0b1390 --- /dev/null +++ b/flask_app/old_version/判断截取位置_old.py @@ -0,0 +1,266 @@ +# -*- encoding:utf-8 -*- +import json +import re +import os +import sys +from pathlib import Path +from openai import OpenAI +import concurrent.futures +import queue +import time + +def qianwen_long(file_id, user_query): + print("call qianwen-long...") + """ + Uses a previously uploaded file to generate a response based on a user query. + """ + client = OpenAI( + api_key=os.getenv("DASHSCOPE_API_KEY"), + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + ) + + # Generate a response based on the file ID + completion = client.chat.completions.create( + model="qwen-long", + top_p=0.5, + temperature=0.4, + messages=[ + { + 'role': 'system', + 'content': f'fileid://{file_id}' + }, + { + 'role': 'user', + 'content': user_query + } + ], + stream=False + ) + + # Return the response content + return completion.choices[0].message.content + +def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type): + + if llm_type==2: + print(f"qianwen_long! 
question:{question}") + qianwen_res = qianwen_long(file_id,question) + result_queue.put((ans_index,(question,qianwen_res))) + return +def multi_threading(queries, knowledge_name="", file_id="", llm_type=1): + if not queries: + return [] + + print("多线程提问:starting multi_threading...") + result_queue = queue.Queue() + + # 使用 ThreadPoolExecutor 管理线程 + with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor: + # 逐个提交任务,每提交一个任务后休眠1秒 + future_to_query = {} + for index, query in enumerate(queries): + future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type) + future_to_query[future] = index + time.sleep(1) # 每提交一个任务后等待1秒 + + # 收集每个线程的结果 + for future in concurrent.futures.as_completed(future_to_query): + index = future_to_query[future] + try: + future.result() # 捕获异常或确认任务完成 + except Exception as exc: + print(f"Query {index} generated an exception: {exc}") + # 确保在异常情况下也向 result_queue 添加占位符 + result_queue.put((index, None)) + + # 从队列中获取所有结果并按索引排序 + results = [None] * len(queries) + while not result_queue.empty(): + index, result = result_queue.get() + results[index] = result + # 检查是否所有结果都是 None + if all(result is None for result in results): + return [] + # 过滤掉None值 + results = [r for r in results if r is not None] + # 返回一个保证是列表的结构 + return results + +def upload_file(file_path): + """ + Uploads a file to DashScope and returns the file ID. 
+ """ + client = OpenAI( + api_key=os.getenv("DASHSCOPE_API_KEY"), + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + ) + file = client.files.create(file=Path(file_path), purpose="file-extract") + return file.id + +def load_data(json_path): + """ + 从指定的JSON文件中加载数据。 + + Args: + json_path (str): JSON文件的路径。 + + Returns: + dict: 加载的JSON数据字典。 + """ + try: + with open(json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + return data + except FileNotFoundError: + print(f"错误:文件未找到 - {json_path}") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"错误:解析JSON文件时出错 - {e}") + sys.exit(1) + + +def define_target_names(): + """ + 定义目标名称列表。 + + Returns: + list: 目标名称列表。 + """ + return [ + "营业执照", + # "开户信息", + "法定代表人身份证", + # "法定代表人授权人身份证", + "人员证书", + "人员社保资料", + # "劳动合同", + "企业证书", + "企业业绩", + "财务审计报告", + "缴纳税收证明", + "公司缴纳社保证明" + ] + + +def generate_user_query(target, chapters, keywords): + """ + 根据目标、章节和关键词生成用户查询模板。 + + Args: + target (str): 目标名称。 + chapters (list): 相关章节列表。 + keywords (list): 相关关键词列表。 + + Returns: + str: 生成的用户查询字符串。 + """ + template3 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},并确认插入的位置。我已在原文中打上若干待插入位置的标记,形如'[$$第5个待插入位置$$]',它的标记与它上面的小节内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$第5个待插入位置$$]'中的'5',而不是页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[17, 19],若插入位置不明确,那么返回[-1]。 +""" + template2 = f"""你是一个专业撰写投标文件的专家,这是投标文件模板,作为投标人,你需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},或者根据文中'备注'或'附'内的信息确认插入的位置。目前原文中各章节末尾已打上待插入位置的标记,标记格式为'[$$当前章节名:第x个待插入位置$$]'形如'[$$四、投标保证金:第4个待插入位置$$]',每个标记与它紧跟着的上面的这一章内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$四、投标保证金:第4个待插入位置$$]'中的'4',而不是当前页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[4, 5],若插入位置不明确,那么返回[-1]。 + """ + template1 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in 
def process_string_list(string_list):
    """
    Extract the first ``[...]`` group from a string and turn it into a list.

    The content between the first pair of square brackets is split on ``,``.
    If every element is an integer literal (an optional leading ``-``
    followed by digits) the elements are converted to ``int``; otherwise the
    stripped strings are returned unchanged.

    Negative numbers must be recognised because the prompt built by
    ``generate_user_query`` asks the model to answer ``[-1]`` when the
    insert position is unclear; ``str.isdigit()`` rejects the minus sign and
    would leave that sentinel as the string ``'-1'``.

    Args:
        string_list (str): text that contains a bracketed list.

    Returns:
        list: list of ints, list of strings, or ``[]`` when there are no
        brackets or the brackets are empty.
    """
    match = re.search(r'\[(.*?)\]', string_list)
    if not match:
        return []

    content_inside = match.group(1).strip()
    if not content_inside:
        return []

    items = [item.strip() for item in content_inside.split(',')]
    if all(re.fullmatch(r'-?\d+', item) for item in items):
        return [int(item) for item in items]
    return items
print(f"错误:文件未找到 - {format_part}") + sys.exit(1) + + # 上传文件并获取file_id + file_id = upload_file(format_part) + + if not file_id: + print("错误:文件上传失败。") + sys.exit(1) + + # 使用多线程并行处理查询 + results = multi_threading(queries, "", file_id,2) + + if not results: + print("错误:未收到任何处理结果。") + sys.exit(1) + + # 清理返回结果 + baseinfo_list = [process_string_list(res) for _, res in results] + + # 输出结果 + for i, info in enumerate(baseinfo_list): + print(f'{target_names[i]}:{info}') + +if __name__ == "__main__": + main() diff --git a/flask_app/old_version/回答来源_old.py b/flask_app/old_version/回答来源_old.py new file mode 100644 index 0000000..27bb7fe --- /dev/null +++ b/flask_app/old_version/回答来源_old.py @@ -0,0 +1,216 @@ +#基于多线程提问,现已废弃 +# assistant_id +import queue +import concurrent.futures +from dashscope import Assistants, Messages, Runs, Threads +from llama_index.indices.managed.dashscope import DashScopeCloudRetriever +from flask_app.general.json_utils import extract_content_from_json +prompt = """ +# 角色 +你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 + +## 技能 +### 技能 1:文档解析与摘要 +- 深入理解并分析${document1}的内容,提取关键信息。 +- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 + +### 技能 2:信息检索与关联 +- 在${document1}中高效检索特定信息或关键词。 +- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 + +## 限制 +- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。 +- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 +- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 + +请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。 +""" +prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}' + + + + +#正文和文档名之间的内容 +def extract_content_between_tags(text): + results = [] + + # 使用“【正文】”来分割文本 + parts = text.split('【正文】')[1:] # 跳过第一个分割结果,因为它前面不会有内容 + + for index, part in enumerate(parts): + # 查找“【文档名】”标签的位置 + doc_name_index = part.find('【文档名】') + # 查找 'file_ids' 标签的位置 + file_ids_index = part.find("'file_ids'") + + # 根据是否找到“【文档名】”来决定提取内容的截止点 + if doc_name_index != -1: + end_index = doc_name_index + elif file_ids_index != -1: + end_index = file_ids_index + else: + end_index = len(part) + + # 提取内容 + content = 
def find_references_in_extracted(formatted_ans, extracted_references):
    """
    Map every leaf of a (possibly nested) answer dict to the reference that contains it.

    Args:
        formatted_ans (dict): answer dict; nested dicts are walked recursively.
        extracted_references (list of str): reference passages to search.

    Returns:
        dict: keys are the dot-joined paths of the leaves
        (e.g. ``"outer.inner"``); values are
        ``-1`` when the leaf value is the literal ``'未知'``,
        the index of the first reference containing ``str(value)``,
        or ``None`` when no reference contains it.
    """
    results = {}

    def _walk(node, path):
        # `path` is an immutable tuple passed explicitly on every call, so
        # no mutable default argument is shared between invocations.
        for key, value in node.items():
            current = path + (key,)
            if isinstance(value, dict):
                _walk(value, current)
                continue

            dotted = '.'.join(current)
            if value == '未知':
                # Unknown values are never looked up.
                results[dotted] = -1
                continue

            needle = str(value)  # values may be non-strings
            results[dotted] = next(
                (index for index, reference in enumerate(extracted_references)
                 if needle in reference),
                None,
            )

    _walk(formatted_ans, ())
    return results
+ msgs = Messages.list(thread.id) + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + return ans,extracted_references + +def rag_assistant(knowledge_name): + retriever = DashScopeCloudRetriever(knowledge_name) + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + temperature='0.3', + instructions=prom, + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + } + } + } + }] + ) + return assistant + + +def pure_assistant(): + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,能基于用户的要求精准简洁地回答用户的提问', + instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问', + tools=[ + { + "type": "code_interpreter" + }, + ] + ) + return assistant + +def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True): + if use_rag: + assistant = rag_assistant(knowledge_name) + else: + assistant = pure_assistant() + ans,extracted_references = send_message(assistant, message=question) + for index, reference in enumerate(extracted_references, start=0): + print(f"{index}. 
{reference}") + formatted_ans=extract_content_from_json(ans[1]) + print(formatted_ans) + + results = find_references_in_extracted(formatted_ans, extracted_references) + for key, index in results.items(): + print(f"{key}: Found at index {index}") + result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) + +def multi_threading(queries, knowledge_name, use_rag=True): + result_queue = queue.Queue() + + # 使用 ThreadPoolExecutor 管理线程 + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + # 使用字典保存每个提交的任务的Future对象,以便按顺序访问结果 + future_to_query = {executor.submit(llm_call, query, knowledge_name, result_queue, index, use_rag): index for + index, query in enumerate(queries)} + + # 收集每个线程的结果 + for future in concurrent.futures.as_completed(future_to_query): + index = future_to_query[future] + # 由于 llm_call 函数本身会处理结果,这里只需要确保任务执行完成 + try: + future.result() # 可以用来捕获异常或确认任务完成 + except Exception as exc: + print(f"Query {index} generated an exception: {exc}") + + # 从队列中获取所有结果并按索引排序 + results = [None] * len(queries) + while not result_queue.empty(): + index, result = result_queue.get() + results[index] = result + return results + +if __name__ == "__main__": + # 读取问题列表 + questions = ["该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] + knowledge_name = "招标解析5word" + results = multi_threading(questions, knowledge_name, use_rag=True) + # 打印结果 + for question, response in results: + print(f"Question: {question}") + print(f"Response: {response}") diff --git a/flask_app/old_version/基础信息整合_old.py b/flask_app/old_version/基础信息整合_old.py new file mode 100644 index 0000000..e8be244 --- /dev/null +++ b/flask_app/old_version/基础信息整合_old.py @@ -0,0 +1,166 @@ +import json + +from flask_app.general.json_utils import clean_json_string, rename_outer_key +from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge +from 
def aggregate_basic_info_engineering(baseinfo_list):
    """
    Merge a list of basic-info dicts and nest the result under fixed groups.

    All dicts are merged in list order (a later dict overrides an earlier
    one on duplicate keys). ``dynamic_key_handling`` may then add detected
    keys to the groups. Every group key that is missing from the merged
    data gets the placeholder ``"未提供"``; every merged key that belongs to
    no group is added to the ``"其他信息"`` group.

    Args:
        baseinfo_list (list of dict): the individual basic-info results.

    Returns:
        dict: group name -> dict of that group's key/value pairs.
    """
    key_groups = {
        "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"],
        "项目信息": ["项目名称", "招标编号", "项目概况", "招标范围", "招标控制价", "投标竞争下浮率"],
        "关键时间/内容": [
            "投标文件递交截止日期",
            "投标文件递交方式",
            "开标时间",
            "开标地点",
            "投标人要求澄清招标文件的截止时间",
            "投标有效期",
            "评标结果公示媒介"
        ],
        "保证金相关": ["质量保证金", "退还投标保证金"],
        "其他信息": [
            "重新招标、不再招标和终止招标",
            "投标费用承担",
            "招标代理服务费",
            "是否退还投标文件",
        ]
    }

    # Merge everything; later entries win on duplicate keys.
    combined_data = {}
    for baseinfo in baseinfo_list:
        combined_data.update(baseinfo)

    # Hand the detected keys over in insertion order. A set would be
    # iterated in hash order, so the position of dynamically added keys
    # inside a group could differ from run to run.
    dynamic_key_handling(key_groups, list(combined_data))

    # Built once instead of re-flattening the groups for every merged key.
    grouped_keys = {key for keys in key_groups.values() for key in keys}
    unclassified_items = {
        key: value for key, value in combined_data.items() if key not in grouped_keys
    }

    # Nest the values under their group, filling gaps with the placeholder.
    for group_name, keys in key_groups.items():
        combined_data[group_name] = {key: combined_data.get(key, "未提供") for key in keys}

    # Whatever belongs to no group ends up in "其他信息".
    combined_data["其他信息"].update(unclassified_items)

    # Drop the flat top-level entries so only the groups remain.
    for key in list(combined_data.keys()):
        if key not in key_groups:
            del combined_data[key]

    return combined_data
elif "联合体" in key: + key_groups["项目信息"].append(key) + elif "分包" in key: + key_groups["项目信息"].append(key) + elif "踏勘现场" in key: + key_groups["其他信息"].append(key) + elif "投标预备会" in key: + key_groups["其他信息"].append(key) + elif "偏离" in key: + key_groups["其他信息"].append(key) + +def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path): + """ + 综合和处理基础信息,生成最终的基础信息字典。 + + 参数: + - knowledge_name (str): 知识名称。 + - truncate0 (str): 文件路径。 + - output_folder (str): 输出文件夹路径。 + - clause_path (str): 条款路径。 + + 返回: + - dict: 综合后的基础信息。 + """ + baseinfo_list = [] + baseinfo_file_path = 'flask_app/static/提示词/基本信息工程标.txt' + # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标.txt' + questions = read_questions_from_file(baseinfo_file_path) + res1 = multi_threading(questions, knowledge_name) + + for index, response in res1: + try: + if response and len(response) > 1: + baseinfo_list.append(clean_json_string(response[1])) + else: + print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.") + except Exception as e: + print(f"基础信息整合: Error processing response for query index {index}: {e}") + + # 判断是否分包、是否需要递交投标保证金等 + chosen_numbers, merged = judge_whether_main(truncate0, output_folder) + baseinfo_list.append(merged) + + judge_file_path = 'flask_app/static/提示词/是否相关问题.txt' + # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt' + judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) + judge_consortium = judge_consortium_bidding(baseinfo_list) # 通过招标公告判断是否接受联合体投标 + + if judge_consortium: + judge_consortium_question = ( + "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息," + "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\"" + ) + judge_questions.append(judge_consortium_question) + + file_id = upload_file(truncate0) + res2 = multi_threading(judge_questions, "", file_id, 2) # 调用千问-long + + if not res2: + print("基础信息整合: multi_threading error!") + else: + for question, response in res2: + 
def copy_file(src, dest):
    """
    Copy a single file, creating the destination directory when necessary.

    Args:
        src (str): path of the file to copy.
        dest (str): destination file path.
    """
    parent = os.path.dirname(dest)
    # A bare file name has no directory part; os.makedirs('') would raise
    # FileNotFoundError, so only create the parent when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    shutil.copy2(src, dest)
def classify_folders(source_dir, target_dir, project_index):
    """
    Classify the sub-folders of ``source_dir`` and file them under ``target_dir``.

    Sub-folders are handled in three steps:

    1. folders containing ``zb.pdf`` (case-insensitive) go to
       ``process_tender_files``, which yields the target project directory
       and the tender name;
    2. folders named ``输出文件`` go to ``process_evaluation_files``;
    3. everything else goes to ``process_remaining_folders``.

    Tender folders are processed first because the other two steps need the
    project directory they produce; ``os.listdir`` gives no ordering
    guarantee, so doing everything in a single pass could pass ``None`` as
    the project directory. Finally ``source_dir`` is removed.

    Args:
        source_dir (str): directory whose sub-folders are classified.
        target_dir (str): root directory of the classified output.
        project_index: forwarded unchanged to the processing helpers.

    Raises:
        FileNotFoundError: if no sub-folder contains ``zb.pdf``; nothing has
            been moved or deleted at that point.
    """
    temp_dir = os.path.join(source_dir, 'temp')
    os.makedirs(temp_dir, exist_ok=True)

    # Pass 1: only classify, do not touch anything yet.
    tender_dirs = []
    evaluation_dirs = []
    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue

        files = [f.lower() for f in os.listdir(subdir_path)]
        if 'zb.pdf' in files:
            tender_dirs.append(subdir_path)
        elif subdir.lower() == "输出文件":
            evaluation_dirs.append(subdir_path)

    if not tender_dirs:
        # Without a tender folder there is no project directory to file
        # anything under; fail before any destructive step.
        raise FileNotFoundError(
            f"classify_folders: no sub-folder of '{source_dir}' contains zb.pdf"
        )

    target_project_dir = None
    target_zb_name = None
    processed_dirs = set()  # directories that must be skipped in pass 3

    # Pass 2a: tender folders define the project directory.
    for subdir_path in tender_dirs:
        target_project_dir, target_zb_name = process_tender_files(
            subdir_path, temp_dir, target_dir, project_index
        )
        processed_dirs.add(subdir_path)

    # Pass 2b: evaluation folders are copied into the project directory.
    for subdir_path in evaluation_dirs:
        process_evaluation_files(subdir_path, target_project_dir)
        processed_dirs.add(subdir_path)

    # Pass 3: the remaining folders hold the bid files.
    process_remaining_folders(
        source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name
    )

    # temp_dir lives inside source_dir, so it is removed together with it.
    if os.path.exists(source_dir):
        shutil.rmtree(source_dir)
import re


def extract_answer(input_string, type):
    """
    Pull the structured fields out of a model answer.

    Args:
        input_string (str): the answer text to parse.
        type (int): ``1`` for a tender document answer
            (project name; project number; tenderee; category),
            ``2`` for a bid document answer (bidder; category).

    Returns:
        list of str: for ``type == 1`` the four stripped fields, for
        ``type == 2`` ``[bidder, category]`` where any category containing
        ``"投标函"`` is reported as ``"商务文件"``. An empty list is returned
        when the text does not match or ``type`` is neither 1 nor 2.
    """
    # Pattern 1: project name, project number, tenderee, category.
    # The \s* after each separator ignores surrounding spaces.
    pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"
    # Pattern 2: bidder, category.
    pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"

    if type == 1:
        match = re.search(pattern1, input_string)
        if not match:
            print("No match found for type 1.")
            return []
        fields = [match.group(i).strip() for i in range(1, 5)]
        print(f"1: {fields[0]} 2: {fields[1]} 3: {fields[2]} 4: {fields[3]}")
        return fields

    if type == 2:
        match = re.search(pattern2, input_string)
        if not match:
            print("No match found for type 2.")
            return []
        # A bid letter counts as part of the commercial documents.
        part = match.group(2).strip()
        if "投标函" in part:
            part = "商务文件"
        return [match.group(1).strip(), part]

    # Unsupported type: return an empty list like every other failure path
    # instead of falling through with None.
    return []
+ msgs = Messages.list(thread.id) + # print(msgs) + # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) + + ans = [] + + print("运行结果:") + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + print("content: ", message['content'][0]['text']['value']) + print("\n") + deleteFileFromKnowledge(index,documents) + return ans + + +def file_parse(filepath, knowledge_index, type): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + # 创建一个字典来映射index值到知识库名 + index_to_name = { + 0: "文件分类知识库0", + 1: "文件分类知识库1", + 2: "文件分类知识库2", + 3: "文件分类知识库3" + } + # 使用get方法获取对应的知识库名,如果index不存在,返回"默认文件分类知识库" + cloud_index_name = index_to_name.get(knowledge_index, "0") + + index = DashScopeCloudIndex(cloud_index_name) + index._insert(documents) + retriever = index.as_retriever() + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-plus', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + + } + } + } + }] + ) + questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ + 货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,服物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ + 请按下列键值对格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" + questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" + if (type == 1): + questions = questions1 + elif (type == 2): + questions = questions2 + 
return send_message(assistant,index,documents, message=questions) + +def deleteFileFromKnowledge(index,documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + for document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + index.delete_ref_doc(file_ids) + +import concurrent.futures +import logging + +# 配置日志,只输出到控制台 +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index): + logging.info(f"Starting to process directory: {subdir_path} with index {project_index}") + + # 为每个子目录创建一个专用的临时目录 + intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}") + os.makedirs(intermediate_directory, exist_ok=True) + logging.info(f"Created intermediate directory: {intermediate_directory}") + + # 这里可以添加具体的目录处理函数,例如: + process_pdf_folders(subdir_path, intermediate_directory) + classify_folders(intermediate_directory, final_directory, project_index % 4) + + # 处理完毕后清理该目录 + shutil.rmtree(intermediate_directory) + logging.info(f"Deleted intermediate directory: {intermediate_directory}") + +def main(base_directory, base_intermediate_directory, final_directory): + os.makedirs(final_directory, exist_ok=True) + logging.info(f"Final directory ensured at: {final_directory}") + + project_index = 0 + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for subdir in os.listdir(base_directory): + subdir_path = os.path.join(base_directory, subdir) + if os.path.isdir(subdir_path): + logging.info(f"Submitting job for directory: {subdir_path}") + future = executor.submit(process_directory, subdir_path, base_intermediate_directory, final_directory, project_index) + 
futures.append(future) + project_index += 1 + + for future in concurrent.futures.as_completed(futures): + try: + future.result() # 如果执行没有抛出异常,完成该任务 + except Exception as e: + logging.error(f"Thread resulted in an error: {e}") + +if __name__ == "__main__": + base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' + base_intermediate_directory = 'D:\\tmp' + final_directory = 'D:\\output' + main(base_directory, base_intermediate_directory, final_directory) diff --git a/flask_app/old_version/废标项_old.py b/flask_app/old_version/废标项_old.py new file mode 100644 index 0000000..77bec71 --- /dev/null +++ b/flask_app/old_version/废标项_old.py @@ -0,0 +1,66 @@ +import fitz # PyMuPDF +import re + +def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords): + """ + 提取PDF文档中包含特定关键词的段落,包括正文和表格。如果段落中包含特定的后续关键词,也提取其后的段落,直到遇到下一个相似的序列编号。 + :param pdf_path: PDF文件的路径。 + :param keywords: 包含关键词的列表。 + :param follow_up_keywords: 触发连续提取的关键词列表。 + :return: 包含关键词的段落列表。 + """ + doc = fitz.open(pdf_path) + extracted_paragraphs = [] + continue_collecting = False + current_section_pattern = None + + for page in doc: + text_blocks = page.get_text("blocks") + for index, block in enumerate(text_blocks): + text = block[4].strip() # Text content of the block + if text == "": # Skip empty lines + continue + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + else: + extracted_paragraphs.append(text) + + if any(keyword in text for keyword in keywords): + extracted_paragraphs.append(text) + if any(follow_up in text for follow_up in follow_up_keywords): + continue_collecting = True + section_number = re.match(r'(\d+(\.\d+)*)', text) + if section_number: + current_section_number = section_number.group(1) + base_section_number = current_section_number.rsplit('.', 1)[0] + current_section_pattern = re.compile(rf'^{re.escape(base_section_number)}\.\d+\b') + else: + found_next_number = False + for next_index in range(index 
+ 1, len(text_blocks)): + next_text = text_blocks[next_index][4].strip() + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text) + if next_section_number: + found_next_number = True + current_section_pattern = re.compile(rf'^{next_section_number.group(1)}\b') + elif found_next_number: + break + doc.close() + return extracted_paragraphs + +# Example usage +doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx' +keywords = ['否决', '无效投标', '被拒绝', '予以拒绝'] +follow_up_keywords = ['情形之一'] +extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords) + +# Writing to file and handling duplicates +output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt" +with open(output_file, 'w', encoding='utf-8') as file: + for content in extracted_contents: + file.write(content + '\n') + +# file_id = upload_file(output_file) +# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。" +# res=qianwen_long(file_id,user_query) +# print("Query Result:", res) \ No newline at end of file diff --git a/flask_app/old_version/招标文件解析_old.py b/flask_app/old_version/招标文件解析_old.py new file mode 100644 index 0000000..7bcf816 --- /dev/null +++ b/flask_app/old_version/招标文件解析_old.py @@ -0,0 +1,305 @@ +# -*- encoding:utf-8 -*- +import json +import logging +import time +from concurrent.futures import ThreadPoolExecutor +from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.general.table_content_extraction import extract_tables_main +from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge +from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.general.json_utils import transform_json_values +from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid +from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +import concurrent.futures +from flask_app.old_version.基础信息整合_old import 
combine_basic_info +from flask_app.old_version.资格审查模块old_old import combine_review_standards +from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards +from flask_app.general.format_change import pdf2docx, docx2pdf +from flask_app.general.docx截取docx import copy_docx + +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + +logger=None + +# 创建全局线程池 +executor = ThreadPoolExecutor() +def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): + logger.info("starting 文件预处理...") + logger.info("output_folder..." + output_folder) + + # 根据文件类型处理文件路径 + if file_type == 1: # docx + docx_path = downloaded_file_path + pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + elif file_type == 2: # pdf + pdf_path = downloaded_file_path + docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 + else: + logger.error("Unsupported file type provided. Preprocessing halted.") + return None + + # # 异步上传知识库 + future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id) + + # 调用截取PDF多次 + truncate_files = truncate_pdf_multiple(pdf_path, output_folder) + + # 处理各个部分 + truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx + invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 + truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json + truncate0 = truncate_files[0] + truncate1 = truncate_files[1] + truncate3 = truncate_files[3] + merged_baseinfo_path=truncate_files[-1] + clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json + + logger.info("文件预处理done") + + # 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象 + return { + 'file_path': downloaded_file_path, + 'output_folder': output_folder, + 'truncate0': truncate0, + 'truncate1': truncate1, + 'truncate3': truncate3, + 'knowledge_future': future_knowledge, # 返回 Future 对象 + 'truncate0_jsonpath': truncate_jsonpath, + 
'merged_baseinfo_path':merged_baseinfo_path, + 'clause_path': clause_path, + 'invalid_docpath': invalid_docpath + } + + +def post_processing(data,includes): + # 初始化结果字典,预设'其他'分类为空字典 + result = {"其他": {}} + + # 遍历原始字典的每一个键值对 + for key, value in data.items(): + if key in includes: + # 如果键在includes列表中,直接保留这个键值对 + result[key] = value + else: + # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 + result["其他"][key] = value + + # 如果'其他'分类没有任何内容,可以选择删除这个键 + if not result["其他"]: + del result["其他"] + + return result +# 基本信息 +def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表 + logger.info("starting基础信息...") + basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path) + logger.info("基础信息done") + return basic_res + + +# 形式、响应、资格评审 +def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder): + logger.info("starting资格审查...") + review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, + clause_path,input_file,output_folder) + logger.info("资格审查done") + return review_standards_res + + +# 评分细则 流式 +def fetch_evaluation_standards(truncate1): # 评标办法前附表 + logger.info("starting 商务标和技术标...") + + # 获取评标办法前附表的字典结果 + evaluation_standards_res = combine_evaluation_standards(truncate1) + + # 获取技术标和商务标 + technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} + commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} + + logger.info("商务标和技术标 done") + + # 返回将 "技术标" 和 "商务标" 包含在新的键中 + return { + "technical_standards": technical_standards, + "commercial_standards": commercial_standards + } + +# def fetch_evaluation_standards(truncate1): # 评标办法前附表 +# logger.info("starting商务标技术标...") +# evaluation_standards_res = combine_evaluation_standards(truncate1) +# logger.info("商务标技术标done") +# return evaluation_standards_res + + +# 无效、废标项解析 +def fetch_invalid_requirements(invalid_docpath, output_folder, 
truncate0_jsonpath, clause_path, truncate3): + # 废标项要求:千问 + logger.info("starting无效标与废标...") + find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) + logger.info("无效标与废标done...") + return find_invalid_res + + +# 投标文件要求 +def fetch_bidding_documents_requirements(clause_path): + logger.info("starting投标文件要求...") + fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + logger.info("投标文件要求done...") + return {"投标文件要求":fetch_bidding_documents_requirements_json} + +# 开评定标流程 +def fetch_bid_opening(clause_path): + logger.info("starting开评定标流程...") + fetch_bid_opening_json = extract_from_notice(clause_path, 2) + logger.info("开评定标流程done...") + return {"开评定标流程":fetch_bid_opening_json} + +# def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf +# global logger +# logger = get_global_logger(unique_id) +# # Preprocess files and get necessary data paths and knowledge index +# processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) +# if not processed_data: +# return "" +# +# with concurrent.futures.ThreadPoolExecutor() as executor: +# # Submit all tasks to the executor +# futures = { +# 'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'], +# processed_data['truncate0'], output_folder, +# processed_data['clause_path']), +# 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], +# processed_data['truncate3'], +# processed_data['knowledge_name'], processed_data['truncate0_jsonpath'], +# processed_data['clause_path'],processed_data['file_path'],processed_data['output_folder']), +# 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), +# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], +# output_folder, 
processed_data['truncate0_jsonpath'], +# processed_data['clause_path'], processed_data['truncate3']), +# 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, +# processed_data['clause_path']), +# 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) +# } +# +# comprehensive_responses = [] +# # Collect results in the defined order +# for key in ['base_info', 'qualification_review', 'evaluation_standards', 'invalid_requirements', +# 'bidding_documents_requirements', 'opening_bid']: +# try: +# # Wait for the future to complete and get the result +# result = futures[key].result() +# comprehensive_responses.append(result) +# except Exception as exc: +# logger.error(f"Error processing {key}: {exc}") +# # 合并 JSON 结果 +# combined_final_result = combine_json_results(comprehensive_responses) +# includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"] +# result = post_processing(combined_final_result, includes) +# modified_json = transform_json_values(result) +# +# final_result_path = os.path.join(output_folder, "final_result.json") +# with open(final_result_path, 'w', encoding='utf-8') as file: +# json.dump(modified_json, file, ensure_ascii=False, indent=2) +# logger.info("final_result.json has been saved") +# deleteKnowledge(processed_data['knowledge_index']) +# return final_result_path + + +#分段返回 +def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): + global logger + logger = get_global_logger(unique_id) + + # 预处理文件,获取处理后的数据 + processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) + if not processed_data: + yield json.dumps({}) # 如果处理数据失败,返回空的 JSON + + with concurrent.futures.ThreadPoolExecutor() as executor: + # 立即启动不依赖 knowledge_name 和 index 的任务 + futures = { + 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), + 'invalid_requirements': executor.submit(fetch_invalid_requirements, 
processed_data['invalid_docpath'], + output_folder, processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['truncate3']), + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) + } + + # 提前处理这些不依赖的任务,按完成顺序返回 + for future in concurrent.futures.as_completed(futures.values()): + key = next(k for k, v in futures.items() if v == future) + try: + result = future.result() + + # 如果是 evaluation_standards,拆分技术标和商务标 + if key == 'evaluation_standards': + technical_standards = result["technical_standards"] + commercial_standards = result["commercial_standards"] + + # 分别返回技术标和商务标 + yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False) + yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False) + + else: + # 处理其他任务的结果 + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + + # 只有在需要 knowledge_name 和 index 时才等待 future_knowledge 完成 + try: + knowledge_name = "招标解析" + unique_id + index = processed_data['knowledge_future'].result() # 阻塞等待知识库上传任务完成 + + # 提交依赖 knowledge_name 和 index 的任务 + future_dependencies = { + 'base_info': executor.submit(fetch_project_basic_info, knowledge_name, processed_data['truncate0'], + output_folder, processed_data['clause_path']), + 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], + processed_data['truncate3'], knowledge_name, + processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['file_path'], + processed_data['output_folder']), + } + + # 按完成顺序返回依赖任务的结果 + for future in 
concurrent.futures.as_completed(future_dependencies.values()): + key = next(k for k, v in future_dependencies.items() if v == future) + try: + result = future.result() + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + + except Exception as e: + logger.error(f"Error uploading to knowledge base: {e}") + yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False) + + # 删除知识索引 + deleteKnowledge(index) + +if __name__ == "__main__": + output_folder = "flask_app/static/output/zytest1" + # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf") + # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf") + # clause_path = convert_clause_to_json(truncate1, output_folder) + # truncate0_jsonpath = os.path.join(output_folder, "truncate_output3.json") + + start_time = time.time() + file_type = 1 #1:docx 2:pdf 3:其他 + input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf" + # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11") + + preprocess_files(output_folder, input_file, file_type, "zytest1") + end_time = time.time() + elapsed_time = end_time - start_time # 计算耗时 + print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/old_version/文件分类普通版_old.py b/flask_app/old_version/文件分类普通版_old.py new file mode 100644 index 0000000..dc333ad --- /dev/null +++ b/flask_app/old_version/文件分类普通版_old.py @@ -0,0 +1,350 @@ +# -*- coding: utf-8 -*- +import os +import shutil +from PyPDF2 import PdfReader, PdfWriter + + +def validate_pdf(file_path): + """ 验证PDF文件是否损坏 """ + try: + with open(file_path, "rb") as file: + pdf = PdfReader(file) + return len(pdf.pages) > 0 + except Exception as e: + print(f"Error reading PDF {file_path}: {str(e)}") + return False + + +def 
truncate_pdf(source_path, target_path, max_pages=15): + """截取PDF文件的前15页并保存""" + try: + with open(source_path, "rb") as infile: + reader = PdfReader(infile) + writer = PdfWriter() + for i in range(min(max_pages, len(reader.pages))): + writer.add_page(reader.pages[i]) + with open(target_path, "wb") as outfile: + writer.write(outfile) + except Exception as e: + print(f"Error processing PDF {source_path}: {str(e)}") + +def copy_file(src, dest): + """ 复制单个文件 """ + os.makedirs(os.path.dirname(dest), exist_ok=True) + shutil.copy2(src, dest) + +def copy_directory(source, destination): + """复制整个目录""" + os.makedirs(destination, exist_ok=True) + for item in os.listdir(source): + src_path = os.path.join(source, item) + dest_path = os.path.join(destination, item) + if os.path.isfile(src_path): + copy_file(src_path, dest_path) + else: + copy_directory(src_path, dest_path) + +def unique_file_name(base_path, file_name): + counter = 1 + name_part, extension = os.path.splitext(file_name) + new_file_name = file_name + # 检查文件是否存在,若存在则修改文件名 + while os.path.exists(os.path.join(base_path, new_file_name)): + new_file_name = f"{name_part}_{counter}{extension}" + counter += 1 + return new_file_name + +def process_pdf_folders(source_dir, target_dir): + """ 处理源目录中的PDF文件,并基于条件选择目标文件夹 """ + for root, dirs, files in os.walk(source_dir, topdown=False): + for file in files: + if file.lower().endswith('.pdf'): + source_file_path = os.path.join(root, file) + if validate_pdf(source_file_path): + relative_path = os.path.relpath(root, source_dir) + target_file_dir = os.path.join(target_dir, relative_path) + target_file_path = os.path.join(target_file_dir, file) + copy_file(source_file_path, target_file_path) + else: + print(f"Deleted corrupt file: {source_file_path}") + # 清除空目录 + if not os.listdir(root): + os.rmdir(root) + + +def classify_folders(source_dir, target_dir, project_index): + """Classifies folders and processes files based on specific criteria.""" + temp_dir = os.path.join(source_dir, 
'temp') + os.makedirs(temp_dir, exist_ok=True) + + target_project_dir = None + processed_dirs = set() # Set to track processed directories + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path): + continue + + files = [f.lower() for f in os.listdir(subdir_path)] + if 'zb.pdf' in files: + target_project_dir, project_index ,target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index) + processed_dirs.add(subdir_path) # Mark this directory as processed + elif subdir.lower() == "输出文件": + process_evaluation_files(subdir_path, target_project_dir) + processed_dirs.add(subdir_path) # Mark this directory as processed + + # Process remaining folders, skipping already processed ones + process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name) + #删除tmp目录 + # if os.path.exists(temp_dir): + # shutil.rmtree(temp_dir) + if os.path.exists(source_dir): + shutil.rmtree(source_dir) + +def process_tender_files(subdir_path, temp_dir, target_dir, project_index): + """Processes tender files and returns the target project directory and updated index.""" + zb_path = os.path.join(subdir_path, "zb.pdf") + truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf") + truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages + bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1) + project_index += 1 + zb_response = extract_answer(bot_response[1], 1) + zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠 + target_zb_name, category = zb_response[2] + ".pdf", zb_response[3] + + new_zb_path = os.path.join(subdir_path, target_zb_name) + os.rename(zb_path, new_zb_path) + + target_category_dir = os.path.join(target_dir, category) + target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1]) + copy_directory(subdir_path, 
os.path.join(target_project_dir, "招标文件夹")) + + os.remove(truncated_zb_path) + shutil.rmtree(subdir_path) + return target_project_dir, project_index,zb_response[2] + + +def process_evaluation_files(subdir_path, target_project_dir): + """Processes evaluation folders.""" + copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹")) + shutil.rmtree(subdir_path) + + +def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name): + """Processes remaining folders containing bid files.""" + target_tb_dir = os.path.join(target_project_dir, "投标文件夹") + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path) or subdir_path in processed_dirs: + continue # Skip processed directories + + target_tbcom_dir = None # Initialize outside the file loop + + for item in os.listdir(subdir_path): + item_src_path = os.path.join(subdir_path, item) + new_name = "truncate_" + item + truncated_tb_path = os.path.join(temp_dir, new_name) + truncate_pdf(item_src_path, truncated_tb_path) + bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2) + project_index += 1 + tb_response = extract_answer(bot_response[1], 2) + if not tb_response: + continue # If there's no response, skip this file + + # Initialize target_tbcom_dir only once based on the first processed file + if not target_tbcom_dir: + target_tb_name, _ = tb_response + if(target_tb_name != target_zb_name and target_tb_name != "未知"): + target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name) + print(target_tbcom_dir) + os.makedirs(target_tbcom_dir, exist_ok=True) + + tb_section = tb_response[1] + ".pdf" + new_tb_path = os.path.join(subdir_path, tb_section) + # 获取唯一文件名 + new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section)) + os.rename(item_src_path, new_tb_path) + + # Remove temporary truncated file + os.remove(truncated_tb_path) + + # Copy the whole 
directory at once after all files have been processed and renamed + if target_tbcom_dir: + copy_directory(subdir_path, target_tbcom_dir) + shutil.rmtree(subdir_path) # Optionally remove the original directory after copying + + + +import re + + +import re + +def extract_answer(input_string, type): + # 使用正则表达式匹配所需的信息 + # 第一种模式:项目名称、项目编号、招标人、类别 + # 在每个字段值后都添加了\s*以忽略尾随的空格 + pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + # 第二种模式:投标人、类别 + pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + + if type == 1: + match = re.search(pattern1, input_string) + if match: + print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}") + return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()] + else: + print("No match found for type 1.") + return [] + elif type == 2: + match = re.search(pattern2, input_string) + if match: + # 检查是否包含“投标函”,如果是则替换部分内容为“商务文件” + part = match.group(2).strip() + if "投标函" in part: + part = "商务文件" + return [match.group(1).strip(), part] + else: + print("No match found for type 2.") + return [] + + +from llama_index.readers.dashscope.base import DashScopeParse +from llama_index.readers.dashscope.utils import ResultType +from llama_index.indices.managed.dashscope import DashScopeCloudIndex +from dashscope import Assistants, Messages, Runs, Threads + + +def send_message(assistant, index,documents,message='百炼是什么?'): + print(f"Query: {message}") + + # create thread. + # create a thread. + thread = Threads.create() + + print(thread) + + # create a message. 
+ message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + print("run:" + str(run)) + + # # get run statue + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + # print(run_status) + + # if prompt input tool result, submit tool result. + + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + # verify_status_code(run_status) + + # get the thread messages. + msgs = Messages.list(thread.id) + # print(msgs) + # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) + + ans = [] + + print("运行结果:") + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + print("content: ", message['content'][0]['text']['value']) + print("\n") + deleteFileFromKnowledge(index,documents) + return ans + + +def file_parse(filepath, knowledge_index, type): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + index = DashScopeCloudIndex("文件分类临时知识库") + index._insert(documents) + retriever = index.as_retriever() + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + + } + } + } + }] + ) + questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ + 
货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ + 请按下列格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" + questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" + if (type == 1): + questions = questions1 + elif (type == 2): + questions = questions2 + return send_message(assistant,index,documents, message=questions) + +def deleteFileFromKnowledge(index,documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + for document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + index.delete_ref_doc(file_ids) + +def process_directory(base_directory, intermediate_directory, final_directory): + # 遍历base_directory下的所有子目录 + for subdir in os.listdir(base_directory): + subdir_path = os.path.join(base_directory, subdir) + # 确保是目录 + if os.path.isdir(subdir_path): + # 处理每个项目的目录 + process_pdf_folders(subdir_path, intermediate_directory) + classify_folders(intermediate_directory, final_directory, 0) + + # 清理临时目录以备下一个项目使用 + if os.path.exists(intermediate_directory): + shutil.rmtree(intermediate_directory) + +def main(base_directory, intermediate_directory, final_directory): + # 确保中间目录和最终目录存在 + os.makedirs(intermediate_directory, exist_ok=True) + os.makedirs(final_directory, exist_ok=True) + process_directory(base_directory, intermediate_directory, final_directory) + + +if __name__ == "__main__": + base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' + intermediate_directory = 'D:\\tmp' + final_directory = 'D:\\output' + main(base_directory, intermediate_directory, final_directory) #处理多级目录 + # process_pdf_folders(base_directory, intermediate_directory) 
##处理单级目录 + # classify_folders(intermediate_directory, final_directory, 0) diff --git a/flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py b/flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py new file mode 100644 index 0000000..fa48cdd --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# This file is auto-generated, don't edit it. Thanks. +import os +from alibabacloud_bailian20231229.client import Client as bailian20231229Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_bailian20231229 import models as bailian_20231229_models +from alibabacloud_tea_util import models as util_models +from alibabacloud_tea_util.client import Client as UtilClient + +def create_client() -> bailian20231229Client: + """ + 使用AK&SK初始化账号Client + @return: Client + @throws Exception + """ + config = open_api_models.Config( + access_key_id=os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID'], + access_key_secret=os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET'] + ) + config.endpoint = 'bailian.cn-beijing.aliyuncs.com' + return bailian20231229Client(config) + +def delete_index(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: + delete_index_request = bailian_20231229_models.DeleteIndexRequest( + index_id=index_id + ) + runtime = util_models.RuntimeOptions() + headers = {} + try: + response = client.delete_index_with_options(workspace_id, delete_index_request, headers, runtime) + print("routes Response:", response) + except Exception as error: + print(error.message) + print(error.data.get("Recommend")) + UtilClient.assert_as_string(error.message) + +async def delete_index_async(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: + delete_index_request = bailian_20231229_models.DeleteIndexRequest( + index_id=index_id + ) + runtime = util_models.RuntimeOptions() + headers = {} + try: + response = await client.delete_index_with_options_async(workspace_id, 
delete_index_request, headers, runtime) + print("routes Response:", response) + except Exception as error: + print(error.message) + print(error.data.get("Recommend")) + UtilClient.assert_as_string(error.message) + +if __name__ == '__main__': + workspace_id = os.environ['DASHSCOPE_WORKSPACE_ID'] + index_id = 'pg5rrsv26x' + client = create_client() + delete_index(client, workspace_id, index_id) diff --git a/flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py b/flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py new file mode 100644 index 0000000..f0c1b2d --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py @@ -0,0 +1,58 @@ +import os +import uuid +from llama_index.readers.dashscope.base import DashScopeParse +from llama_index.readers.dashscope.utils import ResultType +from llama_index.indices.managed.dashscope import DashScopeCloudIndex +from flask_app.old_version.文档理解大模型版知识库处理.删除知识库_old import delete_index, create_client + + +def addfileToKnowledge(filepath,knowledge_name): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + index = DashScopeCloudIndex.from_documents( + documents, + knowledge_name, + verbose=True, + ) + print("knowledge created successfully!!!") + # index = DashScopeCloudIndex(knowledge_name) + # index._insert(documents) + # return index, documents + return index + +def deleteKnowledge(index): + retriever = index.as_retriever() + index_id = str(retriever.pipeline_id) + workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID') + client = create_client() + delete_index(client,workspace_id,index_id) + print("knowledge old_version successfully!!!") + + + +def deleteFileFromKnowledge(index, documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + for document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if 
file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + print("old_version successfully") + index.delete_ref_doc(file_ids) + + +# 示例用法 +if __name__ == "__main__": + filepath = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf" + unique_id = str(uuid.uuid4()) + knowledge_name="招标解析"+unique_id + # index = addfileToKnowledge(filepath,knowledge_name) + index = DashScopeCloudIndex("招标解析e8cc45f4-cd41-47cf-a5e6-2b7885debfff") + # 删除文件 + # deleteFileFromKnowledge(index, document) + deleteKnowledge(index) diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py new file mode 100644 index 0000000..6a5547e --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py @@ -0,0 +1,33 @@ +from typing import List +from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models +from alibabacloud_tea_util.client import Client as UtilClient +from alibabacloud_credentials.client import Client as CredClient + +def query(): + # 使用默认凭证初始化Credentials Client。 + cred=CredClient() + config = open_api_models.Config( + # 通过credentials获取配置中的AccessKey ID + cred.get_credential().access_key_id, + # 通过credentials获取配置中的AccessKey Secret + cred.get_credential().access_key_secret, + ) + # 访问的域名 + config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' + client = docmind_api20220711Client(config) + request = docmind_api20220711_models.QueryDocParserStatusRequest( + # id : 任务提交接口返回的id + id='docmind-20241008-24363df30d274863894f037dbb7244e8' + ) + try: + # 复制代码运行请自行打印 routes 的返回值 + response = client.query_doc_parser_status(request) + # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。获取属性值均以小写开头 + # 获取返回结果。建议先把response.body.data转成json,然后再从json里面取具体需要的值。 + print(response.body.data.status) + except Exception as error: + # 如有需要,请打印 error + 
UtilClient.assert_as_string(error.message) +query() \ No newline at end of file diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py new file mode 100644 index 0000000..2088e74 --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py @@ -0,0 +1,40 @@ +from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models +from alibabacloud_tea_util.client import Client as UtilClient +from alibabacloud_tea_util import models as util_models +from alibabacloud_credentials.client import Client as CredClient + +def submit_file(): + # 使用默认凭证初始化Credentials Client。 + cred=CredClient() + config = open_api_models.Config( + # 通过credentials获取配置中的AccessKey ID + # access_key_id=cred.get_access_key_id(), + cred.get_credential().access_key_id, + # 通过credentials获取配置中的AccessKey Secret + # access_key_secret=cred.get_access_key_secret() + cred.get_credential().access_key_secret, + ) + # 访问的域名 + config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' + client = docmind_api20220711Client(config) + request = docmind_api20220711_models.SubmitDocParserJobAdvanceRequest( + # file_url_object : 本地文件流 + file_url_object=open("zbtest4.pdf", "rb"), + # file_name :文件名称。名称必须包含文件类型 + file_name='zbtest4.pdf' + # file_name_extension : 文件后缀格式。与文件名二选一 + # file_name_extension='pdf' + ) + runtime = util_models.RuntimeOptions() + try: + # 复制代码运行请自行打印 routes 的返回值 + response = client.submit_doc_parser_job_advance(request, runtime) + # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。如下示例为打印返回的业务id格式 + # 获取属性值均以小写开头, + print(response.body.data.id) + except Exception as error: + # 如有需要,请打印 error + UtilClient.assert_as_string(error.message) +submit_file() \ No newline at end of file diff --git a/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py 
b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py new file mode 100644 index 0000000..21b85f8 --- /dev/null +++ b/flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py @@ -0,0 +1,35 @@ +from typing import List +from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models +from alibabacloud_tea_util.client import Client as UtilClient +from alibabacloud_credentials.client import Client as CredClient + +def query(): + # 使用默认凭证初始化Credentials Client。 + cred=CredClient() + config = open_api_models.Config( + # 通过credentials获取配置中的AccessKey ID + cred.get_credential().access_key_id, + # 通过credentials获取配置中的AccessKey Secret + cred.get_credential().access_key_secret, + ) + # 访问的域名 + config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com' + client = docmind_api20220711Client(config) + request = docmind_api20220711_models.GetDocParserResultRequest( + # id : 任务提交接口返回的id + id='docmind-20241008-24363df30d274863894f037dbb7244e8', + layout_step_size=10, + layout_num=0 + ) + try: + # 复制代码运行请自行打印 routes 的返回值 + response = client.get_doc_parser_result(request) + # API返回值格式层级为 body -> data -> 具体属性。可根据业务需要打印相应的结果。获取属性值均以小写开头 + # 获取返回结果。建议先把response.body.data转成json,然后再从json里面取具体需要的值。 + print(response.body.data) + except Exception as error: + # 如有需要,请打印 error + UtilClient.assert_as_string(error.message) +query() \ No newline at end of file diff --git a/flask_app/old_version/无效标和废标和禁止投标整合_old.py b/flask_app/old_version/无效标和废标和禁止投标整合_old.py new file mode 100644 index 0000000..67bf587 --- /dev/null +++ b/flask_app/old_version/无效标和废标和禁止投标整合_old.py @@ -0,0 +1,440 @@ +# -*- coding: utf-8 -*- +import json +import os.path +import time +import re + +from flask_app.general.format_change import pdf2docx +from flask_app.general.通义千问long import upload_file, qianwen_long +from concurrent.futures import ThreadPoolExecutor + 
+from flask_app.general.table_content_extraction import extract_tables_main +from flask_app.old_version.不得存在及禁止投标情形_old import find_forbidden, process_string_list + + +#处理跨页的段落 +def preprocess_paragraphs(paragraphs): + processed = [] # 初始化处理后的段落列表 + index = 0 + while index < len(paragraphs): + current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 + + # 检查当前段落是否为空 + if current_text == '': + # 确保有前一个和后一个段落 + if 0 < index < len(paragraphs) - 1: + prev_text = paragraphs[index - 1].text.strip() # 获取前一个段落的文本 + next_text = paragraphs[index + 1].text.strip() # 获取后一个段落的文本 + + # 检查前一个段落的文本是否不以标点符号结尾 + if not prev_text.endswith((',', ',', '。', '!', '?')): + # 定义列表项的模式 + list_item_pattern = r'^\s*([($]\d+[)$]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)' + # 检查前一个段落是否以列表模式开头,且后一个段落不以列表模式开头 + # is_prev_list = re.match(list_item_pattern, prev_text) + is_next_list = re.match(list_item_pattern, next_text) + if not is_next_list and len(prev_text) > 30: + # 合并前一个和后一个段落的文本 + merged_text = prev_text + next_text # 为了可读性添加空格 + if processed: + processed[-1] = merged_text # 更新处理后的最后一个段落 + else: + processed.append(merged_text) # 如果列表为空,直接添加合并后的文本 + + # 跳过下一个段落,因为它已经被合并 + index += 2 + continue + else: + # 非空段落,添加到处理后的列表中 + processed.append(current_text) + index += 1 + + return processed + +#如果当前段落有序号,则向下匹配直接遇到相同的序号样式 +#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 + +def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): + from collections import OrderedDict + from docx import Document + import re + + if isinstance(keywords, str): + keywords = [keywords] + + doc = Document(doc_path) + extracted_paragraphs = OrderedDict() + continue_collecting = False + current_section_pattern = None + active_key = None + + def match_keywords(text, patterns): + return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) + + def extract_from_text(text, current_index): + nonlocal continue_collecting, current_section_pattern, active_key + if text == "": + return current_index + + if 
continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + active_key = None + else: + if active_key is not None: + extracted_paragraphs[active_key].append(text) + return current_index + + if match_keywords(text, keywords): + active_key = text + extracted_paragraphs[active_key] = [text] + if match_keywords(text, follow_up_keywords): + continue_collecting = True + section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text) # 修改后的正则,支持 '数字 、' 和 '数字.' + if section_number: + current_section_number = section_number.group(1) + level_count = current_section_number.count('.') + + # Pattern to match current level, e.g., 3.4.5 + pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+' + # Generate patterns for next section at same level and parent level + parts = current_section_number.split('.') + matched_patterns = [pattern] # start with the full pattern + + # Next section at same level + parts[-1] = str(int(parts[-1]) + 1) + next_pattern = r'^' + r'\s*\.\s*'.join(parts) + matched_patterns.append(next_pattern) + + # Parent section (if applicable) + if len(parts) > 1: + parent_section_parts = parts[:-1] + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) + parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) + matched_patterns.append(parent_pattern) + + # Combine the patterns + combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' + current_section_pattern = re.compile(combined_pattern) + + else: + found_next_number = False + current_section_pattern = None + + while current_index < len(doc.paragraphs) - 1: + current_index += 1 + next_text = doc.paragraphs[current_index].text.strip() + if not found_next_number: + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) + if next_section_number: + found_next_number = True + if next_section_number.group(1): + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' 
+ r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + elif next_section_number.group(2): + dynamic_pattern = r'^[\(\(]\d+[\)\)]' + current_section_pattern = re.compile(dynamic_pattern) + if current_section_pattern and re.match(current_section_pattern, next_text): + extracted_paragraphs[active_key].append(next_text) + else: + continue_collecting = False + active_key=None + break + + return current_index + + processed_paragraphs = preprocess_paragraphs(doc.paragraphs) + index = 0 + while index < len(processed_paragraphs): + index = extract_from_text(processed_paragraphs[index].strip(), index) + index += 1 + + return extracted_paragraphs + +def preprocess_text_list(text_list): + new_text_list = [] + # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 + split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') + for text in text_list: + # 使用正则表达式检查并拆分元素 + parts = split_pattern.split(text) + new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 + + return new_text_list + +def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 + all_texts1 = [] + all_texts2=[] + # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 + split_pattern = r'(?<=[。!?\!\?])' + + for key, text_list in extracted_contents.items(): + if len(text_list) == 1: + for data in text_list: + # 检查是否包含任何需要排除的字符串 + if any(exclude in data for exclude in excludes): + continue # 如果包含任何排除字符串,跳过这个数据 + # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + data = re.sub(pattern, '', data).strip() + keyword_match = re.search(keywords, data) + if keyword_match: + # 从关键词位置开始查找结束标点符号 + start_pos = keyword_match.start() + # 截取从关键词开始到后面的内容 + substring = data[start_pos:] + # 按定义的结束标点分割 + sentences = re.split(split_pattern, substring, 1) + if len(sentences) > 0 and sentences[0]: + # 只取第一句,保留标点 + cleaned_text = data[:start_pos] + sentences[0] 
#eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。 + # 经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。 + else: + cleaned_text = data # 如果没有标点,使用整个字符串 + else: + # 如果没有找到关键词,保留原文本 + cleaned_text = data + # 删除空格 + cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '') + # 如果长度大于8,则添加到结果列表 + if len(cleaned_text_no_spaces) > 8: + all_texts1.append(cleaned_text_no_spaces) + + else: + # print(text_list) + new_text_list=preprocess_text_list(text_list) + # print(new_text_list) + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 + data = re.sub(pattern, '', new_text_list[0]).replace(' ','').strip() + # 将修改后的第一个元素和剩余的元素连接起来 + new_text_list[0] = data # 更新列表中的第一个元素 + joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 + # 删除空格 + joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '') + all_texts2.append(joined_text_no_spaces) # 将每个列表的内容添加到 all_texts 中 + + return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 +def find_sentences_with_keywords(data, keywords, follow_up_keywords): + """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" + sentences1 = [] # 保存没有后续关键词的情况 + sentences2 = [] # 保存有后续关键词的情况 + + if isinstance(data, dict): + for value in data.values(): + result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, list): + for item in data: + result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, str): + # 分割句子,保证句子完整性(按标点符号和序号分割) + # split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割 + split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data) + i = 0 + while i < len(split_sentences): + sentence = 
split_sentences[i].strip() + if re.search(keywords, sentence, re.IGNORECASE): + follow_up_present = any( + re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) + if follow_up_present: + # 如果存在后续关键词,则从当前位置开始截取 + start_index = i + end_index = start_index + found_next_section = False + for j in range(start_index + 1, len(split_sentences)): + if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): + end_index = j + found_next_section = True + break + + if found_next_section: + full_text = ' '.join(split_sentences[start_index:end_index]).strip() + else: + full_text = ' '.join(split_sentences[start_index:]).strip() + # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data=re.sub(pattern,'',full_text) + data = re.sub(pattern, '', full_text).replace(' ','').strip() + sentences2.append(data) # 存储有后续关键词的情况 + i = end_index if found_next_section else len(split_sentences) + else: + # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data = re.sub(pattern, '', sentence).replace('\n','').strip() + cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ','').strip() + if len(cleaned_sentence) > 8: + sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况 + i += 1 + else: + i += 1 + + return sentences1, sentences2 # 返回两个列表 + +def extract_sentences_from_json(json_path, keywords,follow_up_keywords): + if not json_path: + return [],[] + with open(json_path, 'r', encoding='utf-8') as file: + data = json.load(file) + """从JSON数据中提取包含关键词的句子。""" + return find_sentences_with_keywords(data, keywords,follow_up_keywords) + +#处理无效投标 +def extract_values_if_contains(data, includes): + """ + 递归检查字典中的值是否包含列表 'includes' 中的内容。 + 如果包含,将这些值添加到一个列表中并返回。 + + 参数: + data (dict): 字典或从 JSON 解析得到的数据。 + includes (list): 
包含要检查的关键词的列表。 + + 返回: + list: 包含满足条件的值的列表。 + """ + included_values = [] # 初始化结果列表 + + # 定义递归函数来处理嵌套字典 + def recursive_search(current_data): + if isinstance(current_data, dict): + for key, value in current_data.items(): + if isinstance(value, dict): + # 如果值是字典,递归搜索 + recursive_search(value) + elif isinstance(value, str): + # 如果值是字符串,检查是否包含任何 includes 中的关键词 + if any(include in value for include in includes): + included_values.append(value) + elif isinstance(current_data, list): + for item in current_data: + # 如果是列表,递归每个元素 + recursive_search(item) + + # 开始递归搜索 + recursive_search(data) + + return included_values + + +#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。 +#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 +#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", + +#TODO:truncate_json_path为空的时候,单独提取表格数据 +def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): + try: + excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] + follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] + extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 提取正文(除表格) + # print(extracted_contents) + all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 + all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) # 提取表格数据(json_data) + qianwen_txt = all_texts1 + all_tables1 + selected_contents = set() # 使用 set 去重 + + if qianwen_txt: + with open(output_file, 'w', encoding='utf-8') as file: + counter = 1 + for content in qianwen_txt: + file.write("..............." + '\n') + file.write(f"{counter}. 
{content}\n") + counter += 1 + + file_id = upload_file(output_file) + # qianwen_ans = qianwen_long(file_id, user_query) + qianwen_ans = qianwen_long(file_id, user_query) + num_list = process_string_list(qianwen_ans) + print(result_key + "选中的序号:" + str(num_list)) + + for index in num_list: + if 1 <= index <= len(qianwen_txt): + content = qianwen_txt[index - 1] + selected_contents.add(content) + + # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 + selected_contents.update(all_texts2) + selected_contents.update(all_tables2) + + # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 + if selected_contents: + res = {result_key: list(selected_contents)} + else: + res = {result_key: ""} + + return res + except Exception as e: + print(f"handle_query 在处理 {result_key} 时发生异常: {e}") + return {result_key: ""} + +def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification): + tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx + truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir) # 投标人须知前附表docx->json + queries = [ + ( + r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp1.txt"), + "否决和无效投标情形" + ), + ( + r'废\s*标', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp2.txt"), + "废标项" + ) + ] + results = [] + # 使用线程池来并行处理查询 + with ThreadPoolExecutor() as executor: + futures = [] + for keywords, user_query, output_file, result_key in queries: + future = executor.submit( + handle_query, + invalid_docpath, + user_query, + output_file, + result_key, + keywords, + truncate_jsonpath + ) + futures.append((future, result_key)) + time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 + 
# 按照提交的顺序收集结果 + for future, result_key in futures: + try: + result = future.result() + except Exception as e: + print(f"线程处理 {result_key} 时出错: {e}") + result = {result_key: ""} + results.append(result) + + # 禁止投标(find_forbidden)部分 + try: + # print("starting不得存在的情形...") + forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification) + except Exception as e: + print(f"find_forbidden 处理时出错: {e}") + forbidden_res = {'不得存在的其他情形': ""} + results.append(forbidden_res) + + combined_dict = {} + for d in results: + combined_dict.update(d) + # print("无效标与废标done...") + return {"无效标与废标项": combined_dict} + +if __name__ == '__main__': + start_time = time.time() + tobidders_notice_table="" + clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" + # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' + invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' + results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification) + end_time = time.time() + print("Elapsed time:", str(end_time - start_time)) + print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/old_version/解析old_old.py b/flask_app/old_version/解析old_old.py new file mode 100644 index 0000000..c977194 --- /dev/null +++ b/flask_app/old_version/解析old_old.py @@ -0,0 +1,221 @@ +# -*- encoding:utf-8 -*- +import json +import logging +import time +from concurrent.futures import ThreadPoolExecutor +from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.general.table_content_extraction import 
extract_tables_main +from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.general.json_utils import transform_json_values +from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid +from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +import concurrent.futures +from flask_app.main.基础信息整合快速版 import combine_basic_info +from flask_app.main.资格审查模块 import combine_review_standards +from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards +from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx +from flask_app.general.docx截取docx import copy_docx + +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + +logger=None + +# 创建全局线程池 +executor = ThreadPoolExecutor() +def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): + logger.info("starting 文件预处理...") + logger.info("output_folder..." + output_folder) + + # 根据文件类型处理文件路径 + if file_type == 1: # docx + docx_path = downloaded_file_path + pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + elif file_type == 2: # pdf + pdf_path = downloaded_file_path + docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 + elif file_type == 3: #doc + pdf_path=docx2pdf(downloaded_file_path) + docx_path=doc2docx(downloaded_file_path) + else: + logger.error("Unsupported file type provided. 
Preprocessing halted.") + return None + + # 调用截取PDF多次 + truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id) + + # 处理各个部分 + truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx + + invalid_path=truncate_files[5] + invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 + # invalid_docpath=pdf2docx(invalid_path) + truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json + truncate0 = truncate_files[0] + truncate1 = truncate_files[1] + truncate2=truncate_files[2] + truncate3 = truncate_files[3] + merged_baseinfo_path=truncate_files[-1] + clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json + + logger.info("文件预处理done") + + # 返回包含预处理后文件路径的字典 + return { + 'file_path': downloaded_file_path, + 'output_folder': output_folder, + 'invalid_path':invalid_path, + 'truncate0': truncate0, + 'truncate1': truncate1, + 'truncate2':truncate2, + 'truncate3': truncate3, + 'truncate0_jsonpath': truncate_jsonpath, + 'merged_baseinfo_path':merged_baseinfo_path, + 'clause_path': clause_path, + 'invalid_docpath': invalid_docpath + } + + +def post_processing(data,includes): + # 初始化结果字典,预设'其他'分类为空字典 + result = {"其他": {}} + + # 遍历原始字典的每一个键值对 + for key, value in data.items(): + if key in includes: + # 如果键在includes列表中,直接保留这个键值对 + result[key] = value + else: + # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 + result["其他"][key] = value + + # 如果'其他'分类没有任何内容,可以选择删除这个键 + if not result["其他"]: + del result["其他"] + + return result +# 基本信息 +def fetch_project_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path): # 投标人须知前附表 + logger.info("starting基础信息...") + basic_res = combine_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path) + logger.info("基础信息done") + return basic_res + + +# 形式、响应、资格评审 +def fetch_qualification_review(truncate1, truncate3,output_folder, truncate0_jsonpath, clause_path,invalid_path,merged_baseinfo_path): + logger.info("starting资格审查...") + review_standards_res = 
combine_review_standards(truncate1, truncate3,output_folder, truncate0_jsonpath, + clause_path,invalid_path,merged_baseinfo_path) + logger.info("资格审查done") + return review_standards_res + + +# 评分细则 流式 +def fetch_evaluation_standards(truncate1): # 评标办法前附表 + logger.info("starting 商务标和技术标...") + + # 获取评标办法前附表的字典结果 + evaluation_standards_res = combine_evaluation_standards(truncate1) + + # 获取技术标和商务标 + technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} + commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} + + logger.info("商务标和技术标 done") + + # 返回将 "技术标" 和 "商务标" 包含在新的键中 + return { + "technical_standards": technical_standards, + "commercial_standards": commercial_standards + } + + +# 无效、废标项解析 +def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): + # 废标项要求:千问 + logger.info("starting无效标与废标...") + find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) + logger.info("无效标与废标done...") + return find_invalid_res + + +# 投标文件要求 +def fetch_bidding_documents_requirements(clause_path): + logger.info("starting投标文件要求...") + fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + logger.info("投标文件要求done...") + return {"投标文件要求":fetch_bidding_documents_requirements_json} + +# 开评定标流程 +def fetch_bid_opening(clause_path): + logger.info("starting开评定标流程...") + fetch_bid_opening_json = extract_from_notice(clause_path, 2) + logger.info("开评定标流程done...") + return {"开评定标流程":fetch_bid_opening_json} + +#分段返回 +def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): + global logger + logger = get_global_logger(unique_id) + # 预处理文件,获取处理后的数据 + processed_data = preprocess_files(output_folder, downloaded_file_path, file_type) + if not processed_data: + yield json.dumps({}) # 如果处理数据失败,返回空的 JSON + + with concurrent.futures.ThreadPoolExecutor() as executor: + # 立即启动不依赖 knowledge_name 和 index 的任务 + 
futures = { + 'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'], + processed_data['truncate0'], + processed_data['truncate2'], processed_data['clause_path']), + 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], + processed_data['truncate3'], output_folder, + processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['invalid_path'], + processed_data['merged_baseinfo_path']), + 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), + 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], + output_folder, processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['truncate3']), + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) + } + + # 提前处理这些不依赖的任务,按完成顺序返回 + for future in concurrent.futures.as_completed(futures.values()): + key = next(k for k, v in futures.items() if v == future) + try: + result = future.result() + + # 如果是 evaluation_standards,拆分技术标和商务标 + if key == 'evaluation_standards': + technical_standards = result["technical_standards"] + commercial_standards = result["commercial_standards"] + + # 分别返回技术标和商务标 + yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False) + yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False) + + else: + # 处理其他任务的结果 + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + +if __name__ == "__main__": + start_time = time.time() + output_folder = 
"C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1" + file_type = 2 #1:docx 2:pdf 3:其他 + input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf" + print("yes") + for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"): + print(output) + end_time = time.time() + elapsed_time = end_time - start_time # 计算耗时 + print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/old_version/资格审查模块old_old.py b/flask_app/old_version/资格审查模块old_old.py new file mode 100644 index 0000000..6f93927 --- /dev/null +++ b/flask_app/old_version/资格审查模块old_old.py @@ -0,0 +1,42 @@ +import os + +from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.general.json_utils import extract_content_from_json +from flask_app.old_version.形式响应评审old import process_reviews +from flask_app.old_version.资格评审old_old import process_qualification +from flask_app.general.通义千问long import upload_file, qianwen_long +from concurrent.futures import ThreadPoolExecutor + + +def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表 + # 形式评审、响应评审:千问 + file_id=upload_file(truncate1) #评标办法前附表 + user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" + results = qianwen_long(file_id, user_query_1) + original_dict_data = extract_content_from_json(results) + qualification_review = original_dict_data.pop('资格评审标准', {}) # qianwen-long's qualification-review section; the fallback must be a dict because process_qualification iterates it with .items() + with ThreadPoolExecutor() as executor: + # 创建Future对象 + future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name) + future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath, + clause_path,input_file,output_folder) + + # 等待执行结果 + final_qualify_json = future_qualification.result() + form_response_dict =
future_form_response.result() + form_response_dict.update(final_qualify_json) + # return nest_json_under_key(form_response_dict,"资格审查") + return {"资格审查":form_response_dict} + +if __name__ == "__main__": + input_file="D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf" + output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21" + # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf") + truncate2=os.path.join(output_folder,"ztbfile_tobidders_notice.pdf") + knowledge_name="zbtest20" + truncate1=os.path.join(output_folder,"ztbfile_evaluation_method.pdf") + truncate3=os.path.join(output_folder,"ztbfile_qualification.pdf") + clause_path = convert_clause_to_json(truncate2, output_folder) + truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json") + res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder) + print(res) \ No newline at end of file diff --git a/flask_app/old_version/资格评审old_old.py b/flask_app/old_version/资格评审old_old.py new file mode 100644 index 0000000..58a3c84 --- /dev/null +++ b/flask_app/old_version/资格评审old_old.py @@ -0,0 +1,182 @@ +# -*- encoding:utf-8 -*- +# 资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典 +import json +import re +from flask_app.general.json_utils import clean_json_string, combine_json_results, add_keys_to_json +from flask_app.general.多线程提问 import multi_threading, read_questions_from_file +from flask_app.general.通义千问long import upload_file + + +def merge_dictionaries_under_common_key(dicts, common_key): + # 初始化一个空字典来保存合并的结果 + merged_dict = {common_key: {}} + + # 遍历列表中的每个字典 + for d in dicts: + if common_key in d: + # 使用字典解包来合并字典 + merged_dict[common_key].update(d[common_key]) + else: + print(f"资格评审: Warning: Dictionary does not contain the key {common_key}") + + return merged_dict + + +def generate_qual_question(matching_keys_list): # 
这里假设资质、信誉与人员要求 要不都有、要不都没 + if not matching_keys_list: + return [] + else: + questions = [] + # 将列表转换为单引号包裹的格式,并用逗号和空格分隔 + formatted_keys = ["'{}'".format(key) for key in matching_keys_list] + # 将格式化后的关键词列表连接成字符串 + keys_string = "、".join(formatted_keys) + # 构造完整的问题语句 + question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string}," + "请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。") + question2 = "该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则无需返回该键值对。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。" + questions.append(question1) + questions.append(question2) + return questions + + +def extract_matching_keys_qual(dict_data): + # 定义包含模式的列表 + include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"), + re.compile(r"符合")] + # 初始化列表,用于存储匹配的键 + matching_keys = [] + non_matching_keys = {} + # 定义排除项 + excludes = ['联合体', '禁止投标', '不存在', '不得存在', '资格', '管理机构', '负责人', '人员'] # 联合体、禁止投标的情况、人员需要额外问 + # 遍历字典中的每个键值对 + for key, value in dict_data.items(): + if "附件" in key and "资质" in key: + return [], {} # both markers in one key: return empty results, keeping the (list, dict) shape of the normal return + # 检查值是否符合任何一个包含模式 + if isinstance(value, str) and any(pattern.search(value) for pattern in include_patterns): # non-string values cannot be regex-searched; they fall through and are kept as-is below + # 如果值符合包含模式,再检查键是否包含任何排除项 + if not any(ex in key for ex in excludes): + # 如果键不包含排除项,则添加到匹配键列表中 + matching_keys.append(key) + else: + # value中有实质的内容,不需要额外问。 + non_matching_keys[key] = value + + return matching_keys, non_matching_keys # matching:['资质条件', '财务状况'] non_matching_keys:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} + + +# 获取联合体投标的要求,由于它不一定在资格审查表中,故调用rag +def get_consortium_dict(knowledge_name): + qualify_list = [] + consortium_questions = [ + "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)',嵌套键名为你对该要求的总结,而键值需要完全与原文保持一致,不要擅自总结、删减。"] + results1 = multi_threading(consortium_questions, knowledge_name) + for _, response in results1: # _占位,代表ques;response[0]也是ques;response[1]是ans + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素
+ qualify_list.append(response[1]) + else: + print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"资格评审: Error processing response for query index {_}: {e}") + consortium_dict = combine_json_results(qualify_list) + return consortium_dict + + +def get_all_dict(knowledge_name): + qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt' + questions = read_questions_from_file(qualification_review_file_path) + qualification_list = [] + res1 = multi_threading(questions, knowledge_name) + for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素 + qualification_list.append(response[1]) + else: + print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"资格评审: Error processing response for query index {_}: {e}") + qualification_combined_res = combine_json_results(qualification_list) + return {'资格评审': qualification_combined_res} + + +def process_qualification(qualification_review, truncate3, knowledge_name): + # 资格评审 + matching_keys_list, non_matching_dict = extract_matching_keys_qual( + qualification_review) # matching_keys_list:['资质条件', '财务状况'] non_matching_dict:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} + if not matching_keys_list: + if not non_matching_dict: # 古法提取 + if truncate3 != "": # 提取到资格审查附件的情况 + print("资格评审: type1") + matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"] + ques = generate_qual_question(matching_keys_list) + file_id2 = upload_file(truncate3) + results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long + res_list = [] + if not results2: + print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") + else: + # 打印结果 + for question, response in results2: + res_list.append(clean_json_string(response)) # 都是问资格评审表得出的 + if res_list: + merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') + consortium_dict = 
get_consortium_dict(knowledge_name) + updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) + return updated_qualify_json + else: + print("资格评审: 无法获取大模型结果,返回空值") + return {"资格评审": ""} + else: + print("资格评审: type2") + return get_all_dict(knowledge_name) or {"资格评审": ""} + + else: # 此时要求全部写在评分办法前附表中,不需要额外提取。 + print("资格评审: type3") + new_non_matching_json = {'资格评审': non_matching_dict} + substring = '联合体' + found_key = any(substring in key for key in non_matching_dict.keys()) # 没有联合体投标,则需生成,防止重复 + if not found_key: + consortium_dict = get_consortium_dict(knowledge_name) + final_qualify_json = add_keys_to_json(new_non_matching_json, consortium_dict) + return final_qualify_json + else: + return new_non_matching_json or {"资格评审": ""} + + elif matching_keys_list and truncate3 == "": # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' + print("资格评审: type4") + final_qualification = get_all_dict(knowledge_name) + final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict) + return final_qualify_json or {"资格评审": ""} + else: # 大多数情况 + print("资格评审: type5") + user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’ + file_id2 = upload_file(truncate3) + results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long + res_list = [] + if not results2: + print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") + return {"资格评审": ""} + else: + # 打印结果 + for question, response in results2: + cleaned_res = clean_json_string(response) + res_list.append(cleaned_res) # 都是问资格评审表得出的 + merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') + consortium_dict = get_consortium_dict(knowledge_name) + updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典 + final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict) + return final_qualify_json or {"资格评审": ""} + + +if __name__ == "__main__": + # qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': 
'投标人须提供近三年(2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年(2017 年至今)须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'} + qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证', + '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'} + truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" + knowledge_name = "招标解析word13" + res = process_qualification(qualification_review, truncate3, knowledge_name) + print(json.dumps(res, ensure_ascii=False, indent=4)) + +# 该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。 diff --git a/flask_app/old_version/资格评审前判断_old.py b/flask_app/old_version/资格评审前判断_old.py new file mode 100644 index 0000000..eb3dc87 --- /dev/null +++ b/flask_app/old_version/资格评审前判断_old.py @@ -0,0 +1,29 @@ +from flask_app.general.读取文件.按页读取pdf import extract_text_by_page + +def check_strings_in_pdf(file_path): + judge_list=['施工机械设备', '企业信息登记'] + # Read text from PDF + text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF + full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text + + # Initialize the questions list + ques_list = [] + + # Check for each string in the judge_list and construct questions accordingly + if judge_list[0] in full_text: + ques_list.append(f"该招标文件对于'{judge_list[0]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[0]}',若存在未知信息,在对应的键值中填'未知'。") + if len(judge_list) > 1 and judge_list[1] in full_text: + ques_list.append(f"该招标文件对于'{judge_list[1]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[1]}',若存在未知信息,在对应的键值中填'未知'。") + + if not ques_list: + return None + return ques_list + +# Test cases or example usage +if __name__ == '__main__': + file_path = 'C:/Users/Administrator/Desktop/zbtest18_39-45.pdf' # Replace with your actual PDF file path + 
judge_list = ['施工机械设备', '企业信息登记'] # List of strings to check in the PDF + + questions = check_strings_in_pdf(file_path) or [] # the function accepts only the path (its keyword list is internal) and returns None when nothing matches + for question in questions: + print(question) diff --git a/flask_app/old_version/通义千问_old.py b/flask_app/old_version/通义千问_old.py new file mode 100644 index 0000000..4a71a37 --- /dev/null +++ b/flask_app/old_version/通义千问_old.py @@ -0,0 +1,48 @@ +import json +import random +from http import HTTPStatus +from dashscope import Generation + +def call_with_messages(messages): + response = Generation.call(model="qwen-max", + messages=messages, + seed=random.randint(1, 10000), + temperature=0.5, + top_p=0.5, + top_k=50, + result_format='message') + if response.status_code == HTTPStatus.OK: + content = response.output['choices'][0]['message']['content'] + return content + else: + raise Exception(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}') + +def prepare_question_from_json(json_path, user_query): + with open(json_path, 'r', encoding='utf-8') as file: + json_data = json.load(file) + question = json.dumps(json_data, ensure_ascii=False) + user_query + return question + +#专用于判断是否 +def qianwen_ask(json_path, user_query): + messages = [] + question = prepare_question_from_json(json_path, user_query) + messages.append({'role': 'system', 'content': 'You are a helpful assistant.'}) + messages.append({'role': 'user', 'content': question}) + return call_with_messages(messages) + +#通用问题 +def qianwen_ask2(questions): + messages = [] + messages.append({'role': 'system', 'content': 'You are a helpful assistant.'}) + messages.append({'role': 'user', 'content': questions}) + return call_with_messages(messages) + +if __name__ == '__main__': + json_path = 'judge_exist.json' + prompt = "请你依据以上信息回答,是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包?
是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件','是否允许分包','是否需要递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在未知或者矛盾信息,请回答'未知'。" + try: + content = qianwen_ask(json_path, prompt) + print(content) + except Exception as e: + print(f"An error occurred: {e}")