From a256ceceb4e382d1f08bb4ded0c6a58e8270e330 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 11 Sep 2024 12:02:09 +0800 Subject: [PATCH] 9.11 Switch to a global (per-request) logger; fix bugs in invalid-bid extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/main/docx截取docx.py | 8 +- flask_app/main/download.py | 8 +- flask_app/main/format_change.py | 12 +- flask_app/main/json_utils.py | 16 +- flask_app/main/start_up.py | 125 ++++++++-------- flask_app/main/table_content_extraction.py | 6 +- flask_app/main/ttt.py | 37 ++--- flask_app/main/判断是否分包等.py | 4 - flask_app/main/商务标技术标整合.py | 2 - flask_app/main/基础信息整合.py | 19 ++- flask_app/main/多线程提问.py | 8 +- flask_app/main/形式响应评审.py | 6 +- flask_app/main/截取pdf.py | 15 +- .../main/投标人须知正文条款提取成json文件.py | 5 +- flask_app/main/招标文件解析.py | 58 ++++---- flask_app/main/提取打勾符号.py | 9 +- flask_app/main/无效标和废标和禁止投标整合.py | 127 +++++++++++----- flask_app/main/知识库操作.py | 3 + flask_app/main/禁止投标情形.py | 98 ++++++++---- flask_app/main/读取docx.py | 2 +- flask_app/main/资格审查模块.py | 3 - flask_app/main/资格评审.py | 26 ++-- flask_app/货物标/test.py | 140 ++++++++++++++---- 23 files changed, 467 insertions(+), 270 deletions(-) diff --git a/flask_app/main/docx截取docx.py b/flask_app/main/docx截取docx.py index ed06b20..bce6e9b 100644 --- a/flask_app/main/docx截取docx.py +++ b/flask_app/main/docx截取docx.py @@ -2,6 +2,8 @@ from docx import Document import re import os +from flask import g + def copy_docx(source_path): doc = Document(source_path) # 打开源文档 @@ -10,7 +12,7 @@ def copy_docx(source_path): # 获取原文件名并添加后缀 original_file_name = os.path.basename(source_path) file_name_without_ext, file_ext = os.path.splitext(original_file_name) modified_file_name = file_name_without_ext + "_invalid" + file_ext destination_path = os.path.join(output_folder, modified_file_name) new_doc = Document() # 创建新文档 @@ -22,6 +24,7 @@ # 寻找最后一个begin_pattern的位置 last_begin_index = -1 for i, paragraph in enumerate(doc.paragraphs): + if begin_pattern.search(paragraph.text): last_begin_index = i @@ -42,9 +45,10 @@ break new_doc.save(destination_path) # 保存新文档 + g.logger.info("docx截取docx成功!") # 调用函数 if __name__ == '__main__': - source_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest13.docx" + source_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx" copy_docx(source_path) diff --git a/flask_app/main/download.py b/flask_app/main/download.py index 5d5a22f..32bbe92 100644 --- a/flask_app/main/download.py +++ b/flask_app/main/download.py @@ -1,6 +1,8 @@ import requests import mimetypes +from flask import g + def download_file(url, local_filename): try: @@ -27,13 +29,13 @@ def download_file(url, local_filename): else: return full_filename,3 except requests.HTTPError as e: - print(f"HTTP Error: {e}") + g.logger.error(f"download: HTTP Error: {e}") return None except requests.RequestException as e: - print(f"Error downloading the file: {e}") + g.logger.error(f"download: Error downloading the file: {e}") return None except Exception as e: - print(f"An error occurred: {e}") + g.logger.error(f"download: An error occurred: {e}") return None if __name__ == '__main__': diff --git a/flask_app/main/format_change.py b/flask_app/main/format_change.py index 61bc650..38a3296 100644
--- a/flask_app/main/format_change.py +++ b/flask_app/main/format_change.py @@ -1,6 +1,8 @@ import json import os import requests +from flask import g + from flask_app.main.download import download_file @@ -19,14 +21,14 @@ def upload_file(file_path, url): # 检查响应状态码 if response.status_code == 200: - print("文件上传成功") + g.logger.info("format_change 文件上传成功") receive_file_response = response.content.decode('utf-8') receive_file_json = json.loads(receive_file_response) receive_file_url = receive_file_json["data"] else: - print(f"文件上传失败,状态码: {response.status_code}") - print(response.text) + g.logger.error(f"format_change 文件上传失败,状态码: {response.status_code}") + g.logger.error(f"format_change {response.text}") return receive_file_url @@ -44,7 +46,7 @@ def pdf2docx(local_path_in): filename, folder = get_filename_and_folder(local_path_in) #输入输出在同一个文件夹 local_path_out=os.path.join(folder,filename) #输出文件名 downloaded_filepath,file_type=download_file(receive_download_url, local_path_out) - print("have downloaded file to:",downloaded_filepath) + g.logger.info(f"format_change p2d: have downloaded file to: {downloaded_filepath}") return downloaded_filepath def docx2pdf(local_path_in): @@ -53,7 +55,7 @@ def docx2pdf(local_path_in): filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 local_path_out = os.path.join(folder, filename) # 输出文件名 downloaded_filepath,file_type = download_file(receive_download_url, local_path_out) - print("have downloaded file to:", downloaded_filepath) + g.logger.info(f"format_change d2p: have downloaded file to: {downloaded_filepath}") return downloaded_filepath if __name__ == '__main__': diff --git a/flask_app/main/json_utils.py b/flask_app/main/json_utils.py index 872940e..6076d51 100644 --- a/flask_app/main/json_utils.py +++ b/flask_app/main/json_utils.py @@ -1,6 +1,9 @@ import json import re +from flask import g + + def extract_content_from_json(json_data): """提取 { 和 } 之间的内容,并将其解析为字典""" if not json_data.strip(): @@ -11,10 +14,10 @@ json_data = match.group(0) return json.loads(json_data) #返回字典 except json.JSONDecodeError as e: - print(f"JSON decode error: {e}") + g.logger.error(f"json_utils: extract_content_from_json: JSON decode error: {e}") return {} else: - print("No valid JSON content found.") + g.logger.error("json_utils: extract_content_from_json: No valid JSON content found.") return {} def clean_json_string(json_string): @@ -63,18 +66,18 @@ dict: 更新后的字典。 """ if not target_dict: - print("Error: Target dictionary is empty.") + g.logger.error("json_utils: Error: Target dictionary is empty.") return {} if len(target_dict) != 1: - print("Error: Target dictionary must contain exactly one top-level key.") + g.logger.error("json_utils: Error: Target dictionary must contain exactly one top-level key.") return target_dict # 获取唯一的外层键 target_key, existing_dict = next(iter(target_dict.items())) if not isinstance(existing_dict, dict): - print(f"Error: The value under the key '{target_key}' is not a dictionary.") + g.logger.error(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.") return target_dict # 合并字典 @@ -92,7 +95,8 @@ def rename_outer_key(original_data,new_key): # 提取原始数据中的唯一外层值(假设只有一个外层键) if not original_data or not isinstance(original_data, dict): - return {} # 如果输入无效或不是字典,则返回空字典 + g.logger.error("json_utils: Error: Invalid input or input is not a dictionary.") # 如果输入无效或不是字典,则返回空字典 + return {} # 使用 next(iter(...)) 提取第一个键的值 original_value =
next(iter(original_data.values()), {}) diff --git a/flask_app/main/start_up.py index 0df0488..04fd137 100644 --- a/flask_app/main/start_up.py +++ b/flask_app/main/start_up.py @@ -5,7 +5,7 @@ import time import uuid from datetime import datetime, timedelta -from flask import Flask, request, jsonify, Response, stream_with_context +from flask import Flask, request, jsonify, Response, stream_with_context, g import json import os from flask_app.main.download import download_file @@ -31,30 +31,30 @@ class CSTFormatter(logging.Formatter): return s -def create_logger(unique_id): - """为每个请求创建一个新的日志器,日志器的日志文件存放在指定的输出文件夹中""" +@app.before_request +def before_request(): + # 每个请求开始前初始化 logger + create_logger() # 确保这个函数中设置了 g.logger + + +def create_logger(): + unique_id = str(uuid.uuid4()) + g.unique_id = unique_id output_folder = f"flask_app/static/output/{unique_id}" - # output_folder =f"C:/Users/Administrator/Desktop/招标文件/test/{unique_id}" - if not os.path.exists(output_folder): - os.makedirs(output_folder, exist_ok=True) + os.makedirs(output_folder, exist_ok=True) log_filename = "log.txt" log_path = os.path.join(output_folder, log_filename) - logger = logging.getLogger(unique_id) # 使用 unique_id 作为日志器名字 - if not logger.handlers: # 避免重复添加处理器 - # 文件处理器 + logger = logging.getLogger(unique_id) + if not logger.handlers: file_handler = logging.FileHandler(log_path) file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) - - # 流处理器(控制台输出) - stream_handler = logging.StreamHandler(sys.stdout) - stream_formatter = logging.Formatter('%(message)s') # 简化的格式,只输出消息 - stream_handler.setFormatter(stream_formatter) + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(logging.Formatter('%(message)s')) logger.addHandler(stream_handler) logger.setLevel(logging.INFO) - return logger, output_folder + g.logger = logger @@ -64,10 +64,10 @@ def zbparse(): return file_url try: app.logger.info("starting parsing url:" + file_url) - final_json_path, output_folder, logger = download_and_process_file(file_url) + final_json_path, output_folder = download_and_process_file(file_url) if not final_json_path: return jsonify({'error': 'File processing failed'}), 500 - response = generate_response(final_json_path, logger) # 先获取响应内容 + response = generate_response(final_json_path) # 先获取响应内容 # remove_directory(output_folder) # 然后删除文件夹 return response # 最后返回获取的响应 except Exception as e: @@ -75,6 +75,7 @@ return jsonify({'error': str(e)}), 500 +# 流式 # def zbparse(): # file_url = validate_request() # if isinstance(file_url, tuple): # Check if the returned value is an error response # try: # return Response(process_and_stream(file_url), content_type='text/event-stream') # except Exception as e: # return jsonify({'error': str(e)}), 500 @@ -87,40 +88,42 @@ -def process_and_stream(file_url): - unique_id = str(uuid.uuid4()) - logger, output_folder = create_logger(unique_id) - filename = "ztbfile" - downloaded_filename = os.path.join(output_folder, filename) - - downloaded_filepath, file_type = download_file(file_url, downloaded_filename) - - if downloaded_filepath is None or file_type == 3: - logger.error("Unsupported file type or failed to download file") - error_response = { - 'message': 'File processing failed', - 'filename': None, - 'data': json.dumps({'error': 'File processing failed'}) - } - yield f"data: {json.dumps(error_response)}\n\n" - return - - logger.info("Local file path: " + downloaded_filepath) - - for data in
main_processing(output_folder, downloaded_filepath, file_type, unique_id): - response = { - 'message': 'Processing', - 'filename': os.path.basename(downloaded_filepath), - 'data': data - } - yield f"data: {json.dumps(response)}\n\n" - - final_response = { - 'message': 'File uploaded and processed successfully', - 'filename': os.path.basename(downloaded_filepath), - 'data': 'END' - } - yield f"data: {json.dumps(final_response)}\n\n" +# 分段返回 +# def process_and_stream(file_url): +# logger = g.logger +# unique_id = g.unique_id +# output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径 +# filename = "ztbfile" +# downloaded_filename = os.path.join(output_folder, filename) +# +# downloaded_filepath, file_type = download_file(file_url, downloaded_filename) +# +# if downloaded_filepath is None or file_type == 3: +# logger.error("Unsupported file type or failed to download file") +# error_response = { +# 'message': 'File processing failed', +# 'filename': None, +# 'data': json.dumps({'error': 'File processing failed'}) +# } +# yield f"data: {json.dumps(error_response)}\n\n" +# return +# +# logger.info("Local file path: " + downloaded_filepath) +# +# for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id): +# response = { +# 'message': 'Processing', +# 'filename': os.path.basename(downloaded_filepath), +# 'data': data +# } +# yield f"data: {json.dumps(response)}\n\n" +# +# final_response = { +# 'message': 'File uploaded and processed successfully', +# 'filename': os.path.basename(downloaded_filepath), +# 'data': 'END' +# } +# yield f"data: {json.dumps(final_response)}\n\n" def validate_request(): @@ -133,8 +136,9 @@ def validate_request(): def download_and_process_file(file_url): - unique_id = str(uuid.uuid4()) # 生成一个唯一的 UUID - logger, output_folder = create_logger(unique_id) + logger = g.logger + unique_id = g.unique_id + output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径 filename = "ztbfile" downloaded_filename = os.path.join(output_folder, filename) @@ -147,7 +151,7 @@ def download_and_process_file(file_url): logger.info("Local file path: " + downloaded_filepath) processed_file_path = main_processing(output_folder, downloaded_filepath, file_type, unique_id) - return processed_file_path, output_folder, logger + return processed_file_path, output_folder @app.route('/api/test_zbparse', methods=['POST']) @@ -189,10 +193,15 @@ def test_process_and_stream(): yield f"data: {json.dumps(final_response)}\n\n" -def generate_response(final_json_path, logger): +def generate_response(final_json_path): + logger = g.logger + # 检查final_json_path是否为空或None + if not final_json_path: + logger.error('Empty or None path provided for final_json.') + return jsonify({'error': 'No path provided for final_json.'}), 400 if not os.path.exists(final_json_path): - logger.error('JSON file not found at path: ' + final_json_path) - return jsonify({'error': 'JSON file not found'}), 404 + logger.error('final_json not found at path: ' + final_json_path) + return jsonify({'error': 'final_json not found'}), 404 with open(final_json_path, 'r', encoding='utf-8') as f: logger.info('final_json_path:' + final_json_path) zbparse_data = json.load(f) diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py index 6dcfb6b..81dd69e 100644 --- a/flask_app/main/table_content_extraction.py +++ b/flask_app/main/table_content_extraction.py @@ -3,6 +3,8 @@ import os from docx import Document import json +from flask import g + def 
read_tables_from_docx(file_path): """读取DOCX文件中的表格数据,并以嵌套字典的形式返回.""" @@ -87,13 +89,13 @@ def save_data_to_json(data, output_folder): """将数据保存到JSON文件中.""" with open(output_filepath, 'w', encoding='utf-8') as file: json.dump(data, file, ensure_ascii=False, indent=4) - print(f"The data has been processed and saved to '{output_filepath}'.") + g.logger.info(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.") return output_filepath def extract_tables_main(path, output_folder): if not os.path.exists(path): - print(f"The specified file does not exist: {path}") + g.logger.error(f"table_content_extraction: The specified file does not exist: {path}") return "" # 读取文档表格数据 table_data = read_tables_from_docx(path) diff --git a/flask_app/main/ttt.py b/flask_app/main/ttt.py index d75a150..7eb4d31 100644 --- a/flask_app/main/ttt.py +++ b/flask_app/main/ttt.py @@ -1,28 +1,19 @@ import re -from PyPDF2 import PdfReader +pattern = re.compile(r'(\b\d+\s*\.\s*\d+\s*\.\s*\d+\b)|(\b3\s*\.\s*2\b)') +text = '3.1.3已标价工程量清单中漏报了某个工程子目的单价、合价或总额价则漏报的工程 子目单价、合价和总额价视为已含入其他工程子目的单价、合价和总额价之中。' +match = pattern.search(text) +if match: + print("匹配成功:", match.group()) +else: + print("未找到匹配") -def extract_common_header(pdf_path): - pdf_document = PdfReader(pdf_path) - headers = [] - num_pages_to_read = 3 # 预读页数 +# 使用 findall 查看所有匹配 +all_matches = pattern.findall(text) +print("所有匹配:", all_matches) - for i in range(min(num_pages_to_read, len(pdf_document.pages))): - page = pdf_document.pages[i] - text = page.extract_text() - if text: # 确保页面有文本内容 - first_line = text.strip().split('\n')[0] - headers.append(first_line) - - if len(headers) < 2: - return "" # 如果没有足够的页来比较,返回空字符串 - - # 使用set交集来找出公共部分 - common_header = set(headers[0].split()).intersection(*[set(header.split()) for header in headers[1:]]) - common_header = ' '.join(common_header) - return common_header - -input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf" -res=extract_common_header(input_path) -print(res) \ No newline at end of file +# 打印文本的前10个字符的ASCII值,检查是否有不可见字符 +print("文本前10个字符的ASCII值:") +for char in text[:10]: + print(f"{char}: {ord(char)}") \ No newline at end of file diff --git a/flask_app/main/判断是否分包等.py b/flask_app/main/判断是否分包等.py index 7863ebc..6677f0c 100644 --- a/flask_app/main/判断是否分包等.py +++ b/flask_app/main/判断是否分包等.py @@ -122,14 +122,11 @@ def judge_whether_main(file_path,output_folder): #传入招标文件中‘投 output_json_path = os.path.join(output_folder,'judge_exist.json') read_pdf_and_judge_main(file_path, output_json_path) #提取打勾符号 qianwen_answer = qianwen_ask(output_json_path, user_query1) # 调用普通千问判断是、否、未知 - print("qianwen_answer:" + qianwen_answer) user_query2 = construct_judge_questions(qianwen_answer) # 提取回答为”未知“的键 # 判断user_query是否为空 if user_query2: - print("user_query:" + user_query2) file_id = upload_file(file_path) res = qianwen_long(file_id, user_query2) #整个前附表一起传问千问long - print(res) return process_judge_content(qianwen_answer, res) else: @@ -143,7 +140,6 @@ def process_judge_content(original_json, update_json): #用新的数据合并 original = extract_content_from_json(original_json) updates = extract_content_from_json(update_json) original.update(updates) - print(original) return merge_json_to_list(original) diff --git a/flask_app/main/商务标技术标整合.py b/flask_app/main/商务标技术标整合.py index 10961da..300f477 100644 --- a/flask_app/main/商务标技术标整合.py +++ b/flask_app/main/商务标技术标整合.py @@ -55,7 +55,6 @@ def combine_technical_and_business(data, target_values1, target_values2): def combine_evaluation_standards(truncate2): # 
商务标、技术标评分项:千问 - print("starting商务标技术标...") file_id = upload_file(truncate2) user_query_2 = ( "根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容") @@ -64,7 +63,6 @@ def combine_evaluation_standards(truncate2): target_values2=['投标报价','商务标','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉'] update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2) evaluation_combined_res = json.dumps(update_json,ensure_ascii=False,indent=4) - print("商务标技术标done") return evaluation_combined_res if __name__ == "__main__": diff --git a/flask_app/main/基础信息整合.py b/flask_app/main/基础信息整合.py index 48ae62b..1f82ada 100644 --- a/flask_app/main/基础信息整合.py +++ b/flask_app/main/基础信息整合.py @@ -1,5 +1,7 @@ import json +from flask import g + from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge @@ -31,8 +33,8 @@ def combine_basic_info(baseinfo_list): dynamic_key_handling(key_groups, relevant_keys_detected) # 打印 key_groups 的内容检查它们是否被正确更新 - print("Updated key_groups after dynamic handling:") - print(key_groups) + # g.logger.info("Updated key_groups after dynamic handling:") + # 使用合并后的字典创建最终输出 for group_name, keys in key_groups.items(): @@ -79,7 +81,6 @@ def judge_consortium_bidding(baseinfo_list): return accept_bidding def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表 # 调用大模型回答项目基础信息 - print("starting基础信息...") baseinfo_list = [] # baseinfo_file_path='../static/提示词/前两章提问总结.txt' baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径 @@ -90,10 +91,9 @@ if response and len(response) > 1: # 检查response存在且有至少两个元素 baseinfo_list.append(response[1]) else: - print(f"Warning: Missing or incomplete response data for query index {_}.") + g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - print(f"Error processing response for query index {_}: {e}") - print("basic信息done...") + g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}") # 判断是否分包、是否需要递交投标保证金等 chosen_numbers, merged = judge_whether_main(truncate0,output_folder) baseinfo_list.append(merged) @@ -109,7 +109,7 @@ file_id=upload_file(truncate0) res2 = multi_threading(judge_questions, "",file_id,2) #调用千问-long if not res2: - print("errror!") + g.logger.error("基础信息整合: multi_threading error!") else: # 打印结果 for question, response in res2: @@ -119,9 +119,9 @@ # if response and len(response) > 1: # 检查response存在且有至少两个元素 # baseinfo_list.append(response[1]) # else: - # print(f"Warning: Missing or incomplete response data for query index {_}.") + # g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.") # except Exception as e: - # print(f"Error processing response for query index {_}: {e}") + # g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}") rebidding_situation = extract_from_notice(clause_path, 3) #"重新招标, 不再招标和终止招标"需从投标人须知正文提取 @@ -131,7 +131,6 @@ def
project_basic_info(knowledge_name,truncate0,output_folder,clause_path): # update_baseinfo_list=combine_basic_info(baseinfo_list) #整合基础信息核心代码 baseinfo_combined_res = combine_json_results(update_baseinfo_list) # 返回值是字典 - print("基础信息done") return nest_json_under_key(baseinfo_combined_res, "基础信息") #返回值是json字符串 diff --git a/flask_app/main/多线程提问.py b/flask_app/main/多线程提问.py index b86cd57..1b931e8 100644 --- a/flask_app/main/多线程提问.py +++ b/flask_app/main/多线程提问.py @@ -6,9 +6,9 @@ import concurrent.futures import time from dashscope import Assistants, Messages, Runs, Threads +from flask import g from llama_index.indices.managed.dashscope import DashScopeCloudRetriever from flask_app.main.通义千问long import qianwen_long, upload_file - prompt = """ # 角色 你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 @@ -31,7 +31,6 @@ prompt = """ """ prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}' - def read_questions_from_file(file_path): questions = [] with open(file_path, 'r', encoding='utf-8') as file: @@ -119,8 +118,10 @@ def pure_assistant(): def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type): if llm_type==1: + g.logger.info(f"rag_assistant! question:{question}") assistant = rag_assistant(knowledge_name) elif llm_type==2: + g.logger.info(f"qianwen_long! question:{question}") qianwen_res = qianwen_long(file_id,question) result_queue.put((ans_index,(question,qianwen_res))) return @@ -130,6 +131,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) def multi_threading(queries, knowledge_name="", file_id="",llm_type=1): + g.logger.info("多线程提问:starting multi_threading...") result_queue = queue.Queue() # 使用 ThreadPoolExecutor 管理线程 @@ -148,7 +150,7 @@ def multi_threading(queries, knowledge_name="", file_id="",llm_type=1): try: future.result() # 可以用来捕获异常或确认任务完成 except Exception as exc: - print(f"Query {index} generated an exception: {exc}") + g.logger.error(f"Query {index} generated an exception: {exc}") # 从队列中获取所有结果并按索引排序 results = [None] * len(queries) diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py index 27911b2..bc7880a 100644 --- a/flask_app/main/形式响应评审.py +++ b/flask_app/main/形式响应评审.py @@ -3,6 +3,8 @@ import re import json import time +from flask import g + from flask_app.main.多线程提问 import multi_threading from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.main.json_utils import extract_content_from_json @@ -187,9 +189,9 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause temp = extract_content_from_json(response[1]) first_response_list.append(temp) else: - print(f"Warning: Missing or incomplete response data for query index {_}.") + g.logger.error(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - print(f"Error processing response for query index {_}: {e}") + g.logger.error(f"形式响应评审:Error processing response for query index {_}: {e}") # Assume JSON file paths are defined or configured correctly # print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}] diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index f3d1e70..083a9ab 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -2,6 +2,9 @@ from PyPDF2 import PdfReader, PdfWriter import re # 导入正则表达式库 import os # 用于文件和文件夹操作 +from flask import g + + def extract_common_header(pdf_path): pdf_document = PdfReader(pdf_path) headers = 
[] @@ -59,9 +62,9 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en with open(output_pdf_path, 'wb') as f: output_doc.write(f) - print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}") + g.logger.info(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}") else: - print("提供的页码范围无效。") + g.logger.error("提供的页码范围无效。") return output_pdf_path @@ -105,7 +108,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix): break # 找到结束页后退出循环 if start_page is None or end_page is None: - print(f"未找到起始或结束页在文件 {pdf_path} 中!") + g.logger.error(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page) @@ -154,7 +157,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter if output_suffix == "qualification" or output_suffix =="invalid": return extract_pages_twice(pdf_path, output_folder, output_suffix) else: - print(f"未找到起始或结束页在文件 {pdf_path} 中!") + g.logger.error(f"first: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page) @@ -182,7 +185,7 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt if output_pdf_path and os.path.isfile(output_pdf_path): return [output_pdf_path] # 以列表形式返回,以保持一致性 else: - print("提供的路径既不是文件夹也不是PDF文件。") + g.logger.error("提供的路径既不是文件夹也不是PDF文件。") return [] @@ -232,7 +235,7 @@ def truncate_pdf_main(input_path, output_folder, selection): end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE) output_suffix = "invalid" else: - print("无效的选择") + g.logger.error("无效的选择:请选择1-6") return None # Process the selected input diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py index 917bd55..9c27e72 100644 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -4,6 +4,9 @@ import fitz import re import os +from flask import g + + def extract_text_from_docx(file_path): doc = docx.Document(file_path) return '\n'.join([para.text for para in doc.paragraphs]) @@ -126,7 +129,7 @@ def convert_to_json(file_path, start_word, end_phrases): def convert_clause_to_json(input_path,output_folder,type=1): if not os.path.exists(input_path): - print(f"The specified file does not exist: {input_path}") + g.logger.error(f"The specified file does not exist: {input_path}") return "" if type==1: start_word = "投标人须知正文" diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py index 2690a81..a427932 100644 --- a/flask_app/main/招标文件解析.py +++ b/flask_app/main/招标文件解析.py @@ -3,6 +3,9 @@ import json import logging import os import time + +from flask import g + from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge @@ -17,26 +20,27 @@ from flask_app.main.商务标技术标整合 import combine_evaluation_standards from flask_app.main.format_change import pdf2docx, docx2pdf from flask_app.main.docx截取docx import copy_docx -global_logger = None - - -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger +# def get_global_logger(unique_id): +# if unique_id is None: +# return logging.getLogger() # 获取默认的日志器 +# logger = logging.getLogger(unique_id) +# return logger # 可能有问题:pdf转docx导致打勾符号消失 def preprocess_files(output_folder, 
downloaded_file_path, file_type, unique_id): + g.logger.info("starting 文件预处理...") # 根据文件类型处理文件路径 - global docx_path, pdf_path if file_type == 1: # docx docx_path = downloaded_file_path pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 elif file_type == 2: # pdf pdf_path = downloaded_file_path docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 + else: + # 如果文件类型不是预期中的1或2,记录错误并返回None + g.logger.error("Unsupported file type provided. Preprocessing halted.") + return None # 上传知识库 knowledge_name = "招标解析" + unique_id @@ -44,7 +48,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): # 调用截取PDF多次 truncate_files = truncate_pdf_multiple(pdf_path, output_folder) # [前附表, 评标办法, 须知正文, 资格审查条件] - print(truncate_files) # 处理各个部分 truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx @@ -56,6 +59,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): truncate1 = truncate_files[1] #评标办法前附表 truncate3 = truncate_files[3] #资格审查表 clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json + g.logger.info("文件预处理done") return { 'input_file_path':downloaded_file_path, @@ -90,61 +94,63 @@ def post_processing(data,includes): return result # 基本信息 def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表 - global_logger.info("starting基础信息...") + g.logger.info("starting基础信息...") basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path) - global_logger.info("基础信息done") + g.logger.info("基础信息done") return basic_res # 形式、响应、资格评审 def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder): - global_logger.info("starting资格审查...") + g.logger.info("starting资格审查...") review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder) - global_logger.info("资格审查done") + g.logger.info("资格审查done") return review_standards_res # 评分细则 def fetch_evaluation_standards(truncate1): # 评标办法前附表 - global_logger.info("starting商务标技术标...") + g.logger.info("starting商务标技术标...") evaluation_standards_res = combine_evaluation_standards(truncate1) - global_logger.info("商务标技术标done") + g.logger.info("商务标技术标done") return evaluation_standards_res # 无效、废标项解析 def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): # 废标项要求:千问 - global_logger.info("starting无效标与废标...") + g.logger.info("starting无效标与废标...") find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) - global_logger.info("无效标与废标done...") + g.logger.info("无效标与废标done...") return find_invalid_res # 投标文件要求 def fetch_bidding_documents_requirements(clause_path): - global_logger.info("starting投标文件要求...") + g.logger.info("starting投标文件要求...") fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求") - global_logger.info("投标文件要求done...") + g.logger.info("投标文件要求done...") return qualify_nested_res # 开评定标流程 def fetch_bid_opening(clause_path): - global_logger.info("starting开评定标流程...") + g.logger.info("starting开评定标流程...") fetch_bid_opening_json = extract_from_notice(clause_path, 2) qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程") - global_logger.info("开评定标流程done...") + g.logger.info("开评定标流程done...") return qualify_nested_res def main_processing(output_folder, 
downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf - global global_logger - global_logger = get_global_logger(unique_id) + # global global_logger + # global_logger = get_global_logger(unique_id) # Preprocess files and get necessary data paths and knowledge index processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) + if not processed_data: + return "" with concurrent.futures.ThreadPoolExecutor() as executor: # Submit all tasks to the executor @@ -174,7 +180,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id): result = futures[key].result() comprehensive_responses.append(result) except Exception as exc: - global_logger.info(f"Error processing {key}: {exc}") + g.logger.error(f"Error processing {key}: {exc}") # 合并 JSON 结果 combined_final_result = combine_json_results(comprehensive_responses) includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"] @@ -184,7 +190,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id): final_result_path = os.path.join(output_folder, "final_result.json") with open(final_result_path, 'w', encoding='utf-8') as file: json.dump(modified_json, file, ensure_ascii=False, indent=2) - global_logger.info("final_result.json has been saved") + g.logger.info("final_result.json has been saved") deleteKnowledge(processed_data['knowledge_index']) return final_result_path diff --git a/flask_app/main/提取打勾符号.py b/flask_app/main/提取打勾符号.py index 3d96d32..0ae12fa 100644 --- a/flask_app/main/提取打勾符号.py +++ b/flask_app/main/提取打勾符号.py @@ -2,6 +2,9 @@ import re import PyPDF2 import json +from flask import g + + def extract_key_value_pairs(text): # 更新正则表达式来包括对"团"的处理和行尾斜线 pattern = r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)' @@ -57,15 +60,13 @@ def read_pdf_and_judge_main(file_path, output_json_path): with open(file_path, 'rb') as file: reader = PyPDF2.PdfReader(file) num_pages = len(reader.pages) - print(f"Total pages: {num_pages}") - all_data = {} for page_num in range(num_pages): page = reader.pages[page_num] text = page.extract_text() if page.extract_text() else "" # 使用正则表达式删除每页开头的页码,格式通常为“数字+空格” cleaned_text = re.sub(r'^\d+\s+', '', text) - print(cleaned_text) + # print(cleaned_text) key_value_pairs = extract_key_value_pairs(cleaned_text) all_data.update(key_value_pairs) @@ -74,7 +75,7 @@ def read_pdf_and_judge_main(file_path, output_json_path): with open(output_json_path, "w", encoding="utf-8") as json_file: json.dump(all_data, json_file, ensure_ascii=False, indent=4) - print(f"Data extraction complete and saved to '{output_json_path}'.") + g.logger.info(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.") if __name__ == "__main__": diff --git a/flask_app/main/无效标和废标和禁止投标整合.py b/flask_app/main/无效标和废标和禁止投标整合.py index c08772c..476e86a 100644 --- a/flask_app/main/无效标和废标和禁止投标整合.py +++ b/flask_app/main/无效标和废标和禁止投标整合.py @@ -3,75 +3,125 @@ import json import os.path import time import re + +from flask import g + from flask_app.main.json_utils import combine_json_results, nest_json_under_key from flask_app.main.通义千问long import upload_file, qianwen_long from concurrent.futures import ThreadPoolExecutor -from flask_app.main.禁止投标情形 import find_forbidden -from 禁止投标情形 import process_string_list +from flask_app.main.禁止投标情形 import find_forbidden, process_string_list + #如果当前段落有序号,则向下匹配直接遇到相同的序号样式 #如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 + def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): 
from collections import OrderedDict from docx import Document import re + if isinstance(keywords, str): + keywords = [keywords] + doc = Document(doc_path) - extracted_paragraphs = OrderedDict() # 使用OrderedDict保持顺序 + extracted_paragraphs = OrderedDict() continue_collecting = False current_section_pattern = None - active_key = None # 用来标记当前正在收集的文本块的键 + active_key = None def match_keywords(text, patterns): - """使用正则表达式匹配关键词。""" return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) - def extract_from_text(text, index): + def extract_from_text(text, current_index): nonlocal continue_collecting, current_section_pattern, active_key - if text == "": # Skip empty lines - return + if text == "": + return current_index + if continue_collecting: if current_section_pattern and re.match(current_section_pattern, text): continue_collecting = False - active_key = None # 结束当前的收集 + active_key = None else: if active_key is not None: extracted_paragraphs[active_key].append(text) + return current_index if match_keywords(text, keywords): - active_key = text # 设置当前的关键词块 - extracted_paragraphs[active_key] = [text] # 初始化列表以收集文本,包括当前匹配的文本 - # 检查是否也匹配后续关键词 + active_key = text + extracted_paragraphs[active_key] = [text] if match_keywords(text, follow_up_keywords): continue_collecting = True - # 设置跟踪模式 section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text) if section_number: current_section_number = section_number.group(1) level_count = current_section_number.count('.') - pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b' - current_section_pattern = re.compile(pattern) + + # Pattern to match current level, e.g., 3.4.5 + pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+' + + # Generate patterns for next section at same level and parent level + parts = current_section_number.split('.') + matched_patterns = [pattern] # start with the full pattern + + # Next section at same level + parts[-1] = str(int(parts[-1]) + 1) + next_pattern = r'^' + r'\s*\.\s*'.join(parts) + matched_patterns.append(next_pattern) + + # Parent section (if applicable) + if len(parts) > 1: + parent_section_parts = parts[:-1] + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) + parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) + matched_patterns.append(parent_pattern) + + # Combine the patterns + combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' + current_section_pattern = re.compile(combined_pattern) + else: found_next_number = False current_section_pattern = None - for next_index in range(index + 1, len(doc.paragraphs)): - next_text = doc.paragraphs[next_index].text.strip() + while current_index < len(doc.paragraphs) - 1: + current_index += 1 + next_text = doc.paragraphs[current_index].text.strip() if not found_next_number: - next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text) + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) if next_section_number: found_next_number = True - section_parts = next_section_number.group(1).split('.') - dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + if next_section_number.group(1): + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + elif next_section_number.group(2): + dynamic_pattern = r'^[\(\(]\d+[\)\)]' current_section_pattern = re.compile(dynamic_pattern) - if current_section_pattern and re.match(current_section_pattern, next_text): 
- extracted_paragraphs[active_key].append(next_text) # 持续收集 + extracted_paragraphs[active_key].append(next_text) + else: + continue_collecting = False + active_key = None + break - for index, para in enumerate(doc.paragraphs): - extract_from_text(para.text.strip(), index) + return current_index - return extracted_paragraphs # 返回字典,键是关键词,值是相关文本列表 + index = 0 + while index < len(doc.paragraphs): + index = extract_from_text(doc.paragraphs[index].text.strip(), index) + index += 1 + + return extracted_paragraphs + +def preprocess_text_list(text_list): + new_text_list = [] + # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 + split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') + for text in text_list: + # 使用正则表达式检查并拆分元素 + parts = split_pattern.split(text) + new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 + + return new_text_list def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 all_texts1 = [] @@ -107,11 +157,14 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达 all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表 else: + # print(text_list) + new_text_list=preprocess_text_list(text_list) + # print(new_text_list) pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)' - data = re.sub(pattern, '', text_list[0]).strip() + data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 # 将修改后的第一个元素和剩余的元素连接起来 - text_list[0] = data # 更新列表中的第一个元素 - joined_text = "\n".join(text_list) # 如果列表中有多个元素,则连接它们 + new_text_list[0] = data # 更新列表中的第一个元素 + joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中 return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 @@ -219,6 +272,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果 + # print(extracted_contents) all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) qianwen_txt = all_texts1 + all_tables1 @@ -234,7 +288,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc # 更新计数器,每次循环递增 counter += 1 file_id = upload_file(output_file) - print("starting qianwen-long...") qianwen_ans = qianwen_long(file_id, user_query) selected_contents = [] num_list = process_string_list(qianwen_ans) @@ -256,7 +309,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc return res def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3): - print("starting无效标与废标...") queries = [ (r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", @@ -280,7 +332,7 @@ results.append(future.result()) #禁止投标 - print("starting不得存在的情形...") + # g.logger.info("starting不得存在的情形...") forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3) results.append(forbidden_res) @@ -288,17 +340,18 @@ def combine_find_invalid(file_path, output_dir,
truncate_json_path,clause_path,t for d in results: combined_dict.update(d) - print("无效标与废标done...") + # g.logger.info("无效标与废标done...") return nest_json_under_key(combined_dict, "无效标与废标项") if __name__ == '__main__': start_time = time.time() - truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json" - clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json" - truncate3="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf" - output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output" - doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest11_invalid.docx' + truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json" + clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json" + truncate3="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e" + # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' + doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_invalid.docx' results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate3) end_time = time.time() print("Elapsed time:", str(end_time - start_time)) diff --git a/flask_app/main/知识库操作.py b/flask_app/main/知识库操作.py index 78d674a..1606a7c 100644 --- a/flask_app/main/知识库操作.py +++ b/flask_app/main/知识库操作.py @@ -1,6 +1,7 @@ import os import uuid +from flask import g from llama_index.readers.dashscope.base import DashScopeParse from llama_index.readers.dashscope.utils import ResultType from llama_index.indices.managed.dashscope import DashScopeCloudIndex @@ -15,6 +16,7 @@ def addfileToKnowledge(filepath,knowledge_name): knowledge_name, verbose=True, ) + g.logger.info("knowledge created successfully!!!") # index = DashScopeCloudIndex(knowledge_name) # index._insert(documents) # return index, documents @@ -26,6 +28,7 @@ def deleteKnowledge(index): workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID') client = create_client() delete_index(client,workspace_id,index_id) + g.logger.info("knowledge deleted successfully!!!") diff --git a/flask_app/main/禁止投标情形.py b/flask_app/main/禁止投标情形.py index cc96540..f16a552 100644 --- a/flask_app/main/禁止投标情形.py +++ b/flask_app/main/禁止投标情形.py @@ -4,16 +4,18 @@ import os import re from PyPDF2 import PdfWriter, PdfReader +from flask import g from flask_app.main.通义千问long import upload_file, qianwen_long -def extract_and_format_from_paths(json_paths, includes): +def extract_and_format_from_paths(json_paths, includes, excludes): """ 从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。 参数: json_paths (list): 包含多个 JSON 文件路径的列表。 includes (list): 包含要检查的关键词的列表。 + excludes (list): 包含要排除的关键词的列表。 返回: list: 包含所有文件中满足条件的格式化字符串列表。 @@ -33,43 +35,74 @@ if isinstance(value, dict): # 如果值是字典,检查嵌套字典的每个键值对 for sub_key, sub_value in value.items(): - if any(include in sub_value for include in includes): + if any(include in sub_key for include in includes): # 如果子键包含关键词,将对应的子值添加到结果列表 - formatted_results.append(f"{sub_key}: {sub_value}") - elif isinstance(value, str): - # 如果值是字符串,直接检查是否包含关键词 - if any(include in value for include in includes): - # 如果值包含关键词,添加到结果列表 - formatted_results.append(value) +
formatted_results.append(f"{sub_value}") + elif isinstance(value, str): # clause + # 检查是否包含任何 include 关键词 + for include in includes: + if include in value: + # 找到 include 之前的内容 + prefix = value.split(include)[0] + # 检查 prefix 是否不包含任何 exclude 关键词 + if not any(exclude in prefix for exclude in excludes): + # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表 + if '\n' in value: + value = value.split('\n', 1)[-1] + formatted_results.append(value) + break # 找到一个符合条件的就跳出循环 # 将当前文件的结果添加到总结果列表 all_formatted_results.extend(formatted_results) except FileNotFoundError: - print(f"Error: The file '{path}' does not exist.") + g.logger.error(f"禁止投标情形: Error: The file '{path}' does not exist.") except json.JSONDecodeError: - print(f"Error: The file '{path}' contains invalid JSON.") + g.logger.error(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.") return all_formatted_results def extract_unique_items_from_texts(texts): - pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\))\s*') - intro_pattern = re.compile(r'^.*[::]') + # 更新正则表达式以包括更广泛的序号类型,包括中文序号 + pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*') + intro_pattern = re.compile(r'^.*?[::]') punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$') + url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') + content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字 all_results = [] seen = set() for text in texts: + # 去除文本中的制表符和换行符 + text = text.replace('\t', '').replace('\n', '') + + # 删除引导性的文本(直到冒号,但保留冒号后的内容) text = intro_pattern.sub('', text) + + # 替换URL为占位符,并保存URL以便后续还原 + urls = [] + def url_replacer(match): + urls.append(match.group(0)) + return f"{{URL{len(urls)}}}" + text = url_pattern.sub(url_replacer, text) + + # 使用数字和括号的模式分割文本 items = pattern.split(text) for item in items: cleaned_item = item.strip() if cleaned_item: + # 进一步清理每个条目 cleaned_item = pattern.sub('', cleaned_item) cleaned_item = punctuation_pattern.sub('', cleaned_item) cleaned_item = cleaned_item.strip() - if cleaned_item and cleaned_item not in seen: + + # 还原URL + for i, url in enumerate(urls, 1): + cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url) + + # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符 + if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item): seen.add(cleaned_item) all_results.append(cleaned_item) @@ -93,9 +126,9 @@ def merge_pdfs(paths, output_filename): if output_path: with open(output_path, 'wb') as out: pdf_writer.write(out) - print(f"Merged PDF saved to {output_path}") + g.logger.info(f"禁止投标情形: Merged PDF saved to {output_path}") else: - print("No files to merge.") + g.logger.error("禁止投标情形: No files to merge.") return output_path def process_string_list(string_list): @@ -120,7 +153,7 @@ def process_string_list(string_list): actual_list = ast.literal_eval(formatted_list) return actual_list except SyntaxError as e: - print(f"Error parsing list: {e}") + g.logger.error(f"禁止投标情形: Error parsing list: {e}") return [] else: # 如果没有匹配到内容,返回空列表 @@ -129,15 +162,17 @@ def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须 # output_filename="merged.pdf" # paths=[truncate1,truncate4] # merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。 - - file_id=upload_file(truncate3) - # user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。" - user_query_forbidden = 
"该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。" - qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden) + if truncate3: + file_id=upload_file(truncate3) + # user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。" + user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。" + qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden) + else: + qianwen_forbidden_str="[]" actual_list=process_string_list(qianwen_forbidden_str) #提取出字符串列表 ["xxx","xx"] - - includes = ["不得存在", "禁止投标"] - forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes) + includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"] + excludes = ["招标", "评标", "定标"] + forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes,excludes) processed_results = extract_unique_items_from_texts(forbidden_results) # print(processed_results) merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) @@ -145,11 +180,12 @@ def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须 return forbidden_dict - +#TODO:不得存在的情况文中有很多内容 if __name__ == '__main__': - truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json" - clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json" - truncate4 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_qualification.pdf" - output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7" - doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.docx' - find_forbidden(truncate_json_path,clause_path,truncate4) \ No newline at end of file + truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json" + clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json" + truncate3 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e" + doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx' + res = find_forbidden(truncate_json_path, clause_path, truncate3) + print(json.dumps(res, ensure_ascii=False, indent=4)) \ No newline at end of file diff --git a/flask_app/main/读取docx.py b/flask_app/main/读取docx.py index b36bf65..e037f52 100644 --- a/flask_app/main/读取docx.py +++ b/flask_app/main/读取docx.py @@ -14,5 +14,5 @@ def read_docx(file_path): if __name__ == "__main__": - file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx" + file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx" read_docx(file_path) diff --git a/flask_app/main/资格审查模块.py b/flask_app/main/资格审查模块.py index a3d14fa..6a71fc1 100644 --- a/flask_app/main/资格审查模块.py +++ b/flask_app/main/资格审查模块.py @@ -10,13 +10,11 @@ from concurrent.futures import ThreadPoolExecutor def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表 # 形式评审、响应评审:千问 - 
print("starting形式响应评审...") file_id=upload_file(truncate1) #评标办法前附表 user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" results = qianwen_long(file_id, user_query_1) original_dict_data = extract_content_from_json(results) qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容 - print(original_dict_data) with ThreadPoolExecutor() as executor: # 创建Future对象 future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name) @@ -26,7 +24,6 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa # 等待执行结果 final_qualify_json = future_qualification.result() form_response_dict = future_form_response.result() - print("形式响应评审done") form_response_dict.update(final_qualify_json) return nest_json_under_key(form_response_dict,"资格审查") diff --git a/flask_app/main/资格评审.py b/flask_app/main/资格评审.py index 9ceceac..62adad0 100644 --- a/flask_app/main/资格评审.py +++ b/flask_app/main/资格评审.py @@ -3,6 +3,8 @@ import json import re +from flask import g + from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key from flask_app.main.多线程提问 import multi_threading,read_questions_from_file from flask_app.main.通义千问long import upload_file @@ -17,7 +19,7 @@ def merge_dictionaries_under_common_key(dicts, common_key): # 使用字典解包来合并字典 merged_dict[common_key].update(d[common_key]) else: - print(f"Warning: Dictionary does not contain the key {common_key}") + g.logger.error(f"资格评审: Warning: Dictionary does not contain the key {common_key}") return merged_dict def generate_qual_question(matching_keys_list): #这里假设资质、信誉与人员要求 要不都有、要不都没 @@ -71,9 +73,9 @@ def get_consortium_dict(knowledge_name): if response and len(response) > 1: # 检查response存在且有至少两个元素 qualify_list.append(response[1]) else: - print(f"Warning: Missing or incomplete response data for query index {_}.") + g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - print(f"Error processing response for query index {_}: {e}") + g.logger.error(f"资格评审: Error processing response for query index {_}: {e}") consortium_dict = combine_json_results(qualify_list) return consortium_dict @@ -88,9 +90,9 @@ def get_all_dict(knowledge_name): if response and len(response) > 1: # 检查response存在且有至少两个元素 qualification_list.append(response[1]) else: - print(f"Warning: Missing or incomplete response data for query index {_}.") + g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - print(f"Error processing response for query index {_}: {e}") + g.logger.error(f"资格评审: Error processing response for query index {_}: {e}") qualification_combined_res = combine_json_results(qualification_list) return {'资格评审': qualification_combined_res} def process_qualification(qualification_review,truncate3,knowledge_name): @@ -99,14 +101,14 @@ def process_qualification(qualification_review,truncate3,knowledge_name): if not matching_keys_list: #此时要求全部写在评分办法前附表中,不需要额外提取。 if not non_matching_dict: #古法提取 if truncate3!="": - print("type1") + g.logger.info("资格评审: type1") matching_keys_list=["资质条件","财务要求","业绩要求","信誉要求","其他要求"] ques=generate_qual_question(matching_keys_list) file_id2 = upload_file(truncate3) results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long res_list = [] if not results2: - print("未调用大模型询问资格评审文件要求!") + g.logger.error("资格评审: 
调用大模型未成功获取资格评审文件中的要求!") else: # 打印结果 for question, response in results2: @@ -117,11 +119,11 @@ def process_qualification(qualification_review,truncate3,knowledge_name): updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典 return updated_qualify_json else: - print("type2") + g.logger.info("资格评审: type2") return get_all_dict(knowledge_name) else: - print("type3") + g.logger.info("资格评审: type3") new_non_matching_json={'资格评审':non_matching_dict} substring = '联合体' found_key = any(substring in key for key in non_matching_dict.keys()) #没有联合体投标,则需生成,防止重复 @@ -133,18 +135,18 @@ def process_qualification(qualification_review,truncate3,knowledge_name): return new_non_matching_json elif matching_keys_list and truncate3=="": #这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' - print("type4") + g.logger.info("资格评审: type4") final_qualification=get_all_dict(knowledge_name) final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict) return final_qualify_json else: #大多数情况 - print("type5") + g.logger.info("资格评审: type5") user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’ file_id2 = upload_file(truncate3) results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long res_list = [] if not results2: - print("未调用大模型询问资格评审文件要求!") + g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!") else: # 打印结果 for question, response in results2: diff --git a/flask_app/货物标/test.py b/flask_app/货物标/test.py index 202777c..cf8e7a1 100644 --- a/flask_app/货物标/test.py +++ b/flask_app/货物标/test.py @@ -1,33 +1,115 @@ -def categorize_keys(data, includes): - # 初始化结果字典,预设'其他'分类为空字典 - result = {"其他": {}} - - # 遍历原始字典的每一个键值对 - for key, value in data.items(): - if key in includes: - # 如果键在includes列表中,直接保留这个键值对 - result[key] = value - else: - # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 - result["其他"][key] = value - - # 如果'其他'分类没有任何内容,可以选择删除这个键 - if not result["其他"]: - del result["其他"] - - return result +import json +import re -# 使用示例 -data = { - "基础信息": "详细描述", - "资格审查": "流程说明", - "商务标": "流程详情", - "技术标": "合同详细条款", - "支付方式": "支付条件说明" -} +def extract_and_format_from_paths(json_paths, includes, excludes): + """ + 从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。 -includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"] -result = categorize_keys(data, includes) + 参数: + json_paths (list): 包含多个 JSON 文件路径的列表。 + includes (list): 包含要检查的关键词的列表。 + excludes (list): 包含要排除的关键词的列表。 -print(result) + 返回: + list: 包含所有文件中满足条件的格式化字符串列表。 + """ + all_formatted_results = [] + + # 遍历每个文件路径 + for path in json_paths: + try: + with open(path, 'r', encoding='utf-8') as file: + # 加载 JSON 数据 + json_data = json.load(file) + formatted_results = [] + + # 遍历 JSON 数据的每个键值对 + for key, value in json_data.items(): + if isinstance(value, dict): + # 如果值是字典,检查嵌套字典的每个键值对 + for sub_key, sub_value in value.items(): + if any(include in sub_key for include in includes): + # 如果子值包含关键词,格式化并添加到结果列表 + formatted_results.append(f"{sub_value}") + elif isinstance(value, str): # clause + # 检查是否包含任何 include 关键词 + for include in includes: + if include in value: + # 找到 include 之前的内容 + prefix = value.split(include)[0] + # 检查 prefix 是否不包含任何 exclude 关键词 + if not any(exclude in prefix for exclude in excludes): + # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表 + if '\n' in value: + value = value.split('\n', 1)[-1] + formatted_results.append(value) + break # 找到一个符合条件的就跳出循环 + + # 将当前文件的结果添加到总结果列表 + all_formatted_results.extend(formatted_results) + except FileNotFoundError: + print(f"Error: The file '{path}' does not exist.") + except json.JSONDecodeError: + 
print(f"Error: The file '{path}' contains invalid JSON.") + + return all_formatted_results + + +def extract_unique_items_from_texts(texts): + # 更新正则表达式以包括更广泛的序号类型,包括中文序号 + pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*') + intro_pattern = re.compile(r'^.*?[::]') + punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$') + url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') + content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字 + + all_results = [] + seen = set() + + for text in texts: + # 去除文本中的制表符和换行符 + text = text.replace('\t', '').replace('\n', '') + + # 删除引导性的文本(直到冒号,但保留冒号后的内容) + text = intro_pattern.sub('', text) + + # 替换URL为占位符,并保存URL以便后续还原 + urls = [] + def url_replacer(match): + urls.append(match.group(0)) + return f"{{URL{len(urls)}}}" + text = url_pattern.sub(url_replacer, text) + + # 使用数字和括号的模式分割文本 + items = pattern.split(text) + + for item in items: + cleaned_item = item.strip() + if cleaned_item: + # 进一步清理每个条目 + cleaned_item = pattern.sub('', cleaned_item) + cleaned_item = punctuation_pattern.sub('', cleaned_item) + cleaned_item = cleaned_item.strip() + + # 还原URL + for i, url in enumerate(urls, 1): + cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url) + + # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符 + if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item): + seen.add(cleaned_item) + all_results.append(cleaned_item) + + return all_results +# 使用上面定义的函数 +truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json" +clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json" +json_paths = [truncate_json_path,clause_path] # 根据实际存放的路径来填写 +includes = ["不得存在","不得与","禁止投标","对投标人的纪律"] +excludes=["招标","评标","定标"] +# 调用函数 +results = extract_and_format_from_paths(json_paths, includes,excludes) +print(results) +res=extract_unique_items_from_texts(results) +print(res)