From 93751f7e748c336a71358b9ebe85c716e337efa7 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Sat, 12 Oct 2024 18:01:59 +0800 Subject: [PATCH] 10.12 --- flask_app/main/download.py | 72 ++- flask_app/main/format_change.py | 14 +- flask_app/main/start_up.py | 149 +++--- flask_app/main/test.py | 428 ++++-------------- flask_app/main/商务标技术标整合.py | 30 +- flask_app/main/多线程提问.py | 10 +- flask_app/main/截取pdf.py | 4 +- flask_app/main/投标人须知正文提取指定内容.py | 2 +- flask_app/main/招标文件解析.py | 9 +- flask_app/货物标/test.py | 67 ++- flask_app/货物标/基础信息解析main.py | 2 + .../投标人须知正文提取指定内容货物标版.py | 22 - .../投标人须知正文条款提取成json文件货物标版.py | 133 ++++-- .../无效标和废标和禁止投标整合货物标版.py | 26 +- flask_app/货物标/评分标准提取.py | 89 ---- flask_app/货物标/评分标准提取main.py | 229 ++++++++++ flask_app/货物标/货物标截取pdf.py | 62 ++- flask_app/货物标/货物标解析main.py | 189 +++++++- flask_app/货物标/资格审查main.py | 277 +++++++++--- 19 files changed, 1117 insertions(+), 697 deletions(-) create mode 100644 flask_app/货物标/基础信息解析main.py delete mode 100644 flask_app/货物标/评分标准提取.py create mode 100644 flask_app/货物标/评分标准提取main.py diff --git a/flask_app/main/download.py b/flask_app/main/download.py index 48d2708..22a045e 100644 --- a/flask_app/main/download.py +++ b/flask_app/main/download.py @@ -2,45 +2,67 @@ import requests import mimetypes - def download_file(url, local_filename): + """ + 下载文件并保存到本地,基于Content-Type设置文件扩展名。 + + 参数: + - url (str): 文件的URL地址。 + - local_filename (str): 本地保存的文件名(不含扩展名)。 + + 返回: + - tuple: (完整文件名, 文件类型代码) + 文件类型代码: + 1 - .docx + 2 - .pdf + 3 - .doc + 4 - 其他 + - None: 下载失败 + """ try: with requests.get(url, stream=True) as response: response.raise_for_status() # 确保请求成功,否则抛出异常 - # 获取文件类型并设置适当的文件扩展名 - content_type = response.headers.get('Content-Type') - extension = mimetypes.guess_extension(content_type, strict=False) - if not extension: - # 如果无法猜测扩展名,默认使用 .docx - extension = '.docx' - full_filename = local_filename + extension # 追加扩展名 + # 获取Content-Type并猜测文件扩展名 + content_type = response.headers.get('Content-Type', '') + extension = mimetypes.guess_extension(content_type, strict=False) or '.docx' + # 分离文件名和现有扩展名 + base, ext = os.path.splitext(local_filename) + if ext.lower() != extension: + full_filename = base + extension + else: + full_filename = local_filename + + # 下载并保存文件 with open(full_filename, 'wb') as file: for chunk in response.iter_content(chunk_size=8192): - file.write(chunk) + if chunk: # 避免写入空块 + file.write(chunk) + + # 定义扩展名到代码的映射 + extension_mapping = { + '.docx': 1, + '.pdf': 2, + '.doc': 3 + } + file_code = extension_mapping.get(extension.lower(), 4) + return full_filename, file_code - # 根据扩展名返回对应的值 - if extension == '.docx': - return full_filename,1 - elif extension == '.pdf': - return full_filename,2 - else: - return full_filename,3 except requests.HTTPError as e: - print(f"download: HTTP Error: {e}") - return None + print(f"download: HTTP 错误: {e}") except requests.RequestException as e: - print(f"download: Error downloading the file: {e}") - return None + print(f"download: 下载文件时出错: {e}") except Exception as e: - print(f"download: An error occurred: {e}") - return None + print(f"download: 发生错误: {e}") + + return None if __name__ == '__main__': # 测试下载的URL - test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1726661197&OSSAccessKeyId=TMP.3KdMWkHL1nqHypaY6LmhnzqJTRkTzuYLNNPhW9KZruLGLaM3YL7F45NfuF3JT4CszSe2FD7ZH6WZFUTumokmJsSEW6pPh6&Signature=7BIGVSb9YGYLKFBXeTQZm7QnTI8%3D" - local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\downloaded_file' - file_path = download_file(test_url, local_file_name) + test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1728751377&OSSAccessKeyId=TMP.3KeYhAGeJr2LGiNctSPvSmdSwxhrU8pbaDYRJcNCCgv8ijyWN613QahKb3nhXydfAvHaqpw4nTHXMzq7hmTHmNnPA77DgL&Signature=mwPHW8v7dVmHP1udTDL%2ByzllwCE%3D" + local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload' + file_path,file_type = download_file(test_url, local_file_name) if file_path: print(f"Downloaded file path: {file_path}") + print(file_type) diff --git a/flask_app/main/format_change.py b/flask_app/main/format_change.py index 150f80e..5f04f99 100644 --- a/flask_app/main/format_change.py +++ b/flask_app/main/format_change.py @@ -41,10 +41,10 @@ def get_filename_and_folder(file_path): #参数为要转换的文件路径,以及输出的文件名,文件类型为自动补全。 def pdf2docx(local_path_in): remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d' - receive_download_url = upload_file(local_path_in, remote_url) - filename, folder = get_filename_and_folder(local_path_in) #输入输出在同一个文件夹 - local_path_out=os.path.join(folder,filename) #输出文件名 - downloaded_filepath,file_type=download_file(receive_download_url, local_path_out) + receive_download_url = upload_file(local_path_in, remote_url) #转换完成,得到下载链接 + filename, folder = get_filename_and_folder(local_path_in) #输入输出在同一个文件夹 + local_filename=os.path.join(folder,filename) #输出文件名 C:\Users\Administrator\Desktop\货物标\zbfiles\6.2定版视频会议磋商文件 不带后缀 + downloaded_filepath,file_type=download_file(receive_download_url, local_filename) print(f"format_change p2d:have downloaded file to: {downloaded_filepath}") return downloaded_filepath @@ -52,14 +52,14 @@ def docx2pdf(local_path_in): remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p' receive_download_url = upload_file(local_path_in, remote_url) filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 - local_path_out = os.path.join(folder, filename) # 输出文件名 - downloaded_filepath,file_type = download_file(receive_download_url, local_path_out) + local_filename = os.path.join(folder, filename) # 输出文件名 + downloaded_filepath,file_type = download_file(receive_download_url, local_filename) print(f"format_change d2p:have downloaded file to: {downloaded_filepath}") return downloaded_filepath if __name__ == '__main__': # 替换为你的文件路径和API URL - local_path_in="C:\\Users\\Administrator\\Desktop\\招标文件\\output\\test111.pdf" + local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).doc" # pdf2docx(local_path_in) downloaded_file=docx2pdf(local_path_in) print(downloaded_file) diff --git a/flask_app/main/start_up.py b/flask_app/main/start_up.py index 685ea7c..e834083 100644 --- a/flask_app/main/start_up.py +++ b/flask_app/main/start_up.py @@ -8,7 +8,8 @@ from flask import Flask, request, jsonify, Response, stream_with_context, g import json import os from flask_app.main.download import download_file -from flask_app.main.招标文件解析 import main_processing +from flask_app.main.招标文件解析 import engineering_bid_main +from flask_app.货物标.货物标解析main import goods_bid_main app = Flask(__name__) @@ -74,6 +75,51 @@ def create_logger(): # logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录 # return jsonify({'error': str(e)}), 500 +# def download_and_process_file(file_url): +# logger = g.logger +# unique_id = g.unique_id +# output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径 +# filename = "ztbfile" +# downloaded_filename = os.path.join(output_folder, filename) +# +# # 下载文件,假设 download_file 函数已正确处理异常并返回文件路径 +# downloaded_filepath, file_type = download_file(file_url, downloaded_filename) +# +# if downloaded_filepath is None or file_type == 3: +# logger.error("Unsupported file type or failed to download file") +# return None, output_folder +# +# logger.info("Local file path: " + downloaded_filepath) +# processed_file_path = engineering_bid_main(output_folder, downloaded_filepath, file_type, unique_id) +# return processed_file_path, output_folder + +# def generate_response(final_json_path): +# logger = g.logger +# # 检查final_json_path是否为空或None +# if not final_json_path: +# logger.error('Empty or None path provided for final_json.') +# return jsonify({'error': 'No path provided for final_json.'}), 400 +# if not os.path.exists(final_json_path): +# logger.error('final_json not found at path: ' + final_json_path) +# return jsonify({'error': 'final_json not found'}), 404 +# with open(final_json_path, 'r', encoding='utf-8') as f: +# logger.info('final_json_path:' + final_json_path) +# zbparse_data = json.load(f) +# json_str = json.dumps(zbparse_data, ensure_ascii=False) +# return jsonify({ +# 'message': 'File uploaded and processed successfully', +# 'filename': os.path.basename(final_json_path), +# 'data': json_str +# }) + +def validate_request(default_zb_type=1): + if not request.is_json: + return jsonify({'error': 'Missing JSON in request'}), 400 + file_url = request.json.get('file_url') + zb_type = request.json.get('zb_type', default_zb_type) + if not file_url: + return jsonify({'error': 'No file URL provided'}), 400 + return file_url,zb_type # 流式 @app.route('/upload', methods=['POST']) @@ -83,12 +129,12 @@ def zbparse(): # 获取并显示接收到的 JSON 数据 received_data = request.get_json() logger.info("Received JSON data: " + str(received_data)) - file_url = validate_request() + file_url,zb_type = validate_request() if isinstance(file_url, tuple): # Check if the returned value is an error response return file_url try: logger.info("starting parsing url:" + file_url) - return Response(stream_with_context(process_and_stream(file_url)), content_type='text/event-stream') + return Response(stream_with_context(process_and_stream(file_url,zb_type)), content_type='text/event-stream') except Exception as e: logger.error('Exception occurred: ' + str(e)) return jsonify({'error': str(e)}), 500 @@ -112,17 +158,27 @@ def post_processing(data,includes): return result # 分段返回 -def process_and_stream(file_url): +def process_and_stream(file_url,zb_type): + """ + 下载文件并进行处理,支持工程标和货物标的处理。 + + 参数: + - file_url (str): 文件的URL地址。 + - zb_type (int): 标的类型,1表示工程标,2表示货物标。 + + 返回: + - generator: 生成处理过程中的流式响应。 + """ logger = g.logger unique_id = g.unique_id output_folder = f"flask_app/static/output/{unique_id}" filename = "ztbfile" downloaded_filename = os.path.join(output_folder, filename) - downloaded_filepath, file_type = download_file(file_url, downloaded_filename) - - if downloaded_filepath is None or file_type == 3: - logger.error("Unsupported file type or failed to download file") + # 下载文件 + downloaded = download_file(file_url, downloaded_filename) + if not downloaded: + logger.error("下载文件失败或不支持的文件类型") error_response = { 'message': 'File processing failed', 'filename': None, @@ -131,12 +187,31 @@ def process_and_stream(file_url): yield f"data: {json.dumps(error_response)}\n\n" return - logger.info("Local file path: " + downloaded_filepath) + downloaded_filepath, file_type = downloaded + + # 检查文件类型 + if file_type == 4: + logger.error("不支持的文件类型") + error_response = { + 'message': 'Unsupported file type', + 'filename': None, + 'data': json.dumps({'error': 'Unsupported file type'}) + } + yield f"data: {json.dumps(error_response)}\n\n" + return + + logger.info("本地文件路径: " + downloaded_filepath) combined_data = {} + # 根据zb_type选择调用的处理函数 + processing_functions = { + 1: engineering_bid_main, + 2: goods_bid_main + } + processing_func = processing_functions.get(zb_type, engineering_bid_main) # 从 main_processing 获取数据 - for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id): + for data in processing_func(output_folder, downloaded_filepath, file_type, unique_id): if not data.strip(): logger.error("Received empty data, skipping JSON parsing.") continue # Skip processing empty data @@ -160,7 +235,7 @@ def process_and_stream(file_url): } yield f"data: {json.dumps(response, ensure_ascii=False)}\n\n" # 日志记录已合并数据 - logger.info(f"Updated combined data: {json.dumps(combined_data, ensure_ascii=False, indent=4)}") + logger.info(f"合并后的数据: {json.dumps(combined_data, ensure_ascii=False, indent=4)}") # **保存 combined_data 到 output_folder 下的 'final_result.json'** output_json_path = os.path.join(output_folder, 'final_result.json') includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"] @@ -168,9 +243,9 @@ def process_and_stream(file_url): try: with open(output_json_path, 'w', encoding='utf-8') as json_file: json.dump(result, json_file, ensure_ascii=False, indent=4) - logger.info(f"Combined data saved to '{output_json_path}'") + logger.info(f"合并后的数据已保存到 '{output_json_path}'") except IOError as e: - logger.error(f"Error saving JSON file: {e}") + logger.error(f"保存JSON文件时出错: {e}") # 最后发送合并后的完整数据 complete_response = { 'message': 'Combined data', @@ -186,34 +261,6 @@ def process_and_stream(file_url): } yield f"data: {json.dumps(final_response)}\n\n" -def validate_request(): - if not request.is_json: - return jsonify({'error': 'Missing JSON in request'}), 400 - file_url = request.json.get('file_url') - if not file_url: - return jsonify({'error': 'No file URL provided'}), 400 - return file_url - - -def download_and_process_file(file_url): - logger = g.logger - unique_id = g.unique_id - output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径 - filename = "ztbfile" - downloaded_filename = os.path.join(output_folder, filename) - - # 下载文件,假设 download_file 函数已正确处理异常并返回文件路径 - downloaded_filepath, file_type = download_file(file_url, downloaded_filename) - - if downloaded_filepath is None or file_type == 3: - logger.error("Unsupported file type or failed to download file") - return None, output_folder - - logger.info("Local file path: " + downloaded_filepath) - processed_file_path = main_processing(output_folder, downloaded_filepath, file_type, unique_id) - return processed_file_path, output_folder - - @app.route('/api/test_zbparse', methods=['POST']) def test_zbparse(): try: @@ -346,26 +393,6 @@ def test_process_and_stream(): yield f"data: {json.dumps(final_response, ensure_ascii=False)}\n\n" -def generate_response(final_json_path): - logger = g.logger - # 检查final_json_path是否为空或None - if not final_json_path: - logger.error('Empty or None path provided for final_json.') - return jsonify({'error': 'No path provided for final_json.'}), 400 - if not os.path.exists(final_json_path): - logger.error('final_json not found at path: ' + final_json_path) - return jsonify({'error': 'final_json not found'}), 404 - with open(final_json_path, 'r', encoding='utf-8') as f: - logger.info('final_json_path:' + final_json_path) - zbparse_data = json.load(f) - json_str = json.dumps(zbparse_data, ensure_ascii=False) - return jsonify({ - 'message': 'File uploaded and processed successfully', - 'filename': os.path.basename(final_json_path), - 'data': json_str - }) - - # @app.route('/get_json', methods=['POST']) # def testjson(): # final_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp4\\fd55f067-2cf6-475c-b7ce-4498f6606bf6\\final_result.json" diff --git a/flask_app/main/test.py b/flask_app/main/test.py index 97ac431..a9f34be 100644 --- a/flask_app/main/test.py +++ b/flask_app/main/test.py @@ -1,368 +1,92 @@ -# -*- coding: utf-8 -*- import json -import os.path -import time -import re -from flask_app.main.json_utils import combine_json_results, nest_json_under_key -from flask_app.main.通义千问long import upload_file, qianwen_long -from concurrent.futures import ThreadPoolExecutor -from flask_app.main.禁止投标情形 import find_forbidden, process_string_list +from collections import defaultdict - -#如果当前段落有序号,则向下匹配直接遇到相同的序号样式 -#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 - -def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): - from collections import OrderedDict - from docx import Document - import re - - if isinstance(keywords, str): - keywords = [keywords] - - doc = Document(doc_path) - extracted_paragraphs = OrderedDict() - continue_collecting = False - current_section_pattern = None - active_key = None - - def match_keywords(text, patterns): - # 首先检查关键词是否匹配 - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE) - if match: - # 如果当前的模式是 '不\s*得',则额外检查是否匹配 '不得分' - if pattern == r'不\s*得': - post_match_text = text[match.end():].strip() - if post_match_text.startswith("分"): - continue # 如果是"不得分",跳过这个匹配 - return True - return False - - def extract_from_text(text, current_index): - nonlocal continue_collecting, current_section_pattern, active_key - if text == "": - return current_index - - if continue_collecting: - if current_section_pattern and re.match(current_section_pattern, text): - continue_collecting = False - active_key = None - else: - if active_key is not None: - extracted_paragraphs[active_key].append(text) - return current_index - - if match_keywords(text, keywords): - active_key = text - extracted_paragraphs[active_key] = [text] - if match_keywords(text, follow_up_keywords): - continue_collecting = True - section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text) - if section_number: - current_section_number = section_number.group(1) - level_count = current_section_number.count('.') - - # Pattern to match current level, e.g., 3.4.5 - pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+' - - # Generate patterns for next section at same level and parent level - parts = current_section_number.split('.') - matched_patterns = [pattern] # start with the full pattern - - # Next section at same level - parts[-1] = str(int(parts[-1]) + 1) - next_pattern = r'^' + r'\s*\.\s*'.join(parts) - matched_patterns.append(next_pattern) - - # Parent section (if applicable) - if len(parts) > 1: - parent_section_parts = parts[:-1] - parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) - parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) - matched_patterns.append(parent_pattern) - - # Combine the patterns - combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' - current_section_pattern = re.compile(combined_pattern) - - else: - found_next_number = False - current_section_pattern = None - - while current_index < len(doc.paragraphs) - 1: - current_index += 1 - next_text = doc.paragraphs[current_index].text.strip() - if not found_next_number: - next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) - if next_section_number: - found_next_number = True - if next_section_number.group(1): - section_parts = next_section_number.group(1).split('.') - dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' - elif next_section_number.group(2): - dynamic_pattern = r'^[\(\(]\d+[\)\)]' - current_section_pattern = re.compile(dynamic_pattern) - if current_section_pattern and re.match(current_section_pattern, next_text): - extracted_paragraphs[active_key].append(next_text) - else: - continue_collecting = False - active_key=None - break - - return current_index - - index = 0 - while index < len(doc.paragraphs): - index = extract_from_text(doc.paragraphs[index].text.strip(), index) - index += 1 - - return extracted_paragraphs - -def preprocess_text_list(text_list): - new_text_list = [] - # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 - split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') - for text in text_list: - # 使用正则表达式检查并拆分元素 - parts = split_pattern.split(text) - new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 - - return new_text_list - -def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 - all_texts1 = [] - all_texts2=[] - # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 - split_pattern = r'(?<=[。!?\!\?])' - - for key, text_list in extracted_contents.items(): - if len(text_list) == 1: - for data in text_list: - # 检查是否包含任何需要排除的字符串 - if any(exclude in data for exclude in excludes): - continue # 如果包含任何排除字符串,跳过这个数据 - # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)' - data = re.sub(pattern, '', data).strip() - keyword_match = re.search(keywords, data) - if keyword_match: - # 从关键词位置开始查找结束标点符号 - start_pos = keyword_match.start() - # 截取从关键词开始到后面的内容 - substring = data[start_pos:] - # 按定义的结束标点分割 - sentences = re.split(split_pattern, substring, 1) - if len(sentences) > 0 and sentences[0]: - # 只取第一句,保留标点 - cleaned_text = data[:start_pos] + sentences[0] #eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。 - # 经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。 - else: - cleaned_text = data # 如果没有标点,使用整个字符串 - else: - # 如果没有找到关键词,保留原文本 - cleaned_text = data - all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表 - - else: - print(text_list) - new_text_list=preprocess_text_list(text_list) - print(new_text_list) - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)' - data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 - # 将修改后的第一个元素和剩余的元素连接起来 - new_text_list[0] = data # 更新列表中的第一个元素 - joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 - all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中 - - return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 -def find_sentences_with_keywords(data, keywords, follow_up_keywords): - """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" - sentences1 = [] # 保存没有后续关键词的情况 - sentences2 = [] # 保存有后续关键词的情况 - - if isinstance(data, dict): - for value in data.values(): - result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) - sentences1.extend(result1) - sentences2.extend(result2) - elif isinstance(data, list): - for item in data: - result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) - sentences1.extend(result1) - sentences2.extend(result2) - elif isinstance(data, str): - # 分割句子,保证句子完整性(按标点符号和序号分割) - split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割 - i = 0 - while i < len(split_sentences): - sentence = split_sentences[i].strip() - if re.search(keywords, sentence, re.IGNORECASE): - follow_up_present = any( - re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) - if follow_up_present: - # 如果存在后续关键词,则从当前位置开始截取 - start_index = i - end_index = start_index - found_next_section = False - for j in range(start_index + 1, len(split_sentences)): - if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): - end_index = j - found_next_section = True - break - - if found_next_section: - full_text = ' '.join(split_sentences[start_index:end_index]).strip() - else: - full_text = ' '.join(split_sentences[start_index:]).strip() - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - data=re.sub(pattern,'',full_text) - sentences2.append(data) # 存储有后续关键词的情况 - i = end_index if found_next_section else len(split_sentences) - else: - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - data = re.sub(pattern, '', sentence).replace('\n','').strip() - sentences1.append(data) # 存储没有后续关键词的情况 - i += 1 - else: - i += 1 - - return sentences1, sentences2 # 返回两个列表 - -def extract_sentences_from_json(json_path, keywords,follow_up_keywords): - with open(json_path, 'r', encoding='utf-8') as file: - data = json.load(file) - """从JSON数据中提取包含关键词的句子。""" - return find_sentences_with_keywords(data, keywords,follow_up_keywords) - -#处理无效投标 -def extract_values_if_contains(data, includes): +def parse_json_with_duplicates(json_string): """ - 递归检查字典中的值是否包含列表 'includes' 中的内容。 - 如果包含,将这些值添加到一个列表中并返回。 + 解析具有重复键的 JSON 字符串,将所有重复的键值对存储为列表。 - 参数: - data (dict): 字典或从 JSON 解析得到的数据。 - includes (list): 包含要检查的关键词的列表。 + Args: + json_string (str): 需要解析的 JSON 字符串。 - 返回: - list: 包含满足条件的值的列表。 + Returns: + dict: 解析后的字典,重复的键对应的值为列表。 """ - included_values = [] # 初始化结果列表 + def custom_object_pairs_hook(pairs): + d = defaultdict(list) + for key, value in pairs: + # 如果值是字典或列表,递归处理 + if isinstance(value, dict): + value = process_dict(value) + elif isinstance(value, list): + value = process_list(value) + d[key].append(value) + # 将有多个值的键转换为列表,单个值的键保持原样 + return {key: (values if len(values) > 1 else values[0]) for key, values in d.items()} - # 定义递归函数来处理嵌套字典 - def recursive_search(current_data): - if isinstance(current_data, dict): - for key, value in current_data.items(): - if isinstance(value, dict): - # 如果值是字典,递归搜索 - recursive_search(value) - elif isinstance(value, str): - # 如果值是字符串,检查是否包含任何 includes 中的关键词 - if any(include in value for include in includes): - included_values.append(value) - elif isinstance(current_data, list): - for item in current_data: - # 如果是列表,递归每个元素 - recursive_search(item) + def process_dict(d): + """ + 递归处理字典,确保所有重复键的值为列表。 - # 开始递归搜索 - recursive_search(data) + Args: + d (dict): 需要处理的字典。 - return included_values + Returns: + dict: 处理后的字典。 + """ + return custom_object_pairs_hook(d.items()) + def process_list(l): + """ + 递归处理列表,确保列表中的所有字典也被处理。 -#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。 -#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 -#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", + Args: + l (list): 需要处理的列表。 -def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): - excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] - follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] - extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果 - # print(extracted_contents) - all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 - all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) - qianwen_txt = all_texts1 + all_tables1 - # Proceed only if there is content to write - if qianwen_txt: - with open(output_file, 'w', encoding='utf-8') as file: - # 初始化一个计数器 - counter = 1 - for content in qianwen_txt: - file.write("..............."+'\n') - # 写入内容前加上序号,后面接一个点和空格,然后是内容 - file.write(f"{counter}. {content}\n") - # 更新计数器,每次循环递增 - counter += 1 - file_id = upload_file(output_file) - qianwen_ans = qianwen_long(file_id, user_query) - selected_contents = set() # 使用 set 去重 - num_list = process_string_list(qianwen_ans) - print(num_list) + Returns: + list: 处理后的列表。 + """ + return [process_dict(item) if isinstance(item, dict) else item for item in l] - for index in num_list: - if index - 1 < len(qianwen_txt): - content = qianwen_txt[index - 1] # 转换序号为索引(假设序号从1开始) - selected_contents.add(content) - # 将 all_texts2 和 all_tables2 中的内容也添加到 set 中 - selected_contents.update(all_texts2) - selected_contents.update(all_tables2) - # 将 set 转换为 list 来返回结果 - res = {result_key: list(selected_contents)} - # 将结果转换为JSON字符串 - # os.remove(output_file) # Remove the file after use - # print(f"Deleted temporary file: {output_file}") - else: - res = {result_key: ""} # Set the response to empty if no contents were extracted - return res + return json.loads(json_string, object_pairs_hook=custom_object_pairs_hook) -def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path): - queries = [ - (r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', - "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", - os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"), - (r'废\s*标', - "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", - os.path.join(output_dir, "temp2.txt"), "废标项"), - (r'不\s*得|禁\s*止\s*投\s*标',"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,每条信息规定了各方不得存在的情形,请回答:在这些信息中,主语是投标人或中标人或供应商或联合体投标各方或磋商小组的信息有哪些?不要返回主语是招标人或采购人或评标委员会的信息,请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,示例返回为[1,4,6],若情况不存在,返回[]。", - os.path.join(output_dir,"temp3.txt"),"不得存在的情形") - ] - results = [] +# 示例使用 +input_string = ''' +{ + "商务评分": { + "综合实力": { + "评分": "5分", + "要求": "投标人具备有效期内的 ISO9001质量体系认证证书、具备有效期内的OHSA18001职业健康安全体系认证证书、具备有效期内的 IS014001环境管理体系认证证书、具备有效期内的 ISO20000信息技术服务体系认证证书、具备有效期内的 ISO27001信息安全体系认证证书,全部满足得 5分,每缺一项扣 1分,(开标时需提供原件)。" + }, + "综合实力": { + "评分": "2分", + "要求": "投标人具备电子与智能化工程专业承包二级资质及以上证书得 2分,不能够提供不得分(开标时需提供原件)。" + }, + "综合实力": { + "评分": "2分", + "要求": "投标人具有建筑机电安装工程专业承包三级资质或以上资质得 2分,否则不得分。(证书开标原件备查)。" + }, + "综合实力": { + "评分": "3分", + "要求": "投标人需具有健全的信息技术运维服务能力,通过ITSS信息技术服务运维标准符合性认证得 3分,投标时需提供相关证书原件予以证明,否则不得分。" + }, + "综合实力": { + "评分": "2分", + "要求": "投标人具备 CCRC信息安全服务安全集成三级及以上证书得 2分,不能够提供不得分(开标时需提供原件)。" + }, + "类似业绩": { + "评分": "4分", + "要求": "近三年(自投标截止时间前推 36个月,以合同签订日期为准)中标人作为独立承包人有已完成的类似业绩项目(建设内容含应包含会议系统设备采购项目及改造),每提供一份业绩得 2分,最多可得 4分。(业绩证明材料须提供中标公示截图、中标通知书、合同复印件,开标时备查,否则不得分。)" + }, + "质量保证": { + "评分": "2分", + "要求": "投标人所投的MCU及视频会议终端设备产品,如果不是自己生产的,需提供制造商出具的授权及满足招标质保要求的售后服务承诺函,提供得 2分;(开标时提供授权书及售后服务承诺函原件予以证明,否则不得分。)" + } + } +} +''' - # 使用线程池来并行处理查询 - with ThreadPoolExecutor() as executor: - futures = [] - for keywords, user_query, output_file, result_key in queries: - future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords, - truncate_json_path) - futures.append(future) - time.sleep(1) # 暂停1秒后再提交下一个任务 +# 解析具有重复键的 JSON 字符串 +parsed_data = parse_json_with_duplicates(input_string) - for future in futures: - results.append(future.result()) - - # #禁止投标 - # print("starting不得存在的情形...") - # forbidden_res = find_forbidden(truncate_json_path, clause_path) - # results.append(forbidden_res) - - combined_dict = {} - for d in results: - combined_dict.update(d) - - print("无效标与废标done...") - # return nest_json_under_key(combined_dict, "无效标与废标项") - return {"无效标与废标项":combined_dict} - -if __name__ == '__main__': - start_time = time.time() - truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\磋商文件_tobidders_notice_part1\\truncate_output.json" - clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause磋商文件_tobidders_notice_part2.json" - output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid" - # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' - doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件.docx' - results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path) - end_time = time.time() - print("Elapsed time:", str(end_time - start_time)) - print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) +# 打印结果 +print(json.dumps(parsed_data, ensure_ascii=False, indent=4)) diff --git a/flask_app/main/商务标技术标整合.py b/flask_app/main/商务标技术标整合.py index 1d1ede1..e454a3a 100644 --- a/flask_app/main/商务标技术标整合.py +++ b/flask_app/main/商务标技术标整合.py @@ -115,12 +115,36 @@ def combine_evaluation_standards(truncate2): user_query_2 = ( """ 根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,请以json格式返回结果,请在这三大块评分中分别用若干键值对表示具体要求,其内层的键名为'评分'及'要求',若这三大块评分中存在其他信息,则在相应评分大块中新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这3个,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。不要回答有关形式、资格、响应性评审标准的内容,若存在未知信息,填充'未知'。以下为示例输出: - "技术评分": { + { + "技术评分": { "主要监理岗位的职责": { "评分": "4分", "要求": "1、总监理工程师的职责全面、清晰、合理得 1.2-2分;一般的1.2分。2、其他主要监理人员及岗位的职责全面、清晰、合理得 1.2-2分;一般的 1.2分。" - } } + }, + "商务评分": { + "控制系统内主板": { + "评分": "10分", + "要求": "所投电梯控制系统内主板为制造商原厂原品牌制造生产且为进口部件得 10分。(提供进口部件报关单及原产地证明扫描件加盖公章,否则不得分)" + }, + "制造商技术实力": [ + { + "评分": "3分", + "要求": "一级证书得3分,二级证书得1分,其他不得分" + }, + { + "评分": "2分", + "要求": "行业销量排名连续前 2 名,得 2 分,第 4-6 名得 0.5 分,其他不得分" + } + ] + }, + "投标报价评审": { + "投标报价是否出现违反计价规范": { + "评分": "合格/不合格", + "要求": "A:投标报价未违反计价规范的,评审意见为“合格”;B:投标报价违反计价规范的,评审意见为“不合格”" + } + } + } """ ) @@ -133,7 +157,7 @@ def combine_evaluation_standards(truncate2): # return evaluation_combined_res return update_json #商务标技术标整合 if __name__ == "__main__": - truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1_evaluation_method.pdf" + truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标01_evaluation_method.pdf" evaluation_standards_res=combine_evaluation_standards(truncate2) # 从结果中提取"商务标"和"技术标" technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} diff --git a/flask_app/main/多线程提问.py b/flask_app/main/多线程提问.py index 8957a07..ec52d13 100644 --- a/flask_app/main/多线程提问.py +++ b/flask_app/main/多线程提问.py @@ -160,7 +160,9 @@ def multi_threading(queries, knowledge_name="", file_id="", llm_type=1): while not result_queue.empty(): index, result = result_queue.get() results[index] = result - + # 检查是否所有结果都是 None + if all(result is None for result in results): + return [] # 返回一个保证是列表的结构 return results @@ -194,9 +196,9 @@ if __name__ == "__main__": # for question, response in results: # print(f"Question: {question}") # print(f"Response: {response}") - - ques=["该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] - results = multi_threading(ques, "招标解析5word") + ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的?请按json格式给我提供信息,键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"] + # ques=["该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] + results = multi_threading(ques, "6.2视频会议docx") if not results: print("errror!") else: diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index e871234..cfc60e8 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -265,8 +265,8 @@ def truncate_pdf_multiple(input_path, output_folder): if __name__ == "__main__": # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf" # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74" - input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标02.pdf" - output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\test4" + input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.pdf" + output_folder="C:\\Users\\Administrator\\Desktop\\招标文件" truncate_pdf_multiple(input_path,output_folder) # selection = 3 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标 # generated_files = truncate_pdf_main(input_path, output_folder, selection) diff --git a/flask_app/main/投标人须知正文提取指定内容.py b/flask_app/main/投标人须知正文提取指定内容.py index cef8bc7..5845a9d 100644 --- a/flask_app/main/投标人须知正文提取指定内容.py +++ b/flask_app/main/投标人须知正文提取指定内容.py @@ -256,7 +256,7 @@ def extract_from_notice(clause_path, type): data = json.load(file) extracted_data = extract_json(data, target_values) # 读取json # print(json.dumps(extracted_data,ensure_ascii=False,indent=4)) - sorted_data = sort_clean_data_keys(extracted_data) # 对键进行排序 + sorted_data = sort_clean_data_keys(extracted_data) # 对输入的字典 data 的键进行预处理和排序 transformed_data = transform_json(sorted_data) final_result = process_nested_data(transformed_data) return final_result diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py index a63da41..68e2f02 100644 --- a/flask_app/main/招标文件解析.py +++ b/flask_app/main/招标文件解析.py @@ -27,7 +27,6 @@ def get_global_logger(unique_id): logger=None -# 可能有问题:pdf转docx导致打勾符号消失 # 创建全局线程池 executor = ThreadPoolExecutor() def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): @@ -64,7 +63,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): # 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象 return { - 'input_file_path': downloaded_file_path, + 'file_path': downloaded_file_path, 'output_folder': output_folder, 'truncate0': truncate0, 'truncate1': truncate1, @@ -181,7 +180,7 @@ def fetch_bid_opening(clause_path): # 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], # processed_data['truncate3'], # processed_data['knowledge_name'], processed_data['truncate0_jsonpath'], -# processed_data['clause_path'],processed_data['input_file_path'],processed_data['output_folder']), +# processed_data['clause_path'],processed_data['file_path'],processed_data['output_folder']), # 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), # 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], # output_folder, processed_data['truncate0_jsonpath'], @@ -216,7 +215,7 @@ def fetch_bid_opening(clause_path): #分段返回 -def main_processing(output_folder, downloaded_file_path, file_type, unique_id): +def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): global logger logger = get_global_logger(unique_id) @@ -271,7 +270,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id): 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], processed_data['truncate3'], knowledge_name, processed_data['truncate0_jsonpath'], - processed_data['clause_path'], processed_data['input_file_path'], + processed_data['clause_path'], processed_data['file_path'], processed_data['output_folder']), } diff --git a/flask_app/货物标/test.py b/flask_app/货物标/test.py index 74731da..2d86d43 100644 --- a/flask_app/货物标/test.py +++ b/flask_app/货物标/test.py @@ -1,14 +1,59 @@ -def test_append_newline(): - def check_append_newline(key): - append_newline = len(key.split('.')) == 2 - return append_newline +import re - # 测试用例 - test_cases = ["1.1", "1."] - for case in test_cases: - result = check_append_newline(case) - print(f"序号 '{case}': append_newline = {result}") +def generate_questions(input_list): + template = ( + "关于'{key}',{value}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。" + ) -# 运行测试 -test_append_newline() \ No newline at end of file + questions = [] + for input_dict in input_list: + for key, value in input_dict.items(): + processed_value = preprocess_value(value) + question = template.format(key=key, value=processed_value) + questions.append(question) + return questions + +def preprocess_value(value): + # 使用正则表达式查找"第X章"或"第X款" + chapter_match = re.search(r'第(.+?)章', value) + clause_match = re.search(r'第(.+?)款', value) + + if chapter_match or clause_match: + # 以逗号、句号、问号、感叹号为分隔符 + separators = r'[,。?!,\?!]' + + # 分隔符检测函数,确保括号成对闭合时才用作分隔符 + def is_separator(ch, count): + return count['('] == count[')'] and count['('] == count[')'] and re.match(separators, ch) + + parts = [] + current_part = [] + count = {'(': 0, ')': 0, '(': 0, ')': 0} + + for ch in value: + if ch in count: + count[ch] += 1 + if is_separator(ch, count): + parts.append("".join(current_part).strip()) + current_part = [] + else: + current_part.append(ch) + + if current_part: + parts.append("".join(current_part).strip()) + + # 查找包含章节或条款的部分 + target_part = next((part for part in parts if '章' in part or '款' in part), None) + + if target_part: + # 删除开头的"符合"或"应满足" + target_part = re.sub(r'^(符合|应满足)\s*', '', target_part.strip()) + return target_part + + # 如果没有找到特定章节或条款,返回原始值 + return value + +input_list=[{'资格性审查标准.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}] +res=generate_questions(input_list) +print(res) \ No newline at end of file diff --git a/flask_app/货物标/基础信息解析main.py b/flask_app/货物标/基础信息解析main.py new file mode 100644 index 0000000..e988e69 --- /dev/null +++ b/flask_app/货物标/基础信息解析main.py @@ -0,0 +1,2 @@ +def combine_basic_info(): + return True \ No newline at end of file diff --git a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py index 4999566..451ed97 100644 --- a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py +++ b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py @@ -2,7 +2,6 @@ import json import re #提取两个大标题之间的内容 - def extract_between_sections(data, target_values): target_found = False extracted_data = {} @@ -49,25 +48,6 @@ def process_with_outer_key(data): processed_data[outer_key] = processed_inner_data return processed_data -def sort_clean_data_keys(data): - # 预处理:删除键名中的空格 - def preprocess_key(key): - return re.sub(r'\s+', '', key) - - # 将键转换成由整数构成的元组,作为排序依据 - def key_func(key): - return tuple(int(part) for part in re.split(r'\D+', key) if part) - - # 创建一个新的字典,键名经过预处理 - preprocessed_data = {preprocess_key(key): value for key, value in data.items()} - - # 对预处理后的字典键进行排序 - sorted_keys = sorted(preprocessed_data.keys(), key=key_func) - - # 创建一个新的字典,按照排序后的键添加键值对 - sorted_data = {key: preprocessed_data[key] for key in sorted_keys} - - return sorted_data # 转换结构化的JSON数据 def transform_json(data): @@ -230,8 +210,6 @@ def extract_from_notice(clause_path, type): data = json.load(file) extracted_data = extract_between_sections(data, target_values) # 读取json transformed_data=process_with_outer_key(extracted_data) - # sorted_data=sort_clean_data_keys(extracted_data) #对键进行排序 - # transformed_data = transform_json(extracted_data) final_result=process_nested_data(transformed_data) return final_result diff --git a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py index 588d86b..fabedba 100644 --- a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py +++ b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py @@ -32,18 +32,15 @@ def extract_text_from_pdf(file_path, start_word, end_pattern): # 在最后一页查找结束位置 if i == len(pdf_document.pages) - 1: - for pattern in end_pattern: - matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE)) - if matches: - end_index = matches[-1].start() - cleaned_text = cleaned_text[:end_index] - break + matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE)) + if matches: + end_index = matches[-1].start() + cleaned_text = cleaned_text[:end_index] all_pages_text.append(cleaned_text) # 合并所有页面的文本 full_text = "\n".join(all_pages_text) - # print(full_text) return full_text #fitz库版本 @@ -246,6 +243,84 @@ def parse_text_by_heading(text): return data +#type=2时提取货物标的第一章招标公告时采用该逻辑 +def parse_text_to_dict(text): + """ + 解析文本,根据大标题划分内容,生成字典。 + + 参数: + text (str): 要解析的文本。 + + 返回: + dict: 大标题作为键,内容作为值的字典。 + """ + # 正则表达式模式:匹配以一至十的汉字数字开头,后跟顿号和任意字符,且位于行首 + pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE) + + # 使用 re.finditer 找到所有大标题的位置 + matches = list(pattern.finditer(text)) + + result = {} + for i, match in enumerate(matches): + title = match.group(1).strip() # 获取大标题文本 + start = match.end() # 内容的起始位置 + + if i + 1 < len(matches): + end = matches[i + 1].start() # 下一个大标题的起始位置 + else: + end = len(text) # 最后一个大标题,内容到文本末尾 + + content = text[start:end].strip() # 获取内容并去除前后空白 + # 规范化换行符,并移除每行开头和结尾的空白 + content = content.replace('\r\n', '\n') # 统一换行符 + content = re.sub(r'[ \t]+\n', '\n', content) # 移除行尾空白 + content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE) # 移除行首和行尾空白 + content = clean_content(content) # 处理内容中的换行符 + result[title] = content + + return result +def clean_content(content): + """ + 处理内容中的换行符: + - 保留在子项编号前的换行符。 + - 保留在冒号 ':' 或全角冒号 ':' 前的第一个换行符。 + - 移除其他位置的换行符,不留下额外的空格。 + + 参数: + content (str): 要处理的内容字符串。 + + 返回: + str: 处理后的内容字符串。 + """ + # 定义子项编号的正则模式,包括: + # - 数字+点号+数字(如 1.1 或 1.1) + # - 数字+顿号(如 2、) + # - 点号+数字(如 .3 或 .3) + # - 数字+右括号(如 1) 或 1)) + # - 圆括号包围的数字(如 (5)) + # - 全角圆括号包围的数字(如 (5)) + # - 数字+点号(如 1. 或 1.) + numbering_pattern = r'(?:\d+[..]\d+(?:[..]\d+)*|\d+、|[..]\d+|\d+[))]|\(\d+\)|(\d+)|\d+[..])' + + # 定义需要保留换行符的情况: + # 1. 换行符前面是子项编号 + # 2. 换行符前面是任意非空白字符,且后面跟着 ':' 或 ':' + pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^:\n\r\f\v]+[::]))' + + # 定义占位符,用于暂时替换需要保留的换行符 + placeholder = "___PLACEHOLDER___" + + # Step 1: 将需要保留的换行符替换为占位符 + content_with_placeholder = re.sub(pattern_keep, placeholder, content) + + # Step 2: 移除所有剩余的换行符 + content_no_newlines = content_with_placeholder.replace('\n', '') + + # Step 3: 将占位符替换回换行符 + cleaned_content = content_no_newlines.replace(placeholder, '\n') + + return cleaned_content + # def convert_to_json(file_path, start_word, end_phrases): # if file_path.endswith('.docx'): # text = extract_text_from_docx(file_path) @@ -263,19 +338,21 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json return "" if type == 1: start_word = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)' - end_pattern = [r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'] + end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' + text = extract_text_from_pdf(file_path, start_word, end_pattern) + result = parse_text_by_heading(text) else: - start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:' - end_pattern=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表'] - text = extract_text_from_pdf(file_path, start_word, end_pattern) - result = parse_text_by_heading(text) + start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*' + end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)' + text = extract_text_from_pdf(file_path, start_word, end_pattern) + result=parse_text_to_dict(text) # result = convert_to_json(input_path, start_word, end_pattern) # 检查输出文件夹是否存在,如果不存在则创建 if not os.path.exists(output_folder): os.makedirs(output_folder) print(f"Created output folder: {output_folder}") - # file_name = "clause1.json" if type == 1 else "clause2.json" - file_name = f"clause{suffix_counter}.json" + file_name = "clause1.json" if type == 1 else "clause2.json" + # file_name = f"clause{suffix_counter}.json" output_path = os.path.join(output_folder, file_name) with open(output_path, 'w', encoding='utf-8') as f: json.dump(result, f, indent=4, ensure_ascii=False) @@ -303,18 +380,18 @@ def process_folder(input_folder, output_folder): #TODO:标题'一'应该越来越大,与'1.1'的逻辑分开'; #TODO:911904c3-4d6e-47e0-acd8-b05e582209cb文件夹运行有问题,百炼出错了 if __name__ == "__main__": - # # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' - # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\竞争性谈判文件(3)_tobidders_notice_part2.pdf' - # # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' - # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2' - # try: - # output_path = convert_clause_to_json(file_path,output_folder) - # print(f"Final JSON result saved to: {output_path}") - # except ValueError as e: - # print("Error:", e) + # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' + file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output5\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件_notice.pdf' + # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' + output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output5\\tmp2' + try: + output_path = convert_clause_to_json(file_path,output_folder,2) + print(f"Final JSON result saved to: {output_path}") + except ValueError as e: + print("Error:", e) - input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4' - output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1' - - # 调用 process_folder 来处理整个文件夹中的所有文件 - process_folder(input_folder, output_folder) + # input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4' + # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1' + # + # # 调用 process_folder 来处理整个文件夹中的所有文件 + # process_folder(input_folder, output_folder) diff --git a/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py b/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py index 1c8756f..f2dec80 100644 --- a/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py +++ b/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py @@ -31,7 +31,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): match = re.search(pattern, text, re.IGNORECASE) if match: # 如果当前的模式是 '不\s*得',则额外检查是否匹配 '不得分' - if pattern == r'不\s*得': + if '不\s*得' in pattern: # 使用字符串匹配检查模式 post_match_text = text[match.end():].strip() if post_match_text.startswith("分"): continue # 如果是"不得分",跳过这个匹配 @@ -259,6 +259,12 @@ def extract_table_with_keywords(data, keywords, follow_up_keywords): while i < len(split_sentences): sentence = split_sentences[i].strip() if re.search(keywords, sentence, re.IGNORECASE): + # 处理 "不\s*得" 的特殊情况,跳过包含 "不得分" 的句子 + if re.search(r'不\s*得', sentence, re.IGNORECASE): + post_match_text = sentence[re.search(r'不\s*得', sentence).end():].strip() + if post_match_text.startswith("分"): + i += 1 # 跳过这个句子 + continue follow_up_present = any( re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords ) @@ -334,13 +340,13 @@ def extract_values_if_contains(data, includes): #以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 #以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", -def handle_query(file_path, user_query, output_file, result_key, keywords,truncate_file): +def handle_query(file_path, user_query, output_file, result_key, keywords): excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:","我方"] follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果 all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 - table_data_list=read_docx_last_column(truncate_file) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息' - # table_data_list=read_tables_from_docx(file_path) + # table_data_list=read_docx_last_column(truncate_file) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息' + table_data_list=read_tables_from_docx(file_path) all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords,follow_up_keywords) qianwen_txt = all_texts1 + all_tables1 # Proceed only if there is content to write @@ -376,7 +382,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords,trunca return res -def combine_find_invalid(file_path, output_dir,truncate_file): +def combine_find_invalid(file_path, output_dir): queries = [ (r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", @@ -393,7 +399,7 @@ def combine_find_invalid(file_path, output_dir,truncate_file): with ThreadPoolExecutor() as executor: futures = [] for keywords, user_query, output_file, result_key in queries: - future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords,truncate_file) + future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords) futures.append(future) time.sleep(1) # 暂停1秒后再提交下一个任务 @@ -416,12 +422,12 @@ def combine_find_invalid(file_path, output_dir,truncate_file): if __name__ == '__main__': start_time = time.time() # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json" - truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件(实高电子显示屏)_tobidders_notice_part1.docx" - clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause招标文件(实高电子显示屏)_tobidders_notice_part2.json" + # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件(实高电子显示屏)_tobidders_notice_part1.docx" + clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause招标文件(广水市教育局封闭管理项目二次)_tobidders_notice_part2.json" output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid" # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' - doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(实高电子显示屏).docx' - results = combine_find_invalid(doc_path, output_dir,truncate_file) + doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(广水市教育局封闭管理项目二次).docx' + results = combine_find_invalid(doc_path, output_dir) end_time = time.time() print("Elapsed time:", str(end_time - start_time)) print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/货物标/评分标准提取.py b/flask_app/货物标/评分标准提取.py deleted file mode 100644 index 71a46dc..0000000 --- a/flask_app/货物标/评分标准提取.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- encoding:utf-8 -*- -import json - -from flask_app.main.通义千问long import upload_file, qianwen_long -from flask_app.main.json_utils import clean_json_string -def combine_technical_and_business(data, target_values): - extracted_data = {} # 根级别存储所有数据 - technical_found = False - business_found = False - - def extract_nested(data, parent_key='', is_technical=False, is_business=False): - nonlocal technical_found, business_found - if isinstance(data, dict): - for key, value in data.items(): - current_key = f"{parent_key}.{key}" if parent_key else key - - # 检查是否为技术标的内容 - if any(target in key for target in target_values): - if not is_technical: - extracted_data[key] = value - technical_found = True - continue - - # 默认其他所有内容都归为商务标 - else: - if not is_business: - if '商务标' not in extracted_data: - extracted_data['商务标'] = {} - extracted_data['商务标'][key] = value - business_found = True - continue - - if isinstance(value, dict) or isinstance(value, list): - extract_nested(value, current_key, is_technical, is_business) - - elif isinstance(data, list): - for index, item in enumerate(data): - extract_nested(item, f"{parent_key}[{index}]", is_technical, is_business) - - extract_nested(data) - - if not technical_found: - extracted_data['技术标'] = '' - if not business_found: - extracted_data['商务标'] = '' - - return extracted_data - -#如果外键直接是'评分因素',应该这个函数可以提取其中内容。 -def process_data_based_on_key(data, word): - # 获取字典的键列表 - keys = list(data.keys()) - # 检查键的数量是否为1并且该键是否包含指定的词 - if len(keys) == 1 and word in keys[0]: - # 返回内层的字典 - return data[keys[0]] - # 如果条件不满足,则返回原始字典 - return data - -def get_evaluation_standards(truncate_file): - file_id = upload_file(truncate_file) - user_query = "根据该文档中的评标办法前附表或者评分标准表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,外层键名分别为'技术标','商务标','投标报价'。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果,如果该采购活动有多个包,则最外层键名为对应的包名。请不要回答有关资格审查的内容" - evaluation_res = qianwen_long(file_id, user_query) - cleaned_evaluation_res = clean_json_string(evaluation_res) - result_data = process_data_based_on_key(cleaned_evaluation_res, '评分') - include = ['一包', '二包', '三包', '四包', '五包'] - target_values = ['技术', '设计', '实施'] - updated_jsons = {} - - # 检查是否有外层键匹配include列表 - if any(key for key in result_data if any(included in key for included in include)): - # 有匹配的项,处理这些项 - for key in result_data: - if any(item in key for item in include): - inner_dict = result_data[key] - updated_jsons[key] = combine_technical_and_business(inner_dict, target_values) - else: - # 没有匹配的项,对整个字典运行 - updated_jsons = combine_technical_and_business(result_data, target_values) - - # 将updated_jsons转换为JSON格式 - evaluation_combined_res = json.dumps(updated_jsons, ensure_ascii=False, indent=4) - return evaluation_combined_res - - -if __name__ == "__main__": - truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\竞争性谈判文件(3)_evaluation_method.pdf" - res=get_evaluation_standards(truncate_file) - print(res) diff --git a/flask_app/货物标/评分标准提取main.py b/flask_app/货物标/评分标准提取main.py new file mode 100644 index 0000000..b3c7523 --- /dev/null +++ b/flask_app/货物标/评分标准提取main.py @@ -0,0 +1,229 @@ +# -*- encoding:utf-8 -*- +import json +import re +from collections import defaultdict +from flask_app.main.通义千问long import upload_file, qianwen_long + + +def combine_technical_and_business(data, target_values): + extracted_data = {} # 根级别存储所有数据 + technical_found = False + business_found = False + + def extract_nested(data, parent_key='', is_technical=False, is_business=False): + nonlocal technical_found, business_found + if isinstance(data, dict): + for key, value in data.items(): + current_key = f"{parent_key}.{key}" if parent_key else key + + # 检查是否为技术标的内容 + if any(target in key for target in target_values): + if not is_technical: + extracted_data[key] = value + technical_found = True + continue + + # 默认其他所有内容都归为商务标 + else: + if not is_business: + if '商务标' not in extracted_data: + extracted_data['商务标'] = {} + extracted_data['商务标'][key] = value + business_found = True + continue + + if isinstance(value, dict) or isinstance(value, list): + extract_nested(value, current_key, is_technical, is_business) + + elif isinstance(data, list): + for index, item in enumerate(data): + extract_nested(item, f"{parent_key}[{index}]", is_technical, is_business) + + extract_nested(data) + + if not technical_found: + extracted_data['技术标'] = '' + if not business_found: + extracted_data['商务标'] = '' + + return extracted_data + + +# 防止外键只有一个'一包'的情况 +def process_data_based_on_key(data): + exclude_word = ["包", "未知", "评分因素"] + # 获取字典的键列表 + keys = list(data.keys()) + # 检查键的数量是否为1并且 exclude_word 中的任何词包含在 keys[0] 中 + if len(keys) == 1 and any(word in keys[0] for word in exclude_word): + # 返回内层的字典 + return data[keys[0]] + + # 如果条件不满足,则返回原始字典 + return data + + +def parse_json_with_duplicates(raw_string): + """ + 解析具有重复键的 JSON 字符串,将所有重复的键值对存储为列表。 + + Args: + json_string (str): 需要解析的 JSON 字符串。 + + Returns: + dict: 解析后的字典,重复的键对应的值为列表。 + + eg:输入:"综合实力": { + "评分": "2分", + "要求": "投标人具备电子与智能化工程专业承包二级资质及以上证书得 2分,不能够提供不得分(开标时需提供原件)。" + }, + "综合实力": { + "评分": "2分", + "要求": "投标人具有建筑机电安装工程专业承包三级资质或以上资质得 2分,否则不得分。(证书开标原件备查)。" + } + 输出:"综合实力": [ + { + "评分": "2分", + "要求": "投标人具备电子与智能化工程专业承包二级资质及以上证书得 2分,不能够提供不得分(开标时需提供原件)。" + }, + { + "评分": "2分", + "要求": "投标人具有建筑机电安装工程专业承包三级资质或以上资质得 2分,否则不得分。(证书开标原件备查)。" + }] + """ + + def custom_object_pairs_hook(pairs): + d = defaultdict(list) + for key, value in pairs: + # 如果值是字典或列表,递归处理 + if isinstance(value, dict): + value = process_dict(value) + elif isinstance(value, list): + value = process_list(value) + d[key].append(value) + # 将有多个值的键转换为列表,单个值的键保持原样 + return {key: (values if len(values) > 1 else values[0]) for key, values in d.items()} + + def process_dict(d): + """ + 递归处理字典,确保所有重复键的值为列表。 + + Args: + d (dict): 需要处理的字典。 + + Returns: + dict: 处理后的字典。 + """ + return custom_object_pairs_hook(d.items()) + + def process_list(l): + """ + 递归处理列表,确保列表中的所有字典也被处理。 + + Args: + l (list): 需要处理的列表。 + + Returns: + list: 处理后的列表。 + """ + return [process_dict(item) if isinstance(item, dict) else item for item in l] + + """输入字符串,提取 { 和 } 之间的内容,并将其解析为字典""" + if not raw_string.strip(): + return {} + match = re.search(r'\{[\s\S]*\}', raw_string) + if match: + try: + json_string = match.group(0) + return json.loads(json_string, object_pairs_hook=custom_object_pairs_hook) + except json.JSONDecodeError as e: + print(f"json_utils: extract_content_from_json: JSON decode error: {e}") + return {} + else: + print("json_utils: extract_content_from_json: No valid JSON content found.") + return {} + + +def combine_evaluation_standards(truncate_file): + file_id = upload_file(truncate_file) + user_query1 = "根据该文档,你判断它是否有具体的关于技术评分或商务评分或投标报价的评分要求,如果有,返回'是',否则返回'否'。" # 应对竞争性谈判这种无评分要求的情况 + judge_res = qianwen_long(file_id, user_query1) + # 默认 judge 为 True + judge = True + # 检查 judge_res 的内容 + if '否' in judge_res: + judge = False + if judge: + # 执行 user_query 相关的逻辑 + # user_query = "根据该文档中的评标办法前附表或者评分标准表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,外层键名分别为'技术评分','商务评分','投标报价'。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果,如果该采购活动有多个包,则最外层键名为对应的包名。请不要回答有关资格审查的内容" + user_query = ( + """ + 根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,请在这三大块评分中分别用若干键值对表示具体要求,请精确到具体的评审项,内层的键名为'评分'及'要求',若这三大块评分中存在其他信息,则在相应评分大块中新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这3个,则返回文档中给定的评分内容(因素)以及它们的具体评分要求,如果该采购活动有多个包,则最外层键名为对应的包名,否则不需要。不要回答有关资格审查的内容,若存在未知信息,填充'未知'。以下为示例输出: + { + "一包": { + "技术评分": { + "主要监理岗位的职责": { + "评分": "4分", + "要求": "1、总监理工程师的职责全面、清晰、合理得 1.2-2分;一般的1.2分。2、其他主要监理人员及岗位的职责全面、清晰、合理得 1.2-2分;一般的 1.2分。" + } + }, + "商务评分": { + "控制系统内主板": { + "评分": "10分", + "要求": "所投电梯控制系统内主板为制造商原厂原品牌制造生产且为进口部件得 10分。(提供进口部件报关单及原产地证明扫描件加盖公章,否则不得分)" + }, + "制造商技术实力": [ + { + "评分": "3分", + "要求": "一级证书得3分,二级证书得1分,其他不得分" + }, + { + "评分": "2分", + "要求": "行业销量排名连续前 2 名,得 2 分,第 4-6 名得 0.5 分,其他不得分" + } + ] + }, + "投标报价评审": { + "投标报价是否出现违反计价规范": { + "评分": "合格/不合格", + "要求": "A:投标报价未违反计价规范的,评审意见为“合格”;B:投标报价违反计价规范的,评审意见为“不合格”" + } + } + } +} + """ + ) + # 执行第二个查询 + evaluation_res = qianwen_long(file_id, user_query) + cleaned_evaluation_res = parse_json_with_duplicates(evaluation_res) + result_data = process_data_based_on_key(cleaned_evaluation_res) + include = ['一包', '二包', '三包', '四包', '五包'] + target_values = ['技术', '设计', '实施'] + updated_jsons = {} + + # 检查是否有外层键匹配 include 列表 + if any(key for key in result_data if any(included in key for included in include)): + # 有匹配的项,处理这些项 + for key in result_data: + if any(item in key for item in include): + inner_dict = result_data[key] + updated_jsons[key] = combine_technical_and_business(inner_dict, target_values) + else: + # 没有匹配的项,对整个字典运行 + updated_jsons = combine_technical_and_business(result_data, target_values) + return updated_jsons + else: + # 如果 judge 是 False,直接返回默认的技术标和商务标的结构 + result_data = {} + result_data['技术标'] = '' + result_data['商务标'] = '' + return result_data + + +if __name__ == "__main__": + # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件_evaluation_method.pdf" + # truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件(统计局智能终端二次招标)_evaluation_method.pdf" + # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新(W改)_evaluation_method.pdf" + truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\622二次视频会议磋商文件_evaluation_method.pdf" + res = combine_evaluation_standards(truncate_file) + print(json.dumps(res, ensure_ascii=False, indent=4)) diff --git a/flask_app/货物标/货物标截取pdf.py b/flask_app/货物标/货物标截取pdf.py index ca38048..d540fac 100644 --- a/flask_app/货物标/货物标截取pdf.py +++ b/flask_app/货物标/货物标截取pdf.py @@ -196,20 +196,27 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt return generated_files -def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,output_suffix="normal"): +#默认逻辑是start_page匹配上就不再设置了,一般不匹配上目录的原因是设置了begin_page=5,但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。 +def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, + output_suffix="normal"): start_page = None end_page = None for i, page in enumerate(pdf_document.pages): text = page.extract_text() or "" cleaned_text = clean_page_content(text, common_header) - if output_suffix=="tobidders_notice": + if output_suffix == "tobidders_notice": if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and start_page is not None: continue else: if exclusion_pattern and re.search(exclusion_pattern, cleaned_text): continue - if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: - start_page = i + if output_suffix == "notice": + if re.search(begin_pattern, cleaned_text) and i > begin_page: + start_page = i + else: + if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: + start_page = i + if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page: end_page = i break @@ -237,7 +244,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter # 原有的处理逻辑保持不变 if output_suffix == "qualification1": exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') - start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern) + start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern,output_suffix) if start_page is None or end_page is None: print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header) @@ -260,7 +267,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, beg if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: start_page = i if start_page is not None and mid_page is None and re.search( - r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)', cleaned_text, re.MULTILINE): + r'^\s*[((]?\s*[一1]\s*[))]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text, re.MULTILINE): mid_page = i if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page: end_page = i @@ -300,6 +307,15 @@ def get_patterns_for_qualification(): return begin_pattern_new, end_pattern_new +def get_patterns_for_notice(): + begin_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE + ) + end_pattern = re.compile( + r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', re.MULTILINE + ) + return begin_pattern, end_pattern + def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header): #投标人须知前附表/正文二次提取 begin_pattern = re.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知前附表)+' @@ -331,17 +347,23 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') pdf_document = PdfReader(pdf_path) patterns = None + begin_page=0 if output_suffix == "procurement": patterns = [get_patterns_for_procurement()] + begin_page=5 elif output_suffix == "evaluation_method" or output_suffix=="qualification2" or output_suffix=="qualification3": patterns = [get_patterns_for_evaluation_method()] + begin_page = 5 elif output_suffix == "qualification1": patterns = [get_patterns_for_qualification()] # This now returns a tuple of pattern pairs - + begin_page = 5 + elif output_suffix == "notice": + patterns=[get_patterns_for_notice()] + begin_page = 0 # Try each set of patterns until a valid range is found for pattern_pair in patterns: - start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], 5, common_header, - exclusion_pattern) + start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page, common_header, + exclusion_pattern,output_suffix) if start_page is not None and end_page is not None: break if start_page is None or end_page is None: @@ -404,14 +426,23 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau elif selection ==4: #投标人须知前附表和正文 begin_page=1 begin_pattern = re.compile( - r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', re.MULTILINE + r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)', re.MULTILINE ) end_pattern=re.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE ) local_output_suffix = "tobidders_notice" + elif selection==5: + begin_page=0 + begin_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*' + ) + end_pattern = re.compile( + r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)' + ) + local_output_suffix = "notice" else: - print("无效的选择") + print("无效的选择:请选择1-6") return None # 如果传入的 output_suffix 是 'default',则使用本地生成的 output_suffix @@ -425,7 +456,7 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau def truncate_pdf_multiple(input_path, output_folder): truncate_files = [] - for selection in range(1, 5): + for selection in range(1, 6): files = truncate_pdf_main(input_path, output_folder, selection) truncate_files.extend(files) return truncate_files @@ -434,7 +465,8 @@ def truncate_pdf_multiple(input_path, output_folder): # TODO:交通智能系统和招标(1)(1)文件有问题 sele=4的时候excludsion有问题 if __name__ == "__main__": input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" - output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4" - # truncate_pdf_multiple(input_path,output_folder) - selection = 4 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2 4.投标人须知前附表 + output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\outputtest" + # files=truncate_pdf_multiple(input_path,output_folder) + # print(files) + selection = 1 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 generated_files = truncate_pdf_main(input_path, output_folder, selection) diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index 4624a6f..c6d3570 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -1,13 +1,182 @@ +#竞磋 竞谈 磋商 询价 邀请 单一来源 +import json + from flask_app.main.format_change import docx2pdf, pdf2docx -from flask_app.货物标.货物标截取pdf import truncate_pdf_main -def main_processing(output_folder,file_path,file_type, unique_id): - if file_type==1: - docx_path=file_path - pdf_path = docx2pdf(file_path) - elif file_type==2: - pdf_path=file_path - docx_path=pdf2docx(file_path) +from flask_app.main.json_utils import transform_json_values +from flask_app.货物标.基础信息解析main import combine_basic_info +from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice +from flask_app.货物标.货物标截取pdf import truncate_pdf_multiple +from concurrent.futures import ThreadPoolExecutor +import concurrent.futures +from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge +from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json +from flask_app.货物标.无效标和废标和禁止投标整合货物标版 import combine_find_invalid +from flask_app.货物标.资格审查main import combine_qualification_review +from flask_app.货物标.评分标准提取main import combine_evaluation_standards +import logging +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + +logger=None + +# 创建全局线程池 +executor = ThreadPoolExecutor() +def preprocess_files(output_folder, file_path, file_type, unique_id): + logger.info("starting 文件预处理...") + logger.info("output_folder..." + output_folder) + + # 根据文件类型处理文件路径 + if file_type == 1: # docx + docx_path = file_path + pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + elif file_type == 2: # pdf + pdf_path = file_path + docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 else: - print("未传入指定格式的文件!") + logger.error("Unsupported file type provided. Preprocessing halted.") return None - truncate_file=truncate_pdf_main(pdf_path,output_folder,1) + + # 异步上传知识库 + future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id) + # 调用截取PDF多次 + truncate_files = truncate_pdf_multiple(pdf_path, output_folder) #index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文 + + # 处理各个部分 + invalid_docpath = docx_path #docx截取无效标部分 + procurement_path = truncate_files[0] #商务技术服务要求 + evaluation_method_path = truncate_files[1] #评标办法 + qualification_path=truncate_files[2] #资格审查 + tobidders_notice_path = truncate_files[4] #投标人须知正文 + notice_path=truncate_files[5] + clause_path = convert_clause_to_json(tobidders_notice_path, output_folder) # 投标人须知正文条款pdf->json + + logger.info("文件预处理done") + + # 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象 + return { + 'file_path': file_path, + 'output_folder': output_folder, + 'procurement_path': procurement_path, + 'evaluation_method_path': evaluation_method_path, + 'qualification_path': qualification_path, + 'notice_path':notice_path, + 'knowledge_future': future_knowledge, # 返回 Future 对象 + 'clause_path': clause_path, + 'invalid_docpath': invalid_docpath + } +def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表 + logger.info("starting基础信息...") + basic_res = combine_basic_info() + logger.info("基础信息done") + return basic_res + +def fetch_qualification_review(output_folder,qualification_path,notice_path,knowledge_name): #资格审查 + logger.info("starting资格审查...") + review_standards_res = combine_qualification_review(output_folder,qualification_path,notice_path, knowledge_name) + logger.info("资格审查done") + return review_standards_res + +def fetch_evaluation_standards(evaluation_method_path): # 评标细则 + logger.info("starting 商务评分和技术评分...") + # 获取评标办法前附表的字典结果 + evaluation_standards_res = combine_evaluation_standards(evaluation_method_path) + # 获取技术标和商务标 + technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} + commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} + logger.info("商务评分和技术评分done") + + # 返回将 "技术标" 和 "商务标" 包含在新的键中 + return { + "technical_standards": technical_standards, + "commercial_standards": commercial_standards + } + +#TODO:doc文档转换 +def fetch_invalid_requirements(invalid_docpath, output_folder): + # 废标项要求:千问 + logger.info("starting无效标与废标...") + find_invalid_res = combine_find_invalid(invalid_docpath, output_folder) + logger.info("无效标与废标done...") + return find_invalid_res + +# 投标文件要求 +def fetch_bidding_documents_requirements(clause_path): + logger.info("starting投标文件要求...") + fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + logger.info("投标文件要求done...") + return {"投标文件要求":fetch_bidding_documents_requirements_json} + +# 开评定标流程 +def fetch_bid_opening(clause_path): + logger.info("starting开评定标流程...") + fetch_bid_opening_json = extract_from_notice(clause_path, 2) + logger.info("开评定标流程done...") + return {"开评定标流程":fetch_bid_opening_json} + +def goods_bid_main(output_folder,file_path, file_type, unique_id): + global logger + logger = get_global_logger(unique_id) + + # 预处理文件,获取处理后的数据 + processed_data = preprocess_files(output_folder, file_path, file_type, unique_id) + if not processed_data: + yield json.dumps({}) # 如果处理数据失败,返回空的 JSON + + with concurrent.futures.ThreadPoolExecutor() as executor: + # 立即启动不依赖 knowledge_name 和 index 的任务 + futures = { + 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['evaluation_method_path']), + 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],output_folder), + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) + } + + # 提前处理这些不依赖的任务,按完成顺序返回 + for future in concurrent.futures.as_completed(futures.values()): + key = next(k for k, v in futures.items() if v == future) + try: + result = future.result() + # 如果是 evaluation_standards,拆分技术标和商务标 + if key == 'evaluation_standards': + technical_standards = result["technical_standards"] + commercial_standards = result["commercial_standards"] + # 分别返回技术标和商务标 + yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False) + yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False) + else: + # 处理其他任务的结果 + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + + # 只有在需要 knowledge_name 和 index 时才等待 future_knowledge 完成 + try: + knowledge_name = "招标解析" + unique_id + index = processed_data['knowledge_future'].result() # 阻塞等待知识库上传任务完成 + + # 提交依赖 knowledge_name 和 index 的任务 + future_dependencies = { + 'base_info': executor.submit(fetch_project_basic_info, knowledge_name, processed_data['truncate0'], + output_folder, processed_data['clause_path']), + 'qualification_review': executor.submit(fetch_qualification_review,output_folder, processed_data['qualification_path'],processed_data['notice_path'],knowledge_name), + } + # 按完成顺序返回依赖任务的结果 + for future in concurrent.futures.as_completed(future_dependencies.values()): + key = next(k for k, v in future_dependencies.items() if v == future) + try: + result = future.result() + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + + except Exception as e: + logger.error(f"Error uploading to knowledge base: {e}") + yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False) + + # 删除知识索引 + deleteKnowledge(index) \ No newline at end of file diff --git a/flask_app/货物标/资格审查main.py b/flask_app/货物标/资格审查main.py index 04e3bd7..fe3bc83 100644 --- a/flask_app/货物标/资格审查main.py +++ b/flask_app/货物标/资格审查main.py @@ -1,23 +1,26 @@ # -*- encoding:utf-8 -*- +import copy import json import re - -from flask_app.main.基础信息整合 import combine_basic_info -from flask_app.main.通义千问long import qianwen_long,upload_file +from flask_app.main.通义千问long import qianwen_long, upload_file from flask_app.main.多线程提问 import multi_threading -from flask_app.main.json_utils import combine_json_results,clean_json_string -from flask_app.main.形式响应评审 import update_json_data,extract_matching_keys -from flask_app.main.json_utils import extract_content_from_json -#这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据 -#对于同级的键,如果数量>1且键名都统一,那么将键名去掉,用列表保持它们的键值 +from flask_app.main.json_utils import extract_content_from_json, clean_json_string +from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json +from flask_app.货物标.货物标截取pdf import truncate_pdf_main + + +# 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据 +# 对于同级的键,如果数量>1且键名都统一,那么将键名去掉,用列表保持它们的键值 def is_numeric_key(key): # 这个正则表达式匹配由数字、点、括号中的数字或单个字母(小写或大写)组成的字符串, # 字母后跟数字,或数字后跟字母,单个字母后跟点,但不能是字母-数字-字母的组合 pattern = r'^[\d.]+$|^\(\d+\)$|^(\d+)$|^[a-zA-Z]$|^[a-zA-Z]\d+$|^\d+[a-zA-Z]$|^[a-zA-Z]\.$' return re.match(pattern, key) is not None -#TODO:如果键值中存在数字就不行 -#zbtest20也有问题 + + +# TODO:如果键值中存在数字就不行 +# zbtest20也有问题 def contains_number_or_index(key, value): # 判断值是否是数字或数字字符串 is_number = isinstance(value, (int, float)) or (isinstance(value, str) and value.isdigit()) @@ -33,13 +36,14 @@ def contains_number_or_index(key, value): # 如果值是数字或包含数字,且不包含中文字符,或者键包含 "序号",返回 True return is_number or contains_index or contains_digit -#对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存 -#如果键名是"序号"或者键值中全是数字,删去序号 + +# 对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存 +# 如果键名是"序号"或者键值中全是数字,删去序号 def preprocess_dict(data): if isinstance(data, dict): if len(data) > 1: # 检查是否所有值都是 "" 或 "/" - if all(v == "" or v == "/" or (isinstance(v, list) and not v)for v in data.values()): + if all(v == "" or v == "/" or (isinstance(v, list) and not v) for v in data.values()): return list(data.keys()) else: processed = {} @@ -56,7 +60,20 @@ def preprocess_dict(data): else: return data + def process_dict(data): + """ + 递归处理字典,将符合条件的键值对进行转换。 + + 如果键是数字或特定格式的字符串,则将其值放入 'items' 列表中并排序。 + 对于非数字键,如果对应的值是列表且列表中只有一个元素,则将其展平为单个元素。 + + Args: + data (dict): 输入的字典数据。 + + Returns: + dict 或 list 或 原始数据类型: 处理后的数据结构。 + """ if not isinstance(data, dict): return data @@ -64,15 +81,20 @@ def process_dict(data): numeric_keys = [] non_numeric_keys = {} + # 分类键为数字键和非数字键 for key, value in data.items(): if is_numeric_key(key): numeric_keys.append((key, value)) else: non_numeric_keys[key] = value + # 处理数字键,将其值递归处理后放入 'items' 列表中 if numeric_keys: - result['items'] = [process_dict(item[1]) for item in sorted(numeric_keys)] + # 按键排序,确保顺序一致 + numeric_keys_sorted = sorted(numeric_keys, key=lambda x: x[0]) + result['items'] = [process_dict(item[1]) for item in numeric_keys_sorted] + # 处理非数字键 for key, value in non_numeric_keys.items(): if isinstance(value, list): processed_list = [] @@ -92,25 +114,35 @@ def process_dict(data): processed_list.append(processed_item) - result[key] = processed_list + # 新增逻辑:如果 processed_list 只有一个元素,则将其展平为单个元素 + if len(processed_list) == 1: + result[key] = processed_list[0] + else: + result[key] = processed_list else: + # 如果值不是列表,直接递归处理 result[key] = process_dict(value) + # 如果结果只有一个键 'items',则直接返回 'items' 列表 if len(result) == 1 and 'items' in result: return result['items'] - # 检查如果所有键对应的值都是空列表,则将键名转换成列表项 + + # 检查如果所有键对应的值都是空列表,则将键名转换成列表项 if all(isinstance(v, list) and not v for v in result.values()): return list(result.keys()) return result -#查找引用的序号 + +# 查找引用的序号 def find_chapter_clause_references(data, parent_key=""): - exclude_list=["格式要求"] + exclude_list = ["格式要求"] result = [] # 正则匹配"第x章"或"第x款" chapter_clause_pattern = re.compile(r'第[一二三四五六七八九十\d]+[章款]') - + # 如果数据不是字典,则直接返回空列表 + if not isinstance(data, dict): + return result # 遍历字典中的键值对 for key, value in data.items(): # 生成当前的完整键名 @@ -118,12 +150,19 @@ def find_chapter_clause_references(data, parent_key=""): # 检查是否应排除该键或值 if any(exclude_item in full_key for exclude_item in exclude_list) or \ - (isinstance(value, str) and any(exclude_item in value for exclude_item in exclude_list)): + (isinstance(value, str) and any(exclude_item in value for exclude_item in exclude_list)): continue # 如果在排除项中,跳过处理 if isinstance(value, dict): # 如果值是字典,递归调用函数 result.extend(find_chapter_clause_references(value, full_key)) + elif isinstance(value, list): + # 如果值是列表,遍历列表中的元素 + for index, item in enumerate(value): + if isinstance(item, dict): + # 生成新的键路径,包括列表索引 + new_parent_key = f"{full_key}[{index}]" + result.extend(find_chapter_clause_references(item, new_parent_key)) elif isinstance(value, str): # 如果值是字符串,检查是否匹配"第x章"或"第x款" if chapter_clause_pattern.search(value): @@ -131,6 +170,7 @@ def find_chapter_clause_references(data, parent_key=""): return result + def preprocess_value(value): # 使用正则表达式查找"第X章"或"第X款" chapter_match = re.search(r'第(.+?)章', value) @@ -185,7 +225,24 @@ def generate_questions(input_list): questions.append(question) return questions -def update_json_data(original_data,response_list): + +""" +eg: +response_list = [ + { + "person.name": "Bob", + "person.address.city": "Los Angeles" + }, + { + "company.location": "Austin", + "person.age": 35 + } +] +""" + + +# 用新数据更新原有字典 +def update_json_data(original_data, response_list): def recursive_update(data, key, value): # 处理点分隔的键,递归定位并更新嵌套字典 keys = key.split('.') @@ -195,42 +252,158 @@ def update_json_data(original_data,response_list): data[keys[-1]] = {**data.get(keys[-1], {}), **value} else: data[keys[-1]] = value + for response_dict in response_list: for key, value in response_dict.items(): recursive_update(original_data, key, value) return original_data -def qualification_review(truncate_file,knowledge_name): - file_id=upload_file(truncate_file) - user_query=["该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。","该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"] - results=multi_threading(user_query,"",file_id,2) + +def process_match_keys(match_keys, clause_path_file): + """ + 处理 match_keys,根据其中的数字或中文数字提取相应的条款内容,并附加到原始值后面。 + + 参数: + - match_keys (list): 包含键值对的列表。 + - clause_path_file (str): clause_path的JSON文件路径。 + + 返回: + - list: 更新后的match_keys列表。 + """ + # 定义数字到中文数字的映射,扩展到'十' + digit_map = {'1': '一', '2': '二','3': '三','4': '四','5': '五','6': '六','7': '七','8': '八','9': '九','10': '十'} + # 定义中文数字列表 + chinese_numerals = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十'] + + # 编译一个正则表达式,用于查找中文数字后面跟着的不是'章'或'部分'的字符 + # 这个模式会捕获中文数字和紧随其后的一个字符 + pattern = re.compile(r'([一二三四五六七八九十]+)(?!章|部分)(.)') + + # 读取clause_path的内容 + try: + with open(clause_path_file, 'r', encoding='utf-8') as file: + clause_path = json.load(file) + except FileNotFoundError: + print(f"文件未找到: {clause_path_file}") + return match_keys + except json.JSONDecodeError: + print(f"文件内容不是有效的JSON格式: {clause_path_file}") + return match_keys + + for item in match_keys: + for key, value in item.items(): + # 将match_keys中的数字1-10转换为对应的中文数字 + for digit, chinese in digit_map.items(): + value = re.sub(r'{}'.format(digit), chinese, value) + + # 查找值中所有匹配的中文数字 + matches = pattern.findall(value) + # 存储需要附加的条款内容,避免重复 + clauses_to_append = [] + for match in matches: + numeral = match[0] + # 检查提取的中文数字是否在定义的列表中 + if numeral in chinese_numerals: + # 在clause_path的键中查找包含该中文数字的键 + for clause_key in clause_path.keys(): + if numeral in clause_key: + clause_value = clause_path[clause_key] + if clause_value not in clauses_to_append: + clauses_to_append.append(clause_value) + if clauses_to_append: + # 将找到的条款内容用换行符连接 + appended_text = '\n'.join(clauses_to_append) + # 更新当前项的值,添加换行和附加内容 + item[key] = value + '\n' + appended_text + return match_keys + + +#处理如'符合本采购文件第一章第二款要求'的情况,跳转到指定地方摘取内容 +def process_additional_queries(combined_res, match_keys,output_folder, notice_path,knowledge_name): + """ + 处理额外的查询并更新结果。 + + Args: + combined_res: 初始的组合结果。 + match_keys: 匹配的章节或条款引用。 [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料。'}] + knowledge_name: 知识库的名称。 + + Returns: + dict: 更新后的最终结果。 + """ + clause2_path=convert_clause_to_json(notice_path,output_folder,2) + # 处理match_keys,匹配并附加条款内容 + original_keys = copy.deepcopy(match_keys) + updated_match_keys = process_match_keys(match_keys, clause2_path) + #正则匹配到指定内容后就直接返回 + if updated_match_keys != original_keys: + return update_json_data(combined_res, updated_match_keys) + # 生成进一步的问题 + ques = generate_questions(original_keys) + results = multi_threading(ques, knowledge_name) + + for _, response in results: + if response and len(response) > 1: + try: + temp = extract_content_from_json(response[1]) + updated_match_keys.append(temp) + except Exception as e: + print(f"形式响应评审:Error processing response: {e}") + else: + print(f"形式响应评审:Warning: Missing or incomplete response data.") + + return update_json_data(combined_res, updated_match_keys) + + +def combine_qualification_review(output_folder,qualification_path, notice_path,knowledge_name): + """ + 组合资格性审查和符合性审查的评审结果。 + + Args: + truncate_file: 要上传和处理的文件。 + knowledge_name: 知识库的名称,用于后续查询。 + + Returns: + dict: 最终组合的评审结果。 + """ + # 上传文件并获取文件ID + file_id = upload_file(qualification_path) + # 定义用户查询列表 + user_query = [ + "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。", + "该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。" + ] + # 执行多线程查询 + results = multi_threading(user_query, "", file_id, 2) + combined_res = {} for question, response in results: - cleaned_data = clean_json_string(response) - processed1 = preprocess_dict(cleaned_data) - processed2 = process_dict(processed1) - combined_res.update(processed2) - match_keys=find_chapter_clause_references(combined_res) - ques=generate_questions(match_keys) - results = multi_threading(ques, knowledge_name) # 无序号的直接问大模型 - first_response_list = [] - for _, response in results: - try: - if response and len(response) > 1: # 检查response存在且有至少两个元素 - temp = extract_content_from_json(response[1]) - first_response_list.append(temp) - else: - print(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.") - except Exception as e: - print(f"形式响应评审:Error processing response for query index {_}: {e}") - # print(first_response_list) - final_result=update_json_data(combined_res,first_response_list) - return final_result - # 整合基础信息核心代码 -#[{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}, {'资格性审查.没有重大违法记录的书面声明': '是否提交参加政府采购活动前三年内在经营活动中没有重大违法记录的书面承诺或声明(格式要求详见本项目采购文件第六章相关格式要求)'}] -#TODO:有个严重的问题,对于{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'},调用百炼rag的时候容易得到一模一样的回答,而不是跳转到具体的地方,有两个思路,1.结构化第一章内容 2.优化提示词 3.构造问题的时候不带value,直接问key + if response: + cleaned_data = clean_json_string(response) # 清理大模型回答 + processed1 = preprocess_dict(cleaned_data) + processed2 = process_dict(processed1) + combined_res.update(processed2) + else: + print(f"Warning: No response for question '{question}'.") + + # 查找章节或条款引用 + match_keys = find_chapter_clause_references(combined_res) #[{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料。'}] + + # 如果没有匹配的章节或条款,直接返回 combined_res + if not match_keys: + return combined_res + # 调用新的函数处理后续逻辑 + return process_additional_queries(combined_res, match_keys, output_folder,notice_path,knowledge_name) + + +# 整合基础信息核心代码 +# [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}, {'资格性审查.没有重大违法记录的书面声明': '是否提交参加政府采购活动前三年内在经营活动中没有重大违法记录的书面承诺或声明(格式要求详见本项目采购文件第六章相关格式要求)'}] +# TODO:有个严重的问题,对于{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'},调用百炼rag的时候容易得到一模一样的回答,而不是跳转到具体的地方,有两个思路,1.结构化第一章内容 2.优化提示词 3.构造问题的时候不带value,直接问key if __name__ == "__main__": - truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf" - knowledge_name="6.2视频会议docx" - res=qualification_review(truncate_file,knowledge_name) - print(json.dumps(res,ensure_ascii=False, indent=4)) \ No newline at end of file + # qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf" + output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub" + qualification_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output3\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件_qualification2.pdf" + notice_path="C:\\Users\\Administrator\\Desktop\\货物标\\output5\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件_notice.pdf" + knowledge_name = "6.2视频会议docx" + res = combine_qualification_review(output_folder,qualification_path, notice_path,knowledge_name) + print(json.dumps(res, ensure_ascii=False, indent=4))