From e63b71957241c02453f6541cf36d9f1c3b5c8235 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Mon, 21 Oct 2024 17:31:48 +0800 Subject: [PATCH] =?UTF-8?q?10.21=E5=B7=A5=E7=A8=8B=E6=A0=87=E5=BF=AB?= =?UTF-8?q?=E9=80=9F=E7=89=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/main/table_content_extraction.py | 4 +- flask_app/main/工程标解析main.py | 218 ++++++++++++++++ flask_app/main/形式响应评审.py | 235 +++++++++++++----- flask_app/main/形式响应评审old.py | 219 ++++++++++++++++ flask_app/main/截取pdf.py | 169 ++++++++----- .../main/投标人须知正文条款提取成json文件.py | 4 +- flask_app/main/招标文件解析.py | 2 +- flask_app/main/根据条款号整合json.py | 4 +- flask_app/main/读取文件/按页读取pdf.py | 4 +- flask_app/main/资格审查模块.py | 38 +-- flask_app/main/资格审查模块old.py | 42 ++++ flask_app/main/资格评审.py | 60 +++-- .../{test.py => 资格评审5种情况测试脚本.py} | 29 ++- flask_app/货物标/货物标截取pdf.py | 34 ++- flask_app/货物标/货物标解析main.py | 49 ++-- 15 files changed, 874 insertions(+), 237 deletions(-) create mode 100644 flask_app/main/工程标解析main.py create mode 100644 flask_app/main/形式响应评审old.py create mode 100644 flask_app/main/资格审查模块old.py rename flask_app/main/{test.py => 资格评审5种情况测试脚本.py} (74%) diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py index c719523..cfa9b05 100644 --- a/flask_app/main/table_content_extraction.py +++ b/flask_app/main/table_content_extraction.py @@ -130,8 +130,8 @@ def process_all_part1_pdfs(folder_path, output_folder): extract_tables_main(file_path, subfolder_path) if __name__ == "__main__": - path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx' - output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件 + path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_tobidders_notice_table.docx' + output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" # 前附表json文件 res=extract_tables_main(path, output_folder) # # folder_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4' diff --git a/flask_app/main/工程标解析main.py b/flask_app/main/工程标解析main.py new file mode 100644 index 0000000..3660e78 --- /dev/null +++ b/flask_app/main/工程标解析main.py @@ -0,0 +1,218 @@ +# -*- encoding:utf-8 -*- +import json +import logging +import time +from concurrent.futures import ThreadPoolExecutor +from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.main.table_content_extraction import extract_tables_main +from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.json_utils import transform_json_values, combine_json_results +from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid +from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice +import concurrent.futures +from flask_app.main.基础信息整合快速版 import combine_basic_info +from flask_app.main.资格审查模块 import combine_review_standards +from flask_app.main.商务标技术标整合 import combine_evaluation_standards +from flask_app.main.format_change import pdf2docx, docx2pdf +from flask_app.main.docx截取docx import copy_docx + +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + +logger=None + +# 创建全局线程池 +executor = ThreadPoolExecutor() +def preprocess_files(output_folder, downloaded_file_path, file_type): + logger.info("starting 文件预处理...") + logger.info("output_folder..." + output_folder) + + # 根据文件类型处理文件路径 + if file_type == 1: # docx + docx_path = downloaded_file_path + pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + elif file_type == 2: # pdf + pdf_path = downloaded_file_path + docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 + else: + logger.error("Unsupported file type provided. Preprocessing halted.") + return None + + # 调用截取PDF多次 + truncate_files = truncate_pdf_multiple(pdf_path, output_folder) + + # 处理各个部分 + truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx + + invalid_path=truncate_files[5] + invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 + # invalid_docpath=pdf2docx(invalid_path) + truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json + truncate0 = truncate_files[0] + truncate1 = truncate_files[1] + truncate2=truncate_files[2] + truncate3 = truncate_files[3] + merged_baseinfo_path=truncate_files[-1] + clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json + + logger.info("文件预处理done") + + # 返回包含预处理后文件路径的字典 + return { + 'file_path': downloaded_file_path, + 'output_folder': output_folder, + 'invalid_path':invalid_path, + 'truncate0': truncate0, + 'truncate1': truncate1, + 'truncate2':truncate2, + 'truncate3': truncate3, + 'truncate0_jsonpath': truncate_jsonpath, + 'merged_baseinfo_path':merged_baseinfo_path, + 'clause_path': clause_path, + 'invalid_docpath': invalid_docpath + } + + +def post_processing(data,includes): + # 初始化结果字典,预设'其他'分类为空字典 + result = {"其他": {}} + + # 遍历原始字典的每一个键值对 + for key, value in data.items(): + if key in includes: + # 如果键在includes列表中,直接保留这个键值对 + result[key] = value + else: + # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 + result["其他"][key] = value + + # 如果'其他'分类没有任何内容,可以选择删除这个键 + if not result["其他"]: + del result["其他"] + + return result +# 基本信息 +def fetch_project_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path): # 投标人须知前附表 + logger.info("starting基础信息...") + basic_res = combine_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path) + logger.info("基础信息done") + return basic_res + + +# 形式、响应、资格评审 +def fetch_qualification_review(truncate1, truncate3,output_folder, truncate0_jsonpath, clause_path,invalid_path,merged_baseinfo_path): + logger.info("starting资格审查...") + review_standards_res = combine_review_standards(truncate1, truncate3,output_folder, truncate0_jsonpath, + clause_path,invalid_path,merged_baseinfo_path) + logger.info("资格审查done") + return review_standards_res + + +# 评分细则 流式 +def fetch_evaluation_standards(truncate1): # 评标办法前附表 + logger.info("starting 商务标和技术标...") + + # 获取评标办法前附表的字典结果 + evaluation_standards_res = combine_evaluation_standards(truncate1) + + # 获取技术标和商务标 + technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} + commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} + + logger.info("商务标和技术标 done") + + # 返回将 "技术标" 和 "商务标" 包含在新的键中 + return { + "technical_standards": technical_standards, + "commercial_standards": commercial_standards + } + + +# 无效、废标项解析 +def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): + # 废标项要求:千问 + logger.info("starting无效标与废标...") + find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) + logger.info("无效标与废标done...") + return find_invalid_res + + +# 投标文件要求 +def fetch_bidding_documents_requirements(clause_path): + logger.info("starting投标文件要求...") + fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + logger.info("投标文件要求done...") + return {"投标文件要求":fetch_bidding_documents_requirements_json} + +# 开评定标流程 +def fetch_bid_opening(clause_path): + logger.info("starting开评定标流程...") + fetch_bid_opening_json = extract_from_notice(clause_path, 2) + logger.info("开评定标流程done...") + return {"开评定标流程":fetch_bid_opening_json} + +#分段返回 +def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): + global logger + logger = get_global_logger(unique_id) + # 预处理文件,获取处理后的数据 + processed_data = preprocess_files(output_folder, downloaded_file_path, file_type) + if not processed_data: + yield json.dumps({}) # 如果处理数据失败,返回空的 JSON + + with concurrent.futures.ThreadPoolExecutor() as executor: + # 立即启动不依赖 knowledge_name 和 index 的任务 + futures = { + 'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'], + processed_data['truncate0'], + processed_data['truncate2'], processed_data['clause_path']), + 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'], + processed_data['truncate3'], output_folder, + processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['invalid_path'], + processed_data['merged_baseinfo_path']), + 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']), + 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], + output_folder, processed_data['truncate0_jsonpath'], + processed_data['clause_path'], processed_data['truncate3']), + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) + } + + # 提前处理这些不依赖的任务,按完成顺序返回 + for future in concurrent.futures.as_completed(futures.values()): + key = next(k for k, v in futures.items() if v == future) + try: + result = future.result() + + # 如果是 evaluation_standards,拆分技术标和商务标 + if key == 'evaluation_standards': + technical_standards = result["technical_standards"] + commercial_standards = result["commercial_standards"] + + # 分别返回技术标和商务标 + yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False) + yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False) + + else: + # 处理其他任务的结果 + yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) + + except Exception as exc: + logger.error(f"Error processing {key}: {exc}") + yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) + +if __name__ == "__main__": + start_time = time.time() + output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1" + file_type = 2 #1:docx 2:pdf 3:其他 + input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf" + print("yes") + for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"): + print(output) + end_time = time.time() + elapsed_time = end_time - start_time # 计算耗时 + print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py index d0e286b..847f715 100644 --- a/flask_app/main/形式响应评审.py +++ b/flask_app/main/形式响应评审.py @@ -1,13 +1,16 @@ # -*- encoding:utf-8 -*- +import os import re import json import time from flask_app.main.多线程提问 import multi_threading from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 -from flask_app.main.json_utils import extract_content_from_json +from flask_app.main.json_utils import clean_json_string from flask_app.main.截取pdf import truncate_pdf_main from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.多线程提问 import upload_file +from flask_app.main.截取pdf import merge_pdfs prompt = """ # 角色 你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 @@ -63,27 +66,60 @@ def update_json_data(original_data, updates1, updates2,second_response_list): return original_data + def judge_second_jump(input_dict): formatted_questions = [] keys_to_remove = [] + chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章') + + question_template = ( + "请你根据招标文件中的内容回答:关于{short_key}{additional_text},第一章招标公告中的内容是怎样的?" + "请按json格式给我提供信息,键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减," + "如果存在未知信息,请在对应键值处填'未知'。" + ) for key, value in input_dict.items(): - if '招标公告' in value or '第一章' in value: - formatted_entry = f"关于'{key}',第一章招标公告中的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。" + if chapter_pattern.search(value): + short_key = key.split('.')[-1] if '.' in key else key + additional_text = get_additional_text(key) + formatted_entry = question_template.format( + short_key=short_key, + full_key=key, + additional_text=additional_text + ) formatted_questions.append(formatted_entry) keys_to_remove.append(key) # 标记需要删除的键 + # 从原始字典中删除处理过的键值对 for key in keys_to_remove: del input_dict[key] return input_dict, formatted_questions -def extract_matching_keys(json_data): +def extract_matching_keys(original_data): + """ + 提取带序号的内容,对于已经完整的内容不提取,但会在后面用新数据更新orginal_data + :param json_data: + { + "投标要求": { + "资质要求": "符合第3章第2条规定", + "业绩要求": "满足招标文件第4章相关条目" + }, + "其他信息": { + "投标文件格式": "按要求填写" + } +} + :return: + [ + {"投标要求.资质要求": "符合第3章第2条规定"}, + {"投标要求.业绩要求": "满足招标文件第4章相关条目"} + ] + """ #'投标人名称': '符合招标公告第3.2条规定' # 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。 - if isinstance(json_data, str): - data = json.loads(json_data) + if isinstance(original_data, str): + data = json.loads(original_data) else: - data = json_data + data = original_data # 正则表达式匹配 include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"第.*?条")] @@ -125,91 +161,160 @@ def extract_matching_keys(json_data): return final_matching -def reformat_questions(match_keys, input_path, output_folder): + #同类表达扩展 +def get_additional_text(key): + additional_text_map = { + "投标内容": "(或招标范围)", + "工程质量": "(或质量要求)", + # 可以在这里添加更多的映射 + } + for keyword, text in additional_text_map.items(): + if keyword in key: + return text + return "" + +def process_tender_entries(match_keys,output_folder): + """ + :param match_keys: + match_keys = [ + {"资质要求": "符合相关资质要求"}, + {"业绩要求": "3.1.2 近三年完成类似项目"}, + {"财务要求": "招标公告 2.3 财务状况良好"}, + {"人员要求": "项目经理具有相关资格"} +] + :param notice_path: + notice_path="xxxx" + :return: + [{'业绩要求': '3.1.2'}] + ["关于'资质要求',相关资质要求的内容是怎样的?请按json格式给我提供信息,键名为'资质要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。", "关于'人员要求',项目经理具有相关资格的内容是怎样的?请按json格式给我提供信息,键名为'人员要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"] + [{'财务要求': '2.3'}] + "xxxxxx.json" + """ entries_with_numbers = [] - entries_with_numbers2 = [] # 用于保存特别处理的条目 + entries_with_numbers2 = [] formatted_questions = [] clause2_path = "" # 正则表达式,同时匹配全角和半角括号 - pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\(](\d+)[\)\)])?') # 识别包含数字序列的特定格式 + pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[$\(](\d+)[$\)])?') + + # 新的正则表达式,用于匹配 '招标公告' 和 '第一章',允许内部有空格 + chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章') - # 初始化文件处理 - generated_files = None - file_processed = False question_template = ( - "关于'{key}',{revised_value}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。" + "请你根据招标文件中的内容回答:关于{short_key}{additional_text},{revised_value}的内容是怎样的?请按json格式给我提供信息,键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。" ) + + def format_num(match): + return match.group(1) + (f"({match.group(2)})" if match.group(2) else "") + def add_formatted_question(key, value): + revised_value = value.replace('符合', '') + short_key = key.split('.')[-1] if '.' in key else key + additional_text = get_additional_text(key) + formatted_questions.append(question_template.format( + short_key=short_key, + full_key=key, + revised_value=revised_value, + additional_text=additional_text + )) + for entry in match_keys: key, value = next(iter(entry.items())) match = pattern.search(value) - if '招标公告' in value or '第一章' in value: - if not file_processed: - # 只处理一次文件操作 - generated_files = truncate_pdf_main(input_path, output_folder, 5) - file_processed = True # 设置标志位,避免重复处理文件 - if generated_files: - clause2_path = convert_clause_to_json(generated_files[0], output_folder, 2) - - if match and generated_files: - # 提取序号并添加到entries_with_numbers2 - num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "") - entries_with_numbers2.append({key: num}) + if chapter_pattern.search(value): + if match: + notice_path = next((os.path.join(root, file) for root, _, files in os.walk(output_folder) + for file in files if file.endswith('_notice.pdf')), "") + if notice_path: + clause2_path = convert_clause_to_json(notice_path, output_folder, 2) + entries_with_numbers2.append({key: format_num(match)}) else: - revised_value = value.replace('符合', '') - formatted_entry = question_template.format(key=key, revised_value=revised_value) - formatted_questions.append(formatted_entry) + add_formatted_question(key, value) elif match: - num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "") - entries_with_numbers.append({key: num}) + entries_with_numbers.append({key: format_num(match)}) else: - revised_value = value.replace('符合', '') - formatted_entry = question_template.format(key=key, revised_value=revised_value) - formatted_questions.append(formatted_entry) + add_formatted_question(key, value) return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path -def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path,input_file,output_folder): - combined_results2={} - matched_keys = extract_matching_keys(original_dict_data) #output:[{'形式评审标准.投标文件签字盖章': '符合第一章“投标人须知”第 3.7.3(4)目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典 - entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder) - combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, - clause_json_path) # 调用根据条款号整合json.py - combined_results1, formatted_questions2 = judge_second_jump(combined_results) #判断是否需要二次跳转 - if entries_with_numbers2: #跳转第一章招标公告 - combined_results2=process_and_merge2(entries_with_numbers2,clause2_path) +def fetch_specific_pdf(output_folder): + pdf_paths = [] + # 遍历指定目录 + for root, dirs, files in os.walk(output_folder): + for file in files: + # 检查文件是否以指定的名称结尾 + if file.endswith('merged_baseinfo.pdf') or file.endswith('tobidders_notice.pdf'): + # 如果是,将完整路径添加到列表中 + full_path = os.path.join(root, file) + pdf_paths.append(full_path) + if pdf_paths: # 只有在找到匹配的PDF文件时才进行合并 + output_path = os.path.join(output_folder, "merged_reviews.pdf") + merge_pdfs(pdf_paths, output_path) + return output_path + else: + return "" # 如果没有找到匹配的PDF文件,返回None - results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name) #无序号的直接问大模型 +def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, clause_json_path): + combined_results2 = {} first_response_list = [] + matched_keys = extract_matching_keys(original_dict_data) - for _, response in results_2: - try: - if response and len(response) > 1: # 检查response存在且有至少两个元素 - temp = extract_content_from_json(response[1]) - first_response_list.append(temp) - else: - print(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.") - except Exception as e: - print(f"形式响应评审:Error processing response for query index {_}: {e}") + entries_with_numbers, formatted_questions1, entries_with_numbers2, clause2_path = process_tender_entries(matched_keys, output_folder) - # Assume JSON file paths are defined or configured correctly - # print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}] - updated_json = update_json_data(original_dict_data, combined_results1, combined_results2,first_response_list) + combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path) + + combined_results1, formatted_questions2 = judge_second_jump(combined_results) + + if entries_with_numbers2: + combined_results2 = process_and_merge2(entries_with_numbers2, clause2_path) + + # 只有当有问题需要处理时才执行PDF相关操作 + formatted_questions = formatted_questions1 + formatted_questions2 + if formatted_questions: + output_path = fetch_specific_pdf(output_folder) + if output_path: + file_id = upload_file(output_path) + results = multi_threading(formatted_questions, "", file_id, 2) + first_response_list = [clean_json_string(res) for _, res in results] if results else [] + + updated_json = update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list) return updated_json - if __name__ == "__main__": start_time=time.time() - knowledge_name="zbfile" - truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json" - clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json" - original_dict_data={'形式评审标准': {'投标文件': '符合第一章规定', '投标人名称': '符合招标公告第3.2条规定', '投标文件签字': '符合第二章“投标人须知”第3.7.3(4)目规定', '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨', '联合体投标人': '提交联合体协议书,并明确联合体牵头人', '报价唯一': '只能有一个有效报价', '多标段投标': '符合第二章“投标人须知”第 10.1款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第1.3.1项规定', '工期': '符合第二章“投标人须知”第1.3.2项规定', '工程质量': '符合第二章“投标人须知”第1.3.3项规定'}} - - input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.pdf" - output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7" - formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path,input_file,output_folder) + # knowledge_name="zbfile" + truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\truncate_output.json" + clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\clause1.json" + original_dict_data={ + "形式评审标准": { + "投标文件": "投标文件能正常打开", + "投标人名称": "与营业执照、资质证书、安全生产许可证一致", + "投标文件签字盖章": "符合第二章“投标人须知”规定", + "投标文件格式、内容": "符合第八章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨", + "联合体投标人(如有)": "提交联合体协议书,并明确联合体牵头人", + "报价唯一": "只能有一个有效报价(指投标函中的大写报价)", + "多标段投标": "符合第二章“投标人须知”规定", + "“技术暗标”": "符合第二章“投标人须知”规定" + }, + "响应性评审标准": { + "投标内容": "符合第二章“投标人须知”规定", + "工期": "符合第二章“投标人须知”规定", + "工程质量": "符合第二章“投标人须知”规定", + "投标有效期": "符合第二章“投标人须知”规定", + "投标保证金": "符合第二章“投标人须知”规定", + "权利义务": "投标函附录中的相关承诺符合或优于第四章“合同条款及格式”的相关规定", + "技术标准和要求": "符合第七章“技术标准和要求”规定", + "分包计划": "符合第二章“投标人须知”规定", + "算术错误修正": "1)投标人接受算术错误修正后的报价 2)修正后的报价与投标报价相比偏差率不超过±1%", + "投标价格": "1)投标函中的大写报价与已标价工程量清单中的投标总价一致 2)投标函中的大写报价不大于本标段招标控制价总价 3)算术错误修正后的投标总报价不大于本标段招标控制价总价 4)投标报价不低于其成本", + "已标价工程量清单": "1)已标价工程量清单项目编码顺序与第五章“工程量清单”给出的项目编码顺序一致; 2)已标价工程量清单符合第五章“工程量清单”给出的项目编码、项目名称、项目特征、计量单位和工程量; 3)暂列金额符合第五章“工程量清单”列出的金额; 4)专业工程暂估价符合第五章“工程量清单”列出的金额; 5)材料(工程设备)暂估价符合第五章“工程量清单”列出的单价并计入综合单价; 6)安全文明施工费、规费和税金等不可竞争费用,按照规定的标准计取; 7)计税方法符合招标文件的约定; 8)已标价工程量清单项目未填报的项目不超过三项,或不超过三项未填报的项目的费用合计(按招标控制价相应项目的费用合计计算)不超过其投标总报价(修正后的投标总报价,如有)的±1%" + } + } + output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" + # notice_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_notice.pdf" + formal_json = process_reviews(original_dict_data,output_folder, truncate_tobidders_table_json_path, clause_path) data = json.dumps(formal_json, ensure_ascii=False, indent=4) end_time=time.time() elapsed_time = end_time - start_time diff --git a/flask_app/main/形式响应评审old.py b/flask_app/main/形式响应评审old.py new file mode 100644 index 0000000..d0e286b --- /dev/null +++ b/flask_app/main/形式响应评审old.py @@ -0,0 +1,219 @@ +# -*- encoding:utf-8 -*- +import re +import json +import time + +from flask_app.main.多线程提问 import multi_threading +from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 +from flask_app.main.json_utils import extract_content_from_json +from flask_app.main.截取pdf import truncate_pdf_main +from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +prompt = """ +# 角色 +你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 + +## 技能 +### 技能 1:文档解析与摘要 +- 深入理解并分析${document1}的内容,提取关键信息。 +- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 + +### 技能 2:信息检索与关联 +- 在${document1}中高效检索特定信息或关键词。 +- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 + +## 限制 +- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。 +- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 +- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 + +请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。 +""" + + +def update_json_data(original_data, updates1, updates2,second_response_list): + """ + 根据提供的更新字典覆盖原始JSON数据中对应的键值,支持点分隔的键来表示嵌套结构。 + 参数: + - original_data: dict, 原始的JSON数据。 + - updates: dict, 包含需要更新的键值对。 + - second_response_list: list, 包含多个字典,每个字典包含需要更新的键值对。 + 返回: + - updated_data: dict, 更新后的JSON数据。 + """ + def recursive_update(data, key, value): + # 处理点分隔的键,递归定位并更新嵌套字典 + keys = key.split('.') + for k in keys[:-1]: + data = data.setdefault(k, {}) + if isinstance(value, dict) and isinstance(data.get(keys[-1], None), dict): + data[keys[-1]] = {**data.get(keys[-1], {}), **value} + else: + data[keys[-1]] = value + + # 合并 updates 到 original_data 中 + for key, value in updates1.items(): + recursive_update(original_data, key, value) + for key,value in updates2.items(): + recursive_update(original_data, key, value) + + # 遍历 second_response_list 中的每个字典,并合并到 original_data 中 + for response_dict in second_response_list: + for key, value in response_dict.items(): + recursive_update(original_data, key, value) + + return original_data + +def judge_second_jump(input_dict): + formatted_questions = [] + keys_to_remove = [] + for key, value in input_dict.items(): + if '招标公告' in value or '第一章' in value: + formatted_entry = f"关于'{key}',第一章招标公告中的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。" + formatted_questions.append(formatted_entry) + keys_to_remove.append(key) # 标记需要删除的键 + # 从原始字典中删除处理过的键值对 + for key in keys_to_remove: + del input_dict[key] + + return input_dict, formatted_questions + +def extract_matching_keys(json_data): + #'投标人名称': '符合招标公告第3.2条规定' + # 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。 + if isinstance(json_data, str): + data = json.loads(json_data) + else: + data = json_data + + # 正则表达式匹配 + include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"第.*?条")] + additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")] + exclude_patterns = ["投标文件格式", "权利义务", "技术标准","工程量清单"] + additional_exclude_patterns = ["按要求", "按规定"] + + # Initialize a list to hold filtered key-value pairs + final_matching = [] + + # Recursive function to traverse and filter data + def recursive_search(current_data, path=[]): + if isinstance(current_data, dict): + for key, value in current_data.items(): + new_path = path + [key] # Update path for nested keys + if isinstance(value, (dict, list)): + recursive_search(value, new_path) + else: + process_value(key, str(value), new_path) + elif isinstance(current_data, list): + for item in current_data: + recursive_search(item, path) + + # Function to process each value against the patterns + def process_value(key, value, path): + # Check exclude patterns first + if any(ex in key or ex in value for ex in exclude_patterns): + return + # Main include patterns + if any(pattern.search(value) for pattern in include_patterns): + # Additional exclude patterns + if not any(ex in key or ex in value for ex in additional_exclude_patterns): + # Additional include patterns + if any(pattern.search(value) for pattern in additional_include_patterns): + final_matching.append({".".join(path): value}) # Use dot notation for nested keys + + # Start the recursive search + recursive_search(data) + + return final_matching + +def reformat_questions(match_keys, input_path, output_folder): + entries_with_numbers = [] + entries_with_numbers2 = [] # 用于保存特别处理的条目 + formatted_questions = [] + clause2_path = "" + + # 正则表达式,同时匹配全角和半角括号 + pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\(](\d+)[\)\)])?') # 识别包含数字序列的特定格式 + + # 初始化文件处理 + generated_files = None + file_processed = False + question_template = ( + "关于'{key}',{revised_value}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。" + ) + for entry in match_keys: + key, value = next(iter(entry.items())) + match = pattern.search(value) + + if '招标公告' in value or '第一章' in value: + if not file_processed: + # 只处理一次文件操作 + generated_files = truncate_pdf_main(input_path, output_folder, 5) + file_processed = True # 设置标志位,避免重复处理文件 + if generated_files: + clause2_path = convert_clause_to_json(generated_files[0], output_folder, 2) + + if match and generated_files: + # 提取序号并添加到entries_with_numbers2 + num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "") + entries_with_numbers2.append({key: num}) + else: + revised_value = value.replace('符合', '') + formatted_entry = question_template.format(key=key, revised_value=revised_value) + formatted_questions.append(formatted_entry) + elif match: + num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "") + entries_with_numbers.append({key: num}) + else: + revised_value = value.replace('符合', '') + formatted_entry = question_template.format(key=key, revised_value=revised_value) + formatted_questions.append(formatted_entry) + + return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path + + +def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path,input_file,output_folder): + combined_results2={} + matched_keys = extract_matching_keys(original_dict_data) #output:[{'形式评审标准.投标文件签字盖章': '符合第一章“投标人须知”第 3.7.3(4)目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典 + entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder) + combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, + clause_json_path) # 调用根据条款号整合json.py + combined_results1, formatted_questions2 = judge_second_jump(combined_results) #判断是否需要二次跳转 + if entries_with_numbers2: #跳转第一章招标公告 + combined_results2=process_and_merge2(entries_with_numbers2,clause2_path) + + results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name) #无序号的直接问大模型 + first_response_list = [] + + for _, response in results_2: + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素 + temp = extract_content_from_json(response[1]) + first_response_list.append(temp) + else: + print(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"形式响应评审:Error processing response for query index {_}: {e}") + + # Assume JSON file paths are defined or configured correctly + # print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}] + updated_json = update_json_data(original_dict_data, combined_results1, combined_results2,first_response_list) + return updated_json + + +if __name__ == "__main__": + start_time=time.time() + knowledge_name="zbfile" + truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json" + clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json" + original_dict_data={'形式评审标准': {'投标文件': '符合第一章规定', '投标人名称': '符合招标公告第3.2条规定', '投标文件签字': '符合第二章“投标人须知”第3.7.3(4)目规定', '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨', '联合体投标人': '提交联合体协议书,并明确联合体牵头人', '报价唯一': '只能有一个有效报价', '多标段投标': '符合第二章“投标人须知”第 10.1款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第1.3.1项规定', '工期': '符合第二章“投标人须知”第1.3.2项规定', '工程质量': '符合第二章“投标人须知”第1.3.3项规定'}} + + input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.pdf" + output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7" + formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path,input_file,output_folder) + data = json.dumps(formal_json, ensure_ascii=False, indent=4) + end_time=time.time() + elapsed_time = end_time - start_time + print(f"Function execution took {elapsed_time} seconds.") + print(data) + +#关于'技术暗标',第二章“投标人须知”规定的内容是怎样的?请按json格式给我提供信息,键名为'技术暗标',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。 \ No newline at end of file diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index 38f61bc..ac8d414 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -14,7 +14,7 @@ def clean_page_content(text, common_header): text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 - + text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 return text #PYPDF2库 @@ -170,7 +170,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix): break # 找到结束页后退出循环 if start_page is None or end_page is None: - print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!") + print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header) @@ -219,7 +219,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter if output_suffix == "qualification" or output_suffix =="invalid": return extract_pages_twice(pdf_path, output_folder, output_suffix) else: - print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!") + print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header) @@ -250,58 +250,64 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt print("提供的路径既不是文件夹也不是PDF文件。") return [] - def truncate_pdf_main(input_path, output_folder, selection): - if selection == 1: - # Configure patterns and phrases for "投标人须知前附表" - begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知') - begin_page = 3 - end_pattern = re.compile(r'投标人须知正文') - output_suffix = "tobidders_notice_table" - elif selection == 2: - # Configure patterns and phrases for "评标办法" - begin_pattern = re.compile( - r'第[一二三四五六七八九十]+章\s*评标办法') # 考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法' - begin_page = 10 - end_pattern = re.compile(r'评标办法正文|评标办法') - output_suffix = "evaluation_method" - elif selection == 3: - # Configure patterns and phrases for "投标人须知正文" - begin_pattern = re.compile(r'投标人须知正文') - begin_page = 5 - end_pattern = re.compile( - r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', - re.MULTILINE) - output_suffix = "tobidders_notice" - elif selection == 4: - # 配置用于 "资格审查条件" 的正则表达式模式和短语 - common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])' - begin_pattern = re.compile(common_pattern + r'.*(?:资质|能力|信誉).*$', re.MULTILINE) - begin_page = 5 - end_pattern = re.compile( - common_pattern + r'(?!.*(?:资质|能力|信誉)).*$|' # 原有的模式 - r'^(第[一二三四五六七八九十]+章\s*评标办法|评标办法前附表)', # 新增的匹配项 - re.MULTILINE - ) - output_suffix = "qualification" - elif selection == 5: - # 配置用于 "招标公告" 的正则表达式模式和短语 - begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书') - begin_page = 0 - end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE) - output_suffix = "notice" - elif selection == 6: - # 配置用于 "无效标" 的正则表达式模式和短语 - begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:') - begin_page = 0 - end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE) - output_suffix = "invalid" - else: - print("无效的选择:请选择1-6") - return None + try: + if selection == 1: + # Configure patterns and phrases for "投标人须知前附表" + begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知') + begin_page = 3 + end_pattern = re.compile(r'投标人须知正文') + output_suffix = "tobidders_notice_table" + elif selection == 2: + # Configure patterns and phrases for "评标办法" + begin_pattern = re.compile( + r'第[一二三四五六七八九十]+章\s*评标办法') # 考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法' + begin_page = 10 + end_pattern = re.compile(r'评标办法正文|评标办法') + output_suffix = "evaluation_method" + elif selection == 3: + # Configure patterns and phrases for "投标人须知正文" + begin_pattern = re.compile(r'投标人须知正文') + begin_page = 5 + end_pattern = re.compile( + r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', + re.MULTILINE) + output_suffix = "tobidders_notice" + elif selection == 4: + # 配置用于 "资格审查条件" 的正则表达式模式和短语 + common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])' + begin_pattern = re.compile(common_pattern + r'.*(?:资质|能力|信誉).*$', re.MULTILINE) + begin_page = 5 + end_pattern = re.compile( + common_pattern + r'(?!.*(?:资质|能力|信誉)).*$|' # 原有的模式 + r'^(第[一二三四五六七八九十]+章\s*评标办法|评标办法前附表)', # 新增的匹配项 + re.MULTILINE + ) + output_suffix = "qualification" + elif selection == 5: + # 配置用于 "招标公告" 的正则表达式模式和短语 + begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书') + begin_page = 0 + end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE) + output_suffix = "notice" + elif selection == 6: + # 配置用于 "无效标" 的正则表达式模式和短语 + begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:') + begin_page = 0 + end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE) + output_suffix = "invalid" + else: + print("无效的选择:请选择1-6") + return [""] + # 调用 process_input 进行截取 + output_paths = process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) + if not output_paths: + return [""] # 截取失败时返回空字符串 + return output_paths + except Exception as e: + print(f"Error in truncate_pdf_main for selection {selection}: {e}") + return [""] # 捕获异常时返回空字符串 - # Process the selected input - return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) def truncate_pdf_multiple_old(input_path, output_folder): @@ -311,21 +317,33 @@ def truncate_pdf_multiple_old(input_path, output_folder): truncate_files.extend(files) return truncate_files + def truncate_pdf_multiple(input_path, output_folder): - base_file_name = os.path.splitext(os.path.basename(input_path))[0] #纯文件名 + base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名 truncate_files = [] for selection in range(1, 7): files = truncate_pdf_main(input_path, output_folder, selection) - truncate_files.extend(files) - if truncate_files: + if files: + truncate_files.extend(files) + else: + truncate_files.append("") # 截取失败时添加空字符串 + + if any(f for f in truncate_files if f): # 检查是否有有效的文件路径 merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") - merge_selected_pdfs(output_folder, truncate_files, merged_output_path,base_file_name) - truncate_files.append(merged_output_path) - print(f"已生成合并文件: {merged_output_path}") + merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name) + if merged_result: + truncate_files.append(merged_result) + print(f"已生成合并文件: {merged_output_path}") + else: + truncate_files.append("") # 如果merged_result未生成,添加空字符串 + print("未生成合并文件,因为没有找到需要合并的 PDF 文件。") else: + truncate_files.append("") # 如果没有文件需要合并,也添加空字符串 print(f"没有文件需要合并 for {input_path}") + return truncate_files + def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name): """ 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, @@ -336,16 +354,19 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na - truncate_files (list): 包含 PDF 文件路径的列表。 - output_path (str): 合并后的 PDF 文件保存路径。 - base_file_name (str): 用于匹配文件名的基础名称。 + + 返回: + - str: 合并后的 PDF 文件路径,如果没有合并则返回 ""。 """ # 1. 获取 output_folder 中所有文件 try: all_output_files = os.listdir(output_folder) except FileNotFoundError: print(f"输出文件夹 '{output_folder}' 未找到。") - return + return "" except PermissionError: print(f"没有权限访问输出文件夹 '{output_folder}'。") - return + return "" # 2. 定义要选择的文件后缀及合并顺序,包括 before 文件 desired_suffixes = [ @@ -381,11 +402,24 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na if not all_pdfs_to_merge: print("没有找到要合并的 PDF 文件。") - return + return "" + + # 过滤掉不存在或为空的文件路径 + all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)] + + if not all_pdfs_to_merge: + print("没有有效的 PDF 文件需要合并。") + return "" # 调用 merge_pdfs 函数进行合并 - merge_pdfs(all_pdfs_to_merge, output_path) - print(f"已成功合并 PDF 文件到 '{output_path}'。") + try: + merge_pdfs(all_pdfs_to_merge, output_path) + print(f"已成功合并 PDF 文件到 '{output_path}'。") + return output_path + except Exception as e: + print(f"合并 PDF 文件时出错: {e}") + return "" + def truncate_pdf_specific_engineering(pdf_path, output_folder,selections): """ @@ -427,14 +461,13 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder,selections): # TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。 if __name__ == "__main__": - # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf" - # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74" input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf" + # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" files=truncate_pdf_multiple(input_path,output_folder) # selections = [5, 1] # 仅处理 selection 5、1 和 3 # files=truncate_pdf_specific_engineering(input_path,output_folder,selections) print(files) - # selection = 3 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标 + # selection = 7 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标 # generated_files = truncate_pdf_main(input_path, output_folder, selection) - # # print("生成的文件:", generated_files) + # print("生成的文件:", generated_files) diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py index 0a87e57..560f002 100644 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -230,14 +230,14 @@ def convert_clause_to_json(input_path,output_folder,type=1): # json.dump(processed_data, file, ensure_ascii=False, indent=4) if __name__ == "__main__": - file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice.pdf' + file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_tobidders_notice.pdf' # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part1.pdf' # start_word = "投标人须知正文" # end_phrases = [ # r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', # r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:', # ] - output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\special_output' + output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test' try: output_path = convert_clause_to_json(file_path,output_folder) print(f"Final JSON result saved to: {output_path}") diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py index f7ec354..d5aee66 100644 --- a/flask_app/main/招标文件解析.py +++ b/flask_app/main/招标文件解析.py @@ -12,7 +12,7 @@ from flask_app.main.无效标和废标和禁止投标整合 import combine_find_ from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice import concurrent.futures from flask_app.main.基础信息整合 import combine_basic_info -from flask_app.main.资格审查模块 import combine_review_standards +from flask_app.main.资格审查模块old import combine_review_standards from flask_app.main.商务标技术标整合 import combine_evaluation_standards from flask_app.main.format_change import pdf2docx, docx2pdf from flask_app.main.docx截取docx import copy_docx diff --git a/flask_app/main/根据条款号整合json.py b/flask_app/main/根据条款号整合json.py index e1d8224..d27029e 100644 --- a/flask_app/main/根据条款号整合json.py +++ b/flask_app/main/根据条款号整合json.py @@ -148,8 +148,8 @@ if __name__ == "__main__": # entries_with_numbers=[{'xxx': '3.7.3(3)'},{'xxssx': '10.1'},{'xsssxx': '3.7.4(5)'},{'a': '1.3.1'},{'b': '1.3.2'},{'c': '1.3.3'},{'d': '3.3.1'}] entries_with_numbers = [ {'xxssx': '1.3.1'} ] - primary_json_path = 'D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\truncate_output.json' - secondary_json_path = 'D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\clause1.json' + primary_json_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\ee2d8828-bae0-465a-9171-7b2dd7453251' + secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\ee2d8828-bae0-465a-9171-7b2dd7453251' # Since this is just a test block, make sure these paths point to actual JSON files with the appropriate structure try: diff --git a/flask_app/main/读取文件/按页读取pdf.py b/flask_app/main/读取文件/按页读取pdf.py index e49c0d6..05818cf 100644 --- a/flask_app/main/读取文件/按页读取pdf.py +++ b/flask_app/main/读取文件/按页读取pdf.py @@ -16,7 +16,7 @@ def clean_page_content(text, common_header): text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 - + text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 return text def extract_common_header(pdf_path): from PyPDF2 import PdfReader @@ -153,7 +153,7 @@ if __name__ == '__main__': # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' - file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf' + file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf" # ress = extract_common_header(file_path) # print(ress) res=extract_text_by_page(file_path) diff --git a/flask_app/main/资格审查模块.py b/flask_app/main/资格审查模块.py index e8dc177..a58f117 100644 --- a/flask_app/main/资格审查模块.py +++ b/flask_app/main/资格审查模块.py @@ -1,42 +1,44 @@ +import json import os +import time from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json from flask_app.main.形式响应评审 import process_reviews -from flask_app.main.资格评审old import process_qualification +from flask_app.main.资格评审 import process_qualification from flask_app.main.通义千问long import upload_file, qianwen_long from concurrent.futures import ThreadPoolExecutor -def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表 +def combine_review_standards(truncate1,truncate3,output_folder,truncate0_jsonpath,clause_path,invalid_path,merged_baseinfo_path): #评标办法前附表 # 形式评审、响应评审:千问 file_id=upload_file(truncate1) #评标办法前附表 user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" results = qianwen_long(file_id, user_query_1) original_dict_data = extract_content_from_json(results) - qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容 + qualification_review = original_dict_data.pop('资格评审标准', {}) #qianwen-long有关资格评审的内容 with ThreadPoolExecutor() as executor: # 创建Future对象 - future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name) - future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath, - clause_path,input_file,output_folder) - + future_qualification = executor.submit(process_qualification, qualification_review, truncate3,invalid_path,merged_baseinfo_path) + future_form_response = executor.submit(process_reviews, original_dict_data,output_folder, truncate0_jsonpath, + clause_path) # 等待执行结果 final_qualify_json = future_qualification.result() form_response_dict = future_form_response.result() form_response_dict.update(final_qualify_json) - # return nest_json_under_key(form_response_dict,"资格审查") return {"资格审查":form_response_dict} if __name__ == "__main__": - input_file="D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf" + start_time=time.time() + truncate1 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_evaluation_method.pdf" + truncate3="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21" - # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf") - truncate2=os.path.join(output_folder,"ztbfile_tobidders_notice.pdf") - knowledge_name="zbtest20" - truncate1=os.path.join(output_folder,"ztbfile_evaluation_method.pdf") - truncate3=os.path.join(output_folder,"ztbfile_qualification.pdf") - clause_path = convert_clause_to_json(truncate2, output_folder) - truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json") - res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder) - print(res) \ No newline at end of file + # knowledge_name="zbtest20" + clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\clause1.json" + truncate0_jsonpath = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\truncate_output.json" + invalid_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" + merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" + res=combine_review_standards(truncate1,truncate3, output_folder,truncate0_jsonpath,clause_path,invalid_path,merged_baseinfo_path) + print(json.dumps(res,ensure_ascii=False,indent=4)) + end_time=time.time() + print("elapsed time:"+str(end_time-start_time)) \ No newline at end of file diff --git a/flask_app/main/资格审查模块old.py b/flask_app/main/资格审查模块old.py new file mode 100644 index 0000000..168864e --- /dev/null +++ b/flask_app/main/资格审查模块old.py @@ -0,0 +1,42 @@ +import os + +from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json +from flask_app.main.形式响应评审old import process_reviews +from flask_app.main.资格评审old import process_qualification +from flask_app.main.通义千问long import upload_file, qianwen_long +from concurrent.futures import ThreadPoolExecutor + + +def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表 + # 形式评审、响应评审:千问 + file_id=upload_file(truncate1) #评标办法前附表 + user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" + results = qianwen_long(file_id, user_query_1) + original_dict_data = extract_content_from_json(results) + qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容 + with ThreadPoolExecutor() as executor: + # 创建Future对象 + future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name) + future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath, + clause_path,input_file,output_folder) + + # 等待执行结果 + final_qualify_json = future_qualification.result() + form_response_dict = future_form_response.result() + form_response_dict.update(final_qualify_json) + # return nest_json_under_key(form_response_dict,"资格审查") + return {"资格审查":form_response_dict} + +if __name__ == "__main__": + input_file="D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf" + output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21" + # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf") + truncate2=os.path.join(output_folder,"ztbfile_tobidders_notice.pdf") + knowledge_name="zbtest20" + truncate1=os.path.join(output_folder,"ztbfile_evaluation_method.pdf") + truncate3=os.path.join(output_folder,"ztbfile_qualification.pdf") + clause_path = convert_clause_to_json(truncate2, output_folder) + truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json") + res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder) + print(res) \ No newline at end of file diff --git a/flask_app/main/资格评审.py b/flask_app/main/资格评审.py index da2ecc0..e9f7814 100644 --- a/flask_app/main/资格评审.py +++ b/flask_app/main/资格评审.py @@ -8,7 +8,8 @@ from flask_app.main.多线程提问 import multi_threading, read_questions_from_ from flask_app.main.通义千问long import upload_file, qianwen_long from flask_app.main.截取pdf import truncate_pdf_main -#这个函数的主要用途是将多个相关的字典(都包含 'common_key' 键)合并成一个更大的、综合的字典,所有相关信息都集中在 'common_key' 键下 + +# 这个函数的主要用途是将多个相关的字典(都包含 'common_key' 键)合并成一个更大的、综合的字典,所有相关信息都集中在 'common_key' 键下 def merge_dictionaries_under_common_key(dicts, common_key): """ dict1 = { @@ -94,26 +95,37 @@ def extract_matching_keys_qual(dict_data): # 获取联合体投标的要求,由于它不一定在资格审查表中,故调用rag def get_consortium_dict(merged_baseinfo_path): - consortium_dict={"联合体投标要求(如有)":""} + consortium_dict = {"联合体投标要求(如有)": ""} consortium_questions = "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)',嵌套键名为你对该要求的总结,而键值需要完全与原文保持一致,不要擅自总结、删减。" - file_id=upload_file(merged_baseinfo_path) - results1 = qianwen_long(file_id,consortium_questions) + file_id = upload_file(merged_baseinfo_path) + results1 = qianwen_long(file_id, consortium_questions) if results1: - consortium_dict=clean_json_string(results1) + consortium_dict = clean_json_string(results1) return consortium_dict -def get_all_dict(invalid_path): + +def get_all_dict(invalid_path, ques=None): + if ques is None: + ques = [] # qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt' - qualification_review_file_path='D:\\flask_project\\flask_app\\static\\提示词\\资格评审.txt' + qualification_review_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\资格评审.txt' questions = read_questions_from_file(qualification_review_file_path) - file_id=upload_file(invalid_path) - res1 = multi_threading(questions,"",file_id,2) + # 确保 questions 和 ques 都是列表 + if not isinstance(questions, list): + print("读取的问题不是一个列表。") + return {'资格评审': None} + if not isinstance(ques, list): + print("传入的额外问题不是一个列表。") + return {'资格评审': None} + questions += ques + file_id = upload_file(invalid_path) + res1 = multi_threading(questions, "", file_id, 2) qualification_list = [clean_json_string(res) for _, res in res1] if res1 else [] qualification_combined_res = combine_json_results(qualification_list) return {'资格评审': qualification_combined_res} -def process_qualification(qualification_review, truncate3, invalid_path,merged_baseinfo_path): +def process_qualification(qualification_review, truncate3, invalid_path, merged_baseinfo_path): # 资格评审 matching_keys_list, non_matching_dict = extract_matching_keys_qual( qualification_review) # matching_keys_list:['资质条件', '财务状况'] non_matching_dict:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} @@ -127,7 +139,7 @@ def process_qualification(qualification_review, truncate3, invalid_path,merged_b results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long res_list = [clean_json_string(res) for _, res in results2] if results2 else [] if res_list: - #生成外键是'资格评审'的字典 + # 生成外键是'资格评审'的字典 merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') consortium_dict = get_consortium_dict(merged_baseinfo_path) updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) @@ -139,7 +151,7 @@ def process_qualification(qualification_review, truncate3, invalid_path,merged_b print("资格评审: type2") return get_all_dict(invalid_path) or {"资格评审": ""} - else: # 此时要求全部写在评标办法前附表中,不需要额外提取。 + else: # 此时要求全部写在评标办法前附表中,不需要额外提取。 print("资格评审: type3") new_non_matching_json = {'资格评审': non_matching_dict} substring = '联合体' @@ -161,16 +173,18 @@ def process_qualification(qualification_review, truncate3, invalid_path,merged_b print("资格评审: type4") target = ["资质", "业绩", "财务", "信誉", "人员", "项目经理", "负责人", "联合体"] - # 检查 matching_keys_list 是否包含 target 中的任何元素 - if not any(any(t in key for t in target) for key in matching_keys_list): - # 如果不包含,使用 generate_qual_question 模板生成问题 - question_template = "该招标文件中{key}的内容是怎样的?请你以json格式返回结果,键名为{key},若存在嵌套内容,嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致,不要擅自总结、删减。" - questions = [question_template.format(key=key) for key in target] + # 找出不在 target 中的 keys + non_target_keys = [key for key in matching_keys_list if not any(t in key for t in target)] + if non_target_keys: + # 如果有不在 target 中的 keys,使用它们构建问题 + question_template = "该招标文件中{key}的内容是怎样的?请你以json格式返回结果,键名为'{key}',若存在嵌套内容,嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致,不要擅自总结、删减。" + questions = [question_template.format(key=key) for key in non_target_keys] + print(questions) # 假设这里有一个函数来处理这些问题并获取结果 - # results = process_questions(questions) - # final_qualification = process_results(results) + final_qualification = get_all_dict(invalid_path, questions) else: - # 如果包含,保持原有逻辑 + print("所有键都在target中") + # 如果所有键都在 target 中,保持原有逻辑 final_qualification = get_all_dict(invalid_path) final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict) @@ -202,10 +216,10 @@ if __name__ == "__main__": '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'} truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" # output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" - invalid_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" - merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" + invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" + merged_baseinfo_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" # knowledge_name = "招标解析word13" - res = process_qualification(qualification_review, truncate3,invalid_path,merged_baseinfo_path) + res = process_qualification(qualification_review, truncate3, invalid_path, merged_baseinfo_path) print(json.dumps(res, ensure_ascii=False, indent=4)) # 该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。 diff --git a/flask_app/main/test.py b/flask_app/main/资格评审5种情况测试脚本.py similarity index 74% rename from flask_app/main/test.py rename to flask_app/main/资格评审5种情况测试脚本.py index a8ad044..16cc488 100644 --- a/flask_app/main/test.py +++ b/flask_app/main/资格评审5种情况测试脚本.py @@ -60,7 +60,9 @@ def test_process_qualification_type4(): # Test inputs qualification_review = { '资质条件': '符合相关资质要求', - '财务状况': '符合财务健康标准' + '财务状况': '符合财务健康标准', + '联合体投标人(如有)':'xxxxx', + '分包':'符合附件规定' } truncate3 = "" # Missing qualification attachment invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" # Not used in this case @@ -75,4 +77,27 @@ def test_process_qualification_type4(): # Execute the test -test_process_qualification_type4() \ No newline at end of file +# test_process_qualification_type4() + + +def test_process_qualification_type5(): + # Test inputs + qualification_review = { + '资质条件': '符合相关资质要求', + '财务状况': '符合财务健康标准', + '营业执照': '具备有效的营业执照' + } + truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" # Qualification attachment provided + invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" + merged_baseinfo_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" + + # Call the function + res = process_qualification(qualification_review, truncate3, invalid_path, merged_baseinfo_path) + + # Print the output + print("Test Case 5 - Type5 Output:") + print(json.dumps(res, ensure_ascii=False, indent=4)) + + +# Execute the test +test_process_qualification_type5() \ No newline at end of file diff --git a/flask_app/货物标/货物标截取pdf.py b/flask_app/货物标/货物标截取pdf.py index a36de80..a880be0 100644 --- a/flask_app/货物标/货物标截取pdf.py +++ b/flask_app/货物标/货物标截取pdf.py @@ -16,7 +16,7 @@ def clean_page_content(text, common_header): text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 - + text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 return text @@ -133,25 +133,21 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt file_path = os.path.join(input_path, file_name) if is_pdf_or_doc(file_path): result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - if result: - if isinstance(result, tuple): - generated_files.extend([f if f else "" for f in result]) - else: - generated_files.append(result if result else "") + if isinstance(result, tuple): + generated_files.extend([f if f else "" for f in result]) # 保留空字符串 + else: + generated_files.append(result) # 直接添加result,可能是空字符串 elif os.path.isfile(input_path) and is_pdf_or_doc(input_path): result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - if result: - if isinstance(result, tuple): - generated_files.extend([f if f else "" for f in result]) - else: - generated_files.append(result if result else "") + if isinstance(result, tuple): + generated_files.extend([f if f else "" for f in result]) # 保留空字符串 + else: + generated_files.append(result) # 直接添加result,可能是空字符串 else: print("提供的路径既不是文件夹也不是PDF文件。") return generated_files - - # 默认逻辑是start_page匹配上就不再设置了,一般不匹配上目录的原因是设置了begin_page=5,但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。 def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, output_suffix="normal"): @@ -383,7 +379,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): else: print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") return "" - return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix, ) except Exception as e: @@ -617,15 +612,16 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections): return truncate_files -# TODO:交通智能系统和招标(1)(1)文件有问题 input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\2275fd59-5f5b-48f9-8c68-31d3fde7f927\\ztbfile.pdf"这种截取失败的时候没有生成默认路径,报错 +# TODO:交通智能系统和招标(1)(1)文件有问题 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 if __name__ == "__main__": - # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" - input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\2275fd59-5f5b-48f9-8c68-31d3fde7f927\\ztbfile.pdf" - output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\2275fd59-5f5b-48f9-8c68-31d3fde7f927" + # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" + input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf" + output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp" + # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zboutpub" files = truncate_pdf_multiple(input_path, output_folder) # files=truncate_pdf_specific_goods(input_path,output_folder) print(files) # selection = 1# 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-公告 # generated_files = truncate_pdf_main(input_path, output_folder, selection) - + # print(generated_files) diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index af3cc25..ed7eb9f 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -181,44 +181,27 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): logger.error(f"Error processing {key}: {exc}") yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) - # # 只有在需要 knowledge_name 和 index 时才等待 future_knowledge 完成 - # try: - # knowledge_name = "招标解析" + unique_id - # index = processed_data['knowledge_future'].result() # 阻塞等待知识库上传任务完成 - # - # # 提交依赖 knowledge_name 和 index 的任务 - # future_dependencies = { - # 'base_info': executor.submit(fetch_project_basic_info,processed_data['merged_baseinfo_path'], processed_data['procurement_path']), - # 'qualification_review': executor.submit(fetch_qualification_review, output_folder, - # processed_data['qualification_path'], - # processed_data['notice_path'], processed_data['merged_baseinfo_path']), - # } - # # 按完成顺序返回依赖任务的结果 - # for future in concurrent.futures.as_completed(future_dependencies.values()): - # key = next(k for k, v in future_dependencies.items() if v == future) - # try: - # result = future.result() - # yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False) - # except Exception as exc: - # logger.error(f"Error processing {key}: {exc}") - # yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) - # - # except Exception as e: - # logger.error(f"Error uploading to knowledge base: {e}") - # yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False) - - # 删除知识索引 - # deleteKnowledge(index) - #TODO:目前的无效标这块的键值都删去空格了,所有的键名都删去空格 #广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题 if __name__ == "__main__": - output_folder = "flask_app/static/output/zytest1" + import time - start_time = time.time() + # 配置日志器 + unique_id = "uuidzyzy11" + logger = get_global_logger(unique_id) + + output_folder = "flask_app/static/output/zytest1" file_type = 1 # 1:docx 2:pdf 3:其他 input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf" - goods_bid_main(output_folder, input_file, file_type, "uuidzyzy11") + start_time = time.time() + # 创建生成器 + generator = goods_bid_main(output_folder, input_file, file_type, unique_id) + + # 迭代生成器,逐步获取和处理结果 + for output in generator: + print(output) + end_time = time.time() elapsed_time = end_time - start_time # 计算耗时 - print(f"Function execution took {elapsed_time} seconds.") + print(f"Function execution took {elapsed_time:.2f} seconds.") +