10.21工程标快速版

This commit is contained in:
zy123 2024-10-21 17:31:48 +08:00
parent b6da91d7b6
commit e63b719572
15 changed files with 874 additions and 237 deletions

View File

@ -130,8 +130,8 @@ def process_all_part1_pdfs(folder_path, output_folder):
extract_tables_main(file_path, subfolder_path)
if __name__ == "__main__":
path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件
path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_tobidders_notice_table.docx'
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" # 前附表json文件
res=extract_tables_main(path, output_folder)
#
# folder_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4'

View File

@ -0,0 +1,218 @@
# -*- encoding:utf-8 -*-
import json
import logging
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.main.json_utils import transform_json_values, combine_json_results
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
import concurrent.futures
from flask_app.main.基础信息整合快速版 import combine_basic_info
from flask_app.main.资格审查模块 import combine_review_standards
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
from flask_app.main.format_change import pdf2docx, docx2pdf
from flask_app.main.docx截取docx import copy_docx
def get_global_logger(unique_id):
    """Return the logger named ``unique_id``; fall back to the root logger when it is None."""
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)
logger=None
# 创建全局线程池
executor = ThreadPoolExecutor()
def preprocess_files(output_folder, downloaded_file_path, file_type):
    """Convert and slice the downloaded tender file, returning the paths of every derived artifact.

    Args:
        output_folder: Directory handed to the truncation / extraction helpers.
        downloaded_file_path: Path of the file to process.
        file_type: 1 for docx input, 2 for pdf input.

    Returns:
        A dict of artifact paths, or None when ``file_type`` is neither 1 nor 2.
    """
    logger.info("starting 文件预处理...")
    logger.info("output_folder..." + output_folder)

    # Guard clause: only docx (1) and pdf (2) are handled.
    if file_type not in (1, 2):
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    if file_type == 1:
        # docx input: derive the pdf used by the truncation step.
        docx_path = downloaded_file_path
        pdf_path = docx2pdf(docx_path)
    else:
        # pdf input: derive the docx used by copy_docx below.
        pdf_path = downloaded_file_path
        docx_path = pdf2docx(pdf_path)

    # Slice the pdf into its sections; the list is indexed positionally below.
    sections = truncate_pdf_multiple(pdf_path, output_folder)

    notice_table_docx = pdf2docx(sections[0])
    invalid_path = sections[5]
    invalid_docpath = copy_docx(docx_path)
    notice_table_json = extract_tables_main(notice_table_docx, output_folder)
    merged_baseinfo_path = sections[-1]
    clause_path = convert_clause_to_json(sections[2], output_folder)
    logger.info("文件预处理done")

    return {
        'file_path': downloaded_file_path,
        'output_folder': output_folder,
        'invalid_path': invalid_path,
        'truncate0': sections[0],
        'truncate1': sections[1],
        'truncate2': sections[2],
        'truncate3': sections[3],
        'truncate0_jsonpath': notice_table_json,
        'merged_baseinfo_path': merged_baseinfo_path,
        'clause_path': clause_path,
        'invalid_docpath': invalid_docpath,
    }
def post_processing(data, includes):
    """Keep the keys listed in ``includes`` at top level and group every other key under '其他'.

    The '其他' entry is dropped again when nothing ended up in it.
    """
    result = {"其他": {}}
    for key, value in data.items():
        # Route each pair either to the top level or to the catch-all bucket.
        bucket = result if key in includes else result["其他"]
        bucket[key] = value
    if not result["其他"]:
        result.pop("其他")
    return result
# 基本信息
def fetch_project_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path):
    """Run ``combine_basic_info`` on the given paths, logging before and after, and return its result."""
    logger.info("starting基础信息...")
    basic_info = combine_basic_info(
        merged_baseinfo_path,
        truncate0,
        truncate2,
        clause_path,
    )
    logger.info("基础信息done")
    return basic_info
# 形式、响应、资格评审
def fetch_qualification_review(truncate1, truncate3, output_folder, truncate0_jsonpath, clause_path, invalid_path, merged_baseinfo_path):
    """Run ``combine_review_standards`` on the given paths, logging before and after, and return its result."""
    logger.info("starting资格审查...")
    review_result = combine_review_standards(
        truncate1,
        truncate3,
        output_folder,
        truncate0_jsonpath,
        clause_path,
        invalid_path,
        merged_baseinfo_path,
    )
    logger.info("资格审查done")
    return review_result
# 评分细则 流式
def fetch_evaluation_standards(truncate1):
    """Split the result of ``combine_evaluation_standards`` into technical and commercial parts.

    Returns:
        ``{"technical_standards": {"技术评分": ...}, "commercial_standards": {"商务评分": ...}}``;
        a section missing from the combined result defaults to an empty dict.
    """
    logger.info("starting 商务标和技术标...")
    combined = combine_evaluation_standards(truncate1)
    technical = combined.get("技术评分", {})
    commercial = combined.get("商务评分", {})
    logger.info("商务标和技术标 done")
    return {
        "technical_standards": {"技术评分": technical},
        "commercial_standards": {"商务评分": commercial},
    }
# 无效、废标项解析
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
    """Run ``combine_find_invalid`` on the given paths, logging before and after, and return its result."""
    logger.info("starting无效标与废标...")
    invalid_result = combine_find_invalid(
        invalid_docpath,
        output_folder,
        truncate0_jsonpath,
        clause_path,
        truncate3,
    )
    logger.info("无效标与废标done...")
    return invalid_result
# 投标文件要求
def fetch_bidding_documents_requirements(clause_path):
    """Call ``extract_from_notice(clause_path, 1)`` and wrap the result under the '投标文件要求' key."""
    logger.info("starting投标文件要求...")
    requirements = extract_from_notice(clause_path, 1)
    logger.info("投标文件要求done...")
    wrapped = {"投标文件要求": requirements}
    return wrapped
# 开评定标流程
def fetch_bid_opening(clause_path):
    """Call ``extract_from_notice(clause_path, 2)`` and wrap the result under the '开评定标流程' key."""
    logger.info("starting开评定标流程...")
    opening_flow = extract_from_notice(clause_path, 2)
    logger.info("开评定标流程done...")
    wrapped = {"开评定标流程": opening_flow}
    return wrapped
#分段返回
def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id):
    """Preprocess a tender file and stream each analysis result as a JSON string.

    This is a generator: the six analysis tasks run on a thread pool and each
    result is yielded as soon as its task finishes, so the order of the yielded
    chunks is not fixed.

    Args:
        output_folder: Directory passed to preprocessing and to the tasks that write files.
        downloaded_file_path: Path of the tender file.
        file_type: 1 for docx, 2 for pdf (anything else makes preprocessing return None).
        unique_id: Name of the logger to install as the module-level ``logger``.

    Yields:
        JSON strings of the form ``{"<task key>": ...}``. The evaluation task is
        split into two chunks, ``technical_standards`` and ``commercial_standards``.
        A failing task yields ``{"error": "..."}``. When preprocessing fails, a
        single ``{}`` is yielded and the generator stops.
    """
    global logger
    logger = get_global_logger(unique_id)

    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type)
    if not processed_data:
        yield json.dumps({})
        # Stop here: without this return the code below would subscript None
        # and raise TypeError right after emitting the empty payload.
        return

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Map each future back to its task key so a completed future can be
        # labelled with a direct dict lookup.
        future_to_key = {
            pool.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'],
                        processed_data['truncate0'],
                        processed_data['truncate2'], processed_data['clause_path']): 'base_info',
            pool.submit(fetch_qualification_review, processed_data['truncate1'],
                        processed_data['truncate3'], output_folder,
                        processed_data['truncate0_jsonpath'],
                        processed_data['clause_path'], processed_data['invalid_path'],
                        processed_data['merged_baseinfo_path']): 'qualification_review',
            pool.submit(fetch_evaluation_standards, processed_data['truncate1']): 'evaluation_standards',
            pool.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                        output_folder, processed_data['truncate0_jsonpath'],
                        processed_data['clause_path'], processed_data['truncate3']): 'invalid_requirements',
            pool.submit(fetch_bidding_documents_requirements,
                        processed_data['clause_path']): 'bidding_documents_requirements',
            pool.submit(fetch_bid_opening, processed_data['clause_path']): 'opening_bid',
        }

        # Emit results in completion order.
        for future in concurrent.futures.as_completed(future_to_key):
            key = future_to_key[future]
            try:
                result = future.result()
                if key == 'evaluation_standards':
                    # The evaluation task bundles two sections; stream them separately.
                    technical_standards = result["technical_standards"]
                    commercial_standards = result["commercial_standards"]
                    yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False)
                    yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False)
                else:
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
            except Exception as exc:
                # One failing task must not abort the stream; report it as its own chunk.
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
if __name__ == "__main__":
    # Manual smoke run: stream every chunk for a local sample pdf and time the whole pass.
    started_at = time.time()
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
    file_type = 2  # 1: docx, 2: pdf, 3: other
    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
    print("yes")
    for chunk in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
        print(chunk)
    elapsed_time = time.time() - started_at
    print(f"Function execution took {elapsed_time} seconds.")

View File

@ -1,13 +1,16 @@
# -*- encoding:utf-8 -*-
import os
import re
import json
import time
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.main.json_utils import extract_content_from_json
from flask_app.main.json_utils import clean_json_string
from flask_app.main.截取pdf import truncate_pdf_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.main.多线程提问 import upload_file
from flask_app.main.截取pdf import merge_pdfs
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
@ -63,27 +66,60 @@ def update_json_data(original_data, updates1, updates2,second_response_list):
return original_data
def judge_second_jump(input_dict):
formatted_questions = []
keys_to_remove = []
chapter_pattern = re.compile(r'\s*标\s*公\s*告|第\s*一\s*章')
question_template = (
"请你根据招标文件中的内容回答:关于{short_key}{additional_text},第一章招标公告中的内容是怎样的?"
"请按json格式给我提供信息键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,"
"如果存在未知信息,请在对应键值处填'未知'"
)
for key, value in input_dict.items():
if '招标公告' in value or '第一章' in value:
formatted_entry = f"关于'{key}'第一章招标公告中的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
if chapter_pattern.search(value):
short_key = key.split('.')[-1] if '.' in key else key
additional_text = get_additional_text(key)
formatted_entry = question_template.format(
short_key=short_key,
full_key=key,
additional_text=additional_text
)
formatted_questions.append(formatted_entry)
keys_to_remove.append(key) # 标记需要删除的键
# 从原始字典中删除处理过的键值对
for key in keys_to_remove:
del input_dict[key]
return input_dict, formatted_questions
def extract_matching_keys(json_data):
def extract_matching_keys(original_data):
"""
提取带序号的内容对于已经完整的内容不提取但会在后面用新数据更新orginal_data
:param json_data:
{
"投标要求": {
"资质要求": "符合第3章第2条规定",
"业绩要求": "满足招标文件第4章相关条目"
},
"其他信息": {
"投标文件格式": "按要求填写"
}
}
:return:
[
{"投标要求.资质要求": "符合第3章第2条规定"},
{"投标要求.业绩要求": "满足招标文件第4章相关条目"}
]
"""
#'投标人名称': '符合招标公告第3.2条规定'
# 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。
if isinstance(json_data, str):
data = json.loads(json_data)
if isinstance(original_data, str):
data = json.loads(original_data)
else:
data = json_data
data = original_data
# 正则表达式匹配
include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"第.*?条")]
@ -125,91 +161,160 @@ def extract_matching_keys(json_data):
return final_matching
def reformat_questions(match_keys, input_path, output_folder):
#同类表达扩展
def get_additional_text(key):
    """Return the alias hint for the first mapped keyword contained in ``key``, or '' if none matches."""
    additional_text_map = {
        "投标内容": "(或招标范围)",
        "工程质量": "(或质量要求)",
        # More keyword -> hint pairs can be added here.
    }
    return next(
        (hint for keyword, hint in additional_text_map.items() if keyword in key),
        "",
    )
def process_tender_entries(match_keys,output_folder):
"""
:param match_keys:
match_keys = [
{"资质要求": "符合相关资质要求"},
{"业绩要求": "3.1.2 近三年完成类似项目"},
{"财务要求": "招标公告 2.3 财务状况良好"},
{"人员要求": "项目经理具有相关资格"}
]
:param notice_path:
notice_path="xxxx"
:return:
[{'业绩要求': '3.1.2'}]
["关于'资质要求'相关资质要求的内容是怎样的请按json格式给我提供信息键名为'资质要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'", "关于'人员要求'项目经理具有相关资格的内容是怎样的请按json格式给我提供信息键名为'人员要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"]
[{'财务要求': '2.3'}]
"xxxxxx.json"
"""
entries_with_numbers = []
entries_with_numbers2 = [] # 用于保存特别处理的条目
entries_with_numbers2 = []
formatted_questions = []
clause2_path = ""
# 正则表达式,同时匹配全角和半角括号
pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\](\d+)[\)\])?') # 识别包含数字序列的特定格式
pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[$\](\d+)[$\])?')
# 新的正则表达式,用于匹配 '招标公告' 和 '第一章',允许内部有空格
chapter_pattern = re.compile(r'\s*标\s*公\s*告|第\s*一\s*章')
# 初始化文件处理
generated_files = None
file_processed = False
question_template = (
"关于'{key}'{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
"请你根据招标文件中的内容回答:关于{short_key}{additional_text}{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
)
def format_num(match):
return match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
def add_formatted_question(key, value):
revised_value = value.replace('符合', '')
short_key = key.split('.')[-1] if '.' in key else key
additional_text = get_additional_text(key)
formatted_questions.append(question_template.format(
short_key=short_key,
full_key=key,
revised_value=revised_value,
additional_text=additional_text
))
for entry in match_keys:
key, value = next(iter(entry.items()))
match = pattern.search(value)
if '招标公告' in value or '第一章' in value:
if not file_processed:
# 只处理一次文件操作
generated_files = truncate_pdf_main(input_path, output_folder, 5)
file_processed = True # 设置标志位,避免重复处理文件
if generated_files:
clause2_path = convert_clause_to_json(generated_files[0], output_folder, 2)
if match and generated_files:
# 提取序号并添加到entries_with_numbers2
num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
entries_with_numbers2.append({key: num})
if chapter_pattern.search(value):
if match:
notice_path = next((os.path.join(root, file) for root, _, files in os.walk(output_folder)
for file in files if file.endswith('_notice.pdf')), "")
if notice_path:
clause2_path = convert_clause_to_json(notice_path, output_folder, 2)
entries_with_numbers2.append({key: format_num(match)})
else:
revised_value = value.replace('符合', '')
formatted_entry = question_template.format(key=key, revised_value=revised_value)
formatted_questions.append(formatted_entry)
add_formatted_question(key, value)
elif match:
num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
entries_with_numbers.append({key: num})
entries_with_numbers.append({key: format_num(match)})
else:
revised_value = value.replace('符合', '')
formatted_entry = question_template.format(key=key, revised_value=revised_value)
formatted_questions.append(formatted_entry)
add_formatted_question(key, value)
return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path,input_file,output_folder):
def fetch_specific_pdf(output_folder):
    """Merge every '*merged_baseinfo.pdf' / '*tobidders_notice.pdf' found under ``output_folder``.

    Returns:
        The path of the merged file (``<output_folder>/merged_reviews.pdf``),
        or "" when no matching pdf is found.
    """
    wanted_suffixes = ('merged_baseinfo.pdf', 'tobidders_notice.pdf')
    pdf_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(output_folder)
        for name in names
        if name.endswith(wanted_suffixes)
    ]
    if not pdf_paths:
        # Nothing to merge.
        return ""
    output_path = os.path.join(output_folder, "merged_reviews.pdf")
    merge_pdfs(pdf_paths, output_path)
    return output_path
def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, clause_json_path):
combined_results2 = {}
matched_keys = extract_matching_keys(original_dict_data) #output:[{'形式评审标准.投标文件签字盖章': '符合第一章“投标人须知”第 3.7.34目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典
entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder)
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath,
clause_json_path) # 调用根据条款号整合json.py
combined_results1, formatted_questions2 = judge_second_jump(combined_results) #判断是否需要二次跳转
if entries_with_numbers2: #跳转第一章招标公告
first_response_list = []
matched_keys = extract_matching_keys(original_dict_data)
entries_with_numbers, formatted_questions1, entries_with_numbers2, clause2_path = process_tender_entries(matched_keys, output_folder)
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path)
combined_results1, formatted_questions2 = judge_second_jump(combined_results)
if entries_with_numbers2:
combined_results2 = process_and_merge2(entries_with_numbers2, clause2_path)
results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name) #无序号的直接问大模型
first_response_list = []
# 只有当有问题需要处理时才执行PDF相关操作
formatted_questions = formatted_questions1 + formatted_questions2
if formatted_questions:
output_path = fetch_specific_pdf(output_folder)
if output_path:
file_id = upload_file(output_path)
results = multi_threading(formatted_questions, "", file_id, 2)
first_response_list = [clean_json_string(res) for _, res in results] if results else []
for _, response in results_2:
try:
if response and len(response) > 1: # 检查response存在且有至少两个元素
temp = extract_content_from_json(response[1])
first_response_list.append(temp)
else:
print(f"形式响应评审Warning: Missing or incomplete response data for query index {_}.")
except Exception as e:
print(f"形式响应评审Error processing response for query index {_}: {e}")
# Assume JSON file paths are defined or configured correctly
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.45'}]
updated_json = update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list)
return updated_json
if __name__ == "__main__":
start_time=time.time()
knowledge_name="zbfile"
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
original_dict_data={'形式评审标准': {'投标文件': '符合第一章规定', '投标人名称': '符合招标公告第3.2条规定', '投标文件签字': '符合第二章“投标人须知”第3.7.34目规定', '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨', '联合体投标人': '提交联合体协议书,并明确联合体牵头人', '报价唯一': '只能有一个有效报价', '多标段投标': '符合第二章“投标人须知”第 10.1款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第1.3.1项规定', '工期': '符合第二章“投标人须知”第1.3.2项规定', '工程质量': '符合第二章“投标人须知”第1.3.3项规定'}}
input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path,input_file,output_folder)
# knowledge_name="zbfile"
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\truncate_output.json"
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\clause1.json"
original_dict_data={
"形式评审标准": {
"投标文件": "投标文件能正常打开",
"投标人名称": "与营业执照、资质证书、安全生产许可证一致",
"投标文件签字盖章": "符合第二章“投标人须知”规定",
"投标文件格式、内容": "符合第八章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨",
"联合体投标人(如有)": "提交联合体协议书,并明确联合体牵头人",
"报价唯一": "只能有一个有效报价(指投标函中的大写报价)",
"多标段投标": "符合第二章“投标人须知”规定",
"“技术暗标”": "符合第二章“投标人须知”规定"
},
"响应性评审标准": {
"投标内容": "符合第二章“投标人须知”规定",
"工期": "符合第二章“投标人须知”规定",
"工程质量": "符合第二章“投标人须知”规定",
"投标有效期": "符合第二章“投标人须知”规定",
"投标保证金": "符合第二章“投标人须知”规定",
"权利义务": "投标函附录中的相关承诺符合或优于第四章“合同条款及格式”的相关规定",
"技术标准和要求": "符合第七章“技术标准和要求”规定",
"分包计划": "符合第二章“投标人须知”规定",
"算术错误修正": "1)投标人接受算术错误修正后的报价 2)修正后的报价与投标报价相比偏差率不超过±1%",
"投标价格": "1投标函中的大写报价与已标价工程量清单中的投标总价一致 2投标函中的大写报价不大于本标段招标控制价总价 3算术错误修正后的投标总报价不大于本标段招标控制价总价 4投标报价不低于其成本",
"已标价工程量清单": "1已标价工程量清单项目编码顺序与第五章“工程量清单”给出的项目编码顺序一致 2已标价工程量清单符合第五章“工程量清单”给出的项目编码、项目名称、项目特征、计量单位和工程量 3暂列金额符合第五章“工程量清单”列出的金额 4专业工程暂估价符合第五章“工程量清单”列出的金额 5材料工程设备暂估价符合第五章“工程量清单”列出的单价并计入综合单价 6安全文明施工费、规费和税金等不可竞争费用按照规定的标准计取 7计税方法符合招标文件的约定 8已标价工程量清单项目未填报的项目不超过三项或不超过三项未填报的项目的费用合计按招标控制价相应项目的费用合计计算不超过其投标总报价修正后的投标总报价如有的±1%"
}
}
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test"
# notice_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_notice.pdf"
formal_json = process_reviews(original_dict_data,output_folder, truncate_tobidders_table_json_path, clause_path)
data = json.dumps(formal_json, ensure_ascii=False, indent=4)
end_time=time.time()
elapsed_time = end_time - start_time

View File

@ -0,0 +1,219 @@
# -*- encoding:utf-8 -*-
import re
import json
import time
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.main.json_utils import extract_content_from_json
from flask_app.main.截取pdf import truncate_pdf_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
## 技能
### 技能 1文档解析与摘要
- 深入理解并分析${document1}的内容提取关键信息
- 根据需求生成简洁明了的摘要保持原文核心意义不变
### 技能 2信息检索与关联
- ${document1}中高效检索特定信息或关键词
- 能够识别并链接到文档内部或外部的相关内容增强信息的连贯性和深度
## 限制
- 所有操作均需基于${document1}的内容不可超出此范围创造信息
- 在处理敏感或机密信息时需遵守严格的隐私和安全规定
- 确保所有生成或改编的内容逻辑连贯无误导性信息
请注意上述技能执行时将直接利用并参考${document1}的具体内容以确保所有产出紧密相关且高质量
"""
def update_json_data(original_data, updates1, updates2, second_response_list):
    """Overwrite values in ``original_data`` in place using dot-separated keys.

    Args:
        original_data: dict that is mutated and returned.
        updates1: dict of updates, applied first.
        updates2: dict of updates, applied second.
        second_response_list: iterable of dicts, applied last in order.

    Returns:
        ``original_data`` itself, after all updates.
    """
    def recursive_update(data, key, value):
        # Walk (creating as needed) the nested dicts named by every segment but the last.
        *parents, leaf = key.split('.')
        node = data
        for part in parents:
            node = node.setdefault(part, {})
        current = node.get(leaf, None)
        if isinstance(value, dict) and isinstance(current, dict):
            # Both sides are dicts: shallow-merge, with the new value winning on conflicts.
            node[leaf] = {**current, **value}
        else:
            node[leaf] = value

    def apply_updates(mapping):
        for dotted_key, new_value in mapping.items():
            recursive_update(original_data, dotted_key, new_value)

    apply_updates(updates1)
    apply_updates(updates2)
    for response_dict in second_response_list:
        apply_updates(response_dict)
    return original_data
def judge_second_jump(input_dict):
    """Pull out entries whose value mentions '招标公告' or '第一章' and turn them into questions.

    The matching keys are removed from ``input_dict`` in place.

    Returns:
        ``(input_dict, formatted_questions)`` with one question string per removed key.
    """
    jump_keys = [
        key for key, value in input_dict.items()
        if '招标公告' in value or '第一章' in value
    ]
    formatted_questions = [
        f"关于'{key}'第一章招标公告中的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
        for key in jump_keys
    ]
    # Drop the handled entries only after iteration is over.
    for key in jump_keys:
        input_dict.pop(key)
    return input_dict, formatted_questions
def extract_matching_keys(json_data):
    """Collect leaf values that look like references to another clause or chapter.

    Args:
        json_data: a dict, or a JSON string that is parsed with ``json.loads``.

    Returns:
        A list of single-entry dicts ``{"dotted.key.path": value_as_str}`` for every
        leaf that passes the include / exclude filters below.
    """
    # Example: '投标人名称': '符合招标公告第3.2条规定'
    data = json.loads(json_data) if isinstance(json_data, str) else json_data

    include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"), re.compile(r"第.*?条")]
    additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
    exclude_patterns = ["投标文件格式", "权利义务", "技术标准", "工程量清单"]
    additional_exclude_patterns = ["按要求", "按规定"]

    final_matching = []

    def is_reference(key, text):
        # Hard excludes are checked first, against both the key and the value.
        if any(ex in key or ex in text for ex in exclude_patterns):
            return False
        # The value must match at least one primary include pattern ...
        if not any(rx.search(text) for rx in include_patterns):
            return False
        # ... must not hit a secondary exclude ...
        if any(ex in key or ex in text for ex in additional_exclude_patterns):
            return False
        # ... and must also match a secondary include pattern.
        return any(rx.search(text) for rx in additional_include_patterns)

    def walk(node, path):
        if isinstance(node, dict):
            for key, value in node.items():
                child_path = path + [key]
                if isinstance(value, (dict, list)):
                    walk(value, child_path)
                    continue
                text = str(value)
                if is_reference(key, text):
                    # Nested keys are reported in dot notation.
                    final_matching.append({".".join(child_path): text})
        elif isinstance(node, list):
            # List items inherit the path of the list itself.
            for item in node:
                walk(item, path)

    walk(data, [])
    return final_matching
def reformat_questions(match_keys, input_path, output_folder):
    """Sort matched review entries into clause-number lookups and free-form questions.

    Args:
        match_keys: list of single-entry dicts ``{key: value}``.
        input_path: path passed to ``truncate_pdf_main`` (selection 5) when an entry
            mentions '招标公告' or '第一章'.
        output_folder: folder passed to ``truncate_pdf_main`` and ``convert_clause_to_json``.

    Returns:
        A 4-tuple ``(entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path)``:
        - entries_with_numbers: ``{key: clause_number}`` for entries carrying a clause number;
        - formatted_questions: question strings for entries without a usable clause number;
        - entries_with_numbers2: ``{key: clause_number}`` for numbered entries that mention
          '招标公告' or '第一章', when the section pdf was produced;
        - clause2_path: result of ``convert_clause_to_json`` for that section, or "".
    """
    entries_with_numbers = []
    entries_with_numbers2 = []  # numbered entries that mention '招标公告' / '第一章'
    formatted_questions = []
    clause2_path = ""
    # Dotted clause number with an optional parenthesised sub-item; both half-width
    # "(4)" and full-width "(4)" parentheses are accepted.
    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\((](\d+)[\))])?')
    notice_pdf = ""
    file_processed = False
    question_template = (
        "关于'{key}'{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
    )

    def clause_number(match):
        # Normalise to "<number>(<sub>)" with half-width parentheses.
        suffix = f"({match.group(2)})" if match.group(2) else ""
        return match.group(1) + suffix

    def ask_directly(key, value):
        revised_value = value.replace('符合', '')
        formatted_questions.append(question_template.format(key=key, revised_value=revised_value))

    for entry in match_keys:
        key, value = next(iter(entry.items()))
        match = pattern.search(value)
        if '招标公告' in value or '第一章' in value:
            if not file_processed:
                # The pdf truncation is done only once per call.
                generated_files = truncate_pdf_main(input_path, output_folder, 5)
                file_processed = True
                # truncate_pdf_main signals failure with [""], which is truthy;
                # only a non-empty first path counts as success.
                notice_pdf = generated_files[0] if generated_files else ""
                if notice_pdf:
                    clause2_path = convert_clause_to_json(notice_pdf, output_folder, 2)
            if match and notice_pdf:
                entries_with_numbers2.append({key: clause_number(match)})
            else:
                ask_directly(key, value)
        elif match:
            entries_with_numbers.append({key: clause_number(match)})
        else:
            ask_directly(key, value)

    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
def process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_json_path, input_file, output_folder):
    """Resolve clause references in the review data and write the resolved content back.

    Steps, as implemented below: extract reference-like entries, split them into
    numbered lookups and free-form questions, resolve the numbered ones through
    ``process_and_merge_entries`` / ``process_and_merge2``, send the questions to
    ``multi_threading``, then merge everything into ``original_dict_data``.

    Returns:
        The value returned by ``update_json_data`` (the updated ``original_dict_data``).
    """
    matched_keys = extract_matching_keys(original_dict_data)
    entries_with_numbers, formatted_questions1, entries_with_numbers2, clause2_path = reformat_questions(
        matched_keys, input_file, output_folder)

    # Numbered entries are resolved from the clause json files.
    combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path)
    # Resolved text may itself point at chapter one; those become extra questions.
    combined_results1, formatted_questions2 = judge_second_jump(combined_results)

    combined_results2 = process_and_merge2(entries_with_numbers2, clause2_path) if entries_with_numbers2 else {}

    # Entries without a clause number are asked directly.
    results_2 = multi_threading(formatted_questions1 + formatted_questions2, knowledge_name)

    first_response_list = []
    for query_index, response in results_2:
        try:
            # A usable response has at least two elements; the second is parsed.
            if not (response and len(response) > 1):
                print(f"形式响应评审Warning: Missing or incomplete response data for query index {query_index}.")
                continue
            first_response_list.append(extract_content_from_json(response[1]))
        except Exception as e:
            print(f"形式响应评审Error processing response for query index {query_index}: {e}")

    return update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list)
if __name__ == "__main__":
    # Manual smoke run against a local sample; prints the elapsed time and the updated JSON.
    start_time = time.time()
    knowledge_name = "zbfile"
    base_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
    truncate_tobidders_table_json_path = base_dir + "\\truncate_output.json"
    clause_path = base_dir + "\\clause.json"
    original_dict_data = {
        '形式评审标准': {
            '投标文件': '符合第一章规定',
            '投标人名称': '符合招标公告第3.2条规定',
            '投标文件签字': '符合第二章“投标人须知”第3.7.34目规定',
            '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨',
            '联合体投标人': '提交联合体协议书,并明确联合体牵头人',
            '报价唯一': '只能有一个有效报价',
            '多标段投标': '符合第二章“投标人须知”第 10.1款规定',
        },
        '响应性评审标准': {
            '投标内容': '符合第二章“投标人须知”第1.3.1项规定',
            '工期': '符合第二章“投标人须知”第1.3.2项规定',
            '工程质量': '符合第二章“投标人须知”第1.3.3项规定',
        },
    }
    input_file = base_dir + "\\ztbfile.pdf"
    output_folder = base_dir
    formal_json = process_reviews(original_dict_data, knowledge_name, truncate_tobidders_table_json_path, clause_path, input_file, output_folder)
    data = json.dumps(formal_json, ensure_ascii=False, indent=4)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Function execution took {elapsed_time} seconds.")
    print(data)
#关于'技术暗标'第二章“投标人须知”规定的内容是怎样的请按json格式给我提供信息键名为'技术暗标',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。

View File

@ -14,7 +14,7 @@ def clean_page_content(text, common_header):
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
return text
#PYPDF2库
@ -170,7 +170,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
break # 找到结束页后退出循环
if start_page is None or end_page is None:
print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
else:
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
@ -219,7 +219,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
if output_suffix == "qualification" or output_suffix =="invalid":
return extract_pages_twice(pdf_path, output_folder, output_suffix)
else:
print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
else:
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
@ -250,8 +250,8 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
print("提供的路径既不是文件夹也不是PDF文件。")
return []
def truncate_pdf_main(input_path, output_folder, selection):
try:
if selection == 1:
# Configure patterns and phrases for "投标人须知前附表"
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知')
@ -298,10 +298,16 @@ def truncate_pdf_main(input_path, output_folder, selection):
output_suffix = "invalid"
else:
print("无效的选择:请选择1-6")
return None
return [""]
# 调用 process_input 进行截取
output_paths = process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if not output_paths:
return [""] # 截取失败时返回空字符串
return output_paths
except Exception as e:
print(f"Error in truncate_pdf_main for selection {selection}: {e}")
return [""] # 捕获异常时返回空字符串
# Process the selected input
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
def truncate_pdf_multiple_old(input_path, output_folder):
@ -311,21 +317,33 @@ def truncate_pdf_multiple_old(input_path, output_folder):
truncate_files.extend(files)
return truncate_files
def truncate_pdf_multiple(input_path, output_folder):
base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名
truncate_files = []
for selection in range(1, 7):
files = truncate_pdf_main(input_path, output_folder, selection)
if files:
truncate_files.extend(files)
if truncate_files:
else:
truncate_files.append("") # 截取失败时添加空字符串
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
merge_selected_pdfs(output_folder, truncate_files, merged_output_path,base_file_name)
truncate_files.append(merged_output_path)
merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
if merged_result:
truncate_files.append(merged_result)
print(f"已生成合并文件: {merged_output_path}")
else:
truncate_files.append("") # 如果merged_result未生成添加空字符串
print("未生成合并文件,因为没有找到需要合并的 PDF 文件。")
else:
truncate_files.append("") # 如果没有文件需要合并,也添加空字符串
print(f"没有文件需要合并 for {input_path}")
return truncate_files
def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name):
"""
合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件
@ -336,16 +354,19 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
- truncate_files (list): 包含 PDF 文件路径的列表
- output_path (str): 合并后的 PDF 文件保存路径
- base_file_name (str): 用于匹配文件名的基础名称
返回:
- str: 合并后的 PDF 文件路径如果没有合并则返回 ""
"""
# 1. 获取 output_folder 中所有文件
try:
all_output_files = os.listdir(output_folder)
except FileNotFoundError:
print(f"输出文件夹 '{output_folder}' 未找到。")
return
return ""
except PermissionError:
print(f"没有权限访问输出文件夹 '{output_folder}'")
return
return ""
# 2. 定义要选择的文件后缀及合并顺序,包括 before 文件
desired_suffixes = [
@ -381,11 +402,24 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
if not all_pdfs_to_merge:
print("没有找到要合并的 PDF 文件。")
return
return ""
# 过滤掉不存在或为空的文件路径
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)]
if not all_pdfs_to_merge:
print("没有有效的 PDF 文件需要合并。")
return ""
# 调用 merge_pdfs 函数进行合并
try:
merge_pdfs(all_pdfs_to_merge, output_path)
print(f"已成功合并 PDF 文件到 '{output_path}'")
return output_path
except Exception as e:
print(f"合并 PDF 文件时出错: {e}")
return ""
def truncate_pdf_specific_engineering(pdf_path, output_folder,selections):
"""
@ -427,14 +461,13 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder,selections):
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下比如说就不进行跳转见xx表 开评定标这里也要考虑 如果评分表为空,也要处理。
if __name__ == "__main__":
    # Manual smoke test: split one sample tender PDF into all of its sections.
    # Alternative inputs kept for quick switching:
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test"
    files = truncate_pdf_multiple(input_path, output_folder)

    # Alternative: extract only the listed selections (here 5 and 1).
    # selections = [5, 1]
    # files = truncate_pdf_specific_engineering(input_path, output_folder, selections)
    print(files)

    # Alternative: extract a single section. Selection legend:
    # 1 - bidder-notice front table, 2 - evaluation method,
    # 3 - bidder-notice main text, 4 - qualification review conditions,
    # 5 - tender announcement, 6 - invalid bids (7 is not described in this legend).
    # selection = 7
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("生成的文件:", generated_files)

View File

@ -230,14 +230,14 @@ def convert_clause_to_json(input_path,output_folder,type=1):
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice.pdf'
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_tobidders_notice.pdf'
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件一中多媒体报告厅教学设备_tobidders_notice_part1.pdf'
# start_word = "投标人须知正文"
# end_phrases = [
# r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
# r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
# ]
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\special_output'
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test'
try:
output_path = convert_clause_to_json(file_path,output_folder)
print(f"Final JSON result saved to: {output_path}")

View File

@ -12,7 +12,7 @@ from flask_app.main.无效标和废标和禁止投标整合 import combine_find_
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
import concurrent.futures
from flask_app.main.基础信息整合 import combine_basic_info
from flask_app.main.资格审查模块 import combine_review_standards
from flask_app.main.资格审查模块old import combine_review_standards
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
from flask_app.main.format_change import pdf2docx, docx2pdf
from flask_app.main.docx截取docx import copy_docx

View File

@ -148,8 +148,8 @@ if __name__ == "__main__":
# entries_with_numbers=[{'xxx': '3.7.33'},{'xxssx': '10.1'},{'xsssxx': '3.7.45'},{'a': '1.3.1'},{'b': '1.3.2'},{'c': '1.3.3'},{'d': '3.3.1'}]
entries_with_numbers = [ {'xxssx': '1.3.1'}
]
primary_json_path = 'D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\truncate_output.json'
secondary_json_path = 'D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\clause1.json'
primary_json_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\ee2d8828-bae0-465a-9171-7b2dd7453251'
secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\ee2d8828-bae0-465a-9171-7b2dd7453251'
# Since this is just a test block, make sure these paths point to actual JSON files with the appropriate structure
try:

View File

@ -16,7 +16,7 @@ def clean_page_content(text, common_header):
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
return text
def extract_common_header(pdf_path):
from PyPDF2 import PdfReader
@ -153,7 +153,7 @@ if __name__ == '__main__':
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf'
file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
# ress = extract_common_header(file_path)
# print(ress)
res=extract_text_by_page(file_path)

View File

@ -1,42 +1,44 @@
import json
import os
import time
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json
from flask_app.main.形式响应评审 import process_reviews
from flask_app.main.资格评审old import process_qualification
from flask_app.main.资格评审 import process_qualification
from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
def combine_review_standards(truncate1, truncate3, output_folder, truncate0_jsonpath, clause_path,
                             invalid_path, merged_baseinfo_path):
    """Build the combined review-standards result for a tender document.

    Uploads *truncate1* (the evaluation-method front table) with
    ``upload_file``, asks ``qianwen_long`` for the formal, responsiveness and
    qualification review standards as JSON, then runs
    ``process_qualification`` and ``process_reviews`` concurrently and merges
    their results.

    Args:
        truncate1: File passed to ``upload_file``.
        truncate3: Forwarded to ``process_qualification``.
        output_folder: Forwarded to ``process_reviews``.
        truncate0_jsonpath: Forwarded to ``process_reviews``.
        clause_path: Forwarded to ``process_reviews``.
        invalid_path: Forwarded to ``process_qualification``.
        merged_baseinfo_path: Forwarded to ``process_qualification``.

    Returns:
        dict: ``{"资格审查": merged}`` where ``merged`` is the
        ``process_reviews`` result updated with the
        ``process_qualification`` result.
    """
    # Formal / responsiveness review: ask the long-context model.
    file_id = upload_file(truncate1)
    user_query_1 = "根据该文档中的评标办法前附表请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准请以json格式返回外层键名为'形式评审标准''响应性评审标准''资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
    results = qianwen_long(file_id, user_query_1)
    original_dict_data = extract_content_from_json(results)

    # Split off the qualification part. Default to an empty dict so that a
    # missing key never reaches process_qualification as a placeholder string.
    qualification_review = original_dict_data.pop('资格评审标准', {})

    with ThreadPoolExecutor() as executor:
        future_qualification = executor.submit(
            process_qualification, qualification_review, truncate3, invalid_path, merged_baseinfo_path)
        future_form_response = executor.submit(
            process_reviews, original_dict_data, output_folder, truncate0_jsonpath, clause_path)
        # Block until both workers have finished.
        final_qualify_json = future_qualification.result()
        form_response_dict = future_form_response.result()

    form_response_dict.update(final_qualify_json)
    return {"资格审查": form_response_dict}
if __name__ == "__main__":
    # Manual run of combine_review_standards against pre-extracted sample files.
    start_time = time.time()
    truncate1 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_evaluation_method.pdf"
    truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf"
    output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\clause1.json"
    truncate0_jsonpath = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\truncate_output.json"
    invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf"
    merged_baseinfo_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf"

    # Single call using the current seven-argument signature; the earlier
    # knowledge_name / input_file based call no longer matches it.
    res = combine_review_standards(truncate1, truncate3, output_folder, truncate0_jsonpath, clause_path,
                                   invalid_path, merged_baseinfo_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.time()
    print("elapsed time:" + str(end_time - start_time))

View File

@ -0,0 +1,42 @@
import os
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json
from flask_app.main.形式响应评审old import process_reviews
from flask_app.main.资格评审old import process_qualification
from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder):
    """Legacy variant: build the combined review-standards result.

    Uploads *truncate1* (the evaluation-method front table), queries
    ``qianwen_long`` for the formal, responsiveness and qualification review
    standards, then runs ``process_qualification`` and ``process_reviews``
    on a thread pool and merges both results.

    Returns:
        dict: ``{"资格审查": merged}`` where ``merged`` is the
        ``process_reviews`` result updated with the
        ``process_qualification`` result.
    """
    # Formal / responsiveness review is answered by the long-context model.
    uploaded_id = upload_file(truncate1)
    prompt = "根据该文档中的评标办法前附表请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准请以json格式返回外层键名为'形式评审标准''响应性评审标准''资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
    parsed = extract_content_from_json(qianwen_long(uploaded_id, prompt))

    # Detach the qualification-review part from the model answer.
    qualification_part = parsed.pop('资格评审标准', '默认值或None')

    with ThreadPoolExecutor() as pool:
        qualification_job = pool.submit(process_qualification, qualification_part, truncate3, knowledge_name)
        form_response_job = pool.submit(
            process_reviews, parsed, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder)
        # Wait for both workers.
        qualification_result = qualification_job.result()
        merged = form_response_job.result()

    merged.update(qualification_result)
    return {"资格审查": merged}
if __name__ == "__main__":
    # Manual run of the legacy pipeline against one processed output folder.
    knowledge_name = "zbtest20"
    output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21"
    input_file = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf"

    # Section files located inside the output folder.
    # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
    truncate1 = os.path.join(output_folder, "ztbfile_evaluation_method.pdf")
    truncate2 = os.path.join(output_folder, "ztbfile_tobidders_notice.pdf")
    truncate3 = os.path.join(output_folder, "ztbfile_qualification.pdf")
    truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")

    # Convert the bidder-notice file to a clause JSON, then run the review step.
    clause_path = convert_clause_to_json(truncate2, output_folder)
    res = combine_review_standards(
        truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder)
    print(res)

View File

@ -8,6 +8,7 @@ from flask_app.main.多线程提问 import multi_threading, read_questions_from_
from flask_app.main.通义千问long import upload_file, qianwen_long
from flask_app.main.截取pdf import truncate_pdf_main
# 这个函数的主要用途是将多个相关的字典(都包含 'common_key' 键)合并成一个更大的、综合的字典,所有相关信息都集中在 'common_key' 键下
def merge_dictionaries_under_common_key(dicts, common_key):
"""
@ -102,10 +103,21 @@ def get_consortium_dict(merged_baseinfo_path):
consortium_dict = clean_json_string(results1)
return consortium_dict
def get_all_dict(invalid_path):
def get_all_dict(invalid_path, ques=None):
if ques is None:
ques = []
# qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt'
qualification_review_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\资格评审.txt'
questions = read_questions_from_file(qualification_review_file_path)
# 确保 questions 和 ques 都是列表
if not isinstance(questions, list):
print("读取的问题不是一个列表。")
return {'资格评审': None}
if not isinstance(ques, list):
print("传入的额外问题不是一个列表。")
return {'资格评审': None}
questions += ques
file_id = upload_file(invalid_path)
res1 = multi_threading(questions, "", file_id, 2)
qualification_list = [clean_json_string(res) for _, res in res1] if res1 else []
@ -161,16 +173,18 @@ def process_qualification(qualification_review, truncate3, invalid_path,merged_b
print("资格评审: type4")
target = ["资质", "业绩", "财务", "信誉", "人员", "项目经理", "负责人", "联合体"]
# 检查 matching_keys_list 是否包含 target 中的任何元素
if not any(any(t in key for t in target) for key in matching_keys_list):
# 如果不包含,使用 generate_qual_question 模板生成问题
question_template = "该招标文件中{key}的内容是怎样的请你以json格式返回结果键名为{key},若存在嵌套内容,嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致,不要擅自总结、删减。"
questions = [question_template.format(key=key) for key in target]
# 找出不在 target 中的 keys
non_target_keys = [key for key in matching_keys_list if not any(t in key for t in target)]
if non_target_keys:
# 如果有不在 target 中的 keys使用它们构建问题
question_template = "该招标文件中{key}的内容是怎样的请你以json格式返回结果键名为'{key}',若存在嵌套内容,嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致,不要擅自总结、删减。"
questions = [question_template.format(key=key) for key in non_target_keys]
print(questions)
# 假设这里有一个函数来处理这些问题并获取结果
# results = process_questions(questions)
# final_qualification = process_results(results)
final_qualification = get_all_dict(invalid_path, questions)
else:
# 如果包含,保持原有逻辑
print("所有键都在target中")
# 如果所有键都在 target 中,保持原有逻辑
final_qualification = get_all_dict(invalid_path)
final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)

View File

@ -60,7 +60,9 @@ def test_process_qualification_type4():
# Test inputs
qualification_review = {
'资质条件': '符合相关资质要求',
'财务状况': '符合财务健康标准'
'财务状况': '符合财务健康标准',
'联合体投标人(如有)':'xxxxx',
'分包':'符合附件规定'
}
truncate3 = "" # Missing qualification attachment
invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" # Not used in this case
@ -75,4 +77,27 @@ def test_process_qualification_type4():
# Execute the test
test_process_qualification_type4()
# test_process_qualification_type4()
def test_process_qualification_type5():
    """Manually exercise ``process_qualification`` with a qualification attachment supplied.

    Prints the result as indented JSON; nothing is asserted.
    """
    # Inputs: review items as they would come from the evaluation table.
    review_items = {
        '资质条件': '符合相关资质要求',
        '财务状况': '符合财务健康标准',
        '营业执照': '具备有效的营业执照',
    }
    # Qualification attachment is provided in this scenario.
    qualification_pdf = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf"
    invalid_pdf = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf"
    merged_baseinfo_pdf = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf"

    outcome = process_qualification(review_items, qualification_pdf, invalid_pdf, merged_baseinfo_pdf)

    print("Test Case 5 - Type5 Output:")
    print(json.dumps(outcome, ensure_ascii=False, indent=4))
# Execute the test
test_process_qualification_type5()

View File

@ -16,7 +16,7 @@ def clean_page_content(text, common_header):
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
return text
@ -133,25 +133,21 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
file_path = os.path.join(input_path, file_name)
if is_pdf_or_doc(file_path):
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if result:
if isinstance(result, tuple):
generated_files.extend([f if f else "" for f in result])
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
else:
generated_files.append(result if result else "")
generated_files.append(result) # 直接添加result可能是空字符串
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if result:
if isinstance(result, tuple):
generated_files.extend([f if f else "" for f in result])
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
else:
generated_files.append(result if result else "")
generated_files.append(result) # 直接添加result可能是空字符串
else:
print("提供的路径既不是文件夹也不是PDF文件。")
return generated_files
# 默认逻辑是start_page匹配上就不再设置了一般不匹配上目录的原因是设置了begin_page=5但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
output_suffix="normal"):
@ -383,7 +379,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
else:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix,
)
except Exception as e:
@ -617,15 +612,16 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
return truncate_files
# TODO:交通智能系统和招标(1)(1)文件有问题 input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\2275fd59-5f5b-48f9-8c68-31d3fde7f927\\ztbfile.pdf"这种截取失败的时候没有生成默认路径,报错
# TODO:交通智能系统和招标(1)(1)文件有问题 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标
if __name__ == "__main__":
    # Manual smoke test: split one goods-tender PDF into all of its sections.
    # Alternative inputs kept for quick switching:
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zboutpub"
    # Only the assignments that are actually used remain; the earlier pair
    # was overwritten before any call read it.
    input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
    files = truncate_pdf_multiple(input_path, output_folder)
    # files=truncate_pdf_specific_goods(input_path,output_folder)
    print(files)

    # Alternative: extract a single section. Selection legend:
    # 1 - business/technical/service requirements, 2 - evaluation method,
    # 3 - qualification review (output suffix qualification1 or qualification2),
    # 4 - bidder-notice front table (part1) and main text (part2), 5 - announcement.
    # selection = 1
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print(generated_files)

View File

@ -181,44 +181,27 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
logger.error(f"Error processing {key}: {exc}")
yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
# # 只有在需要 knowledge_name 和 index 时才等待 future_knowledge 完成
# try:
# knowledge_name = "招标解析" + unique_id
# index = processed_data['knowledge_future'].result() # 阻塞等待知识库上传任务完成
#
# # 提交依赖 knowledge_name 和 index 的任务
# future_dependencies = {
# 'base_info': executor.submit(fetch_project_basic_info,processed_data['merged_baseinfo_path'], processed_data['procurement_path']),
# 'qualification_review': executor.submit(fetch_qualification_review, output_folder,
# processed_data['qualification_path'],
# processed_data['notice_path'], processed_data['merged_baseinfo_path']),
# }
# # 按完成顺序返回依赖任务的结果
# for future in concurrent.futures.as_completed(future_dependencies.values()):
# key = next(k for k, v in future_dependencies.items() if v == future)
# try:
# result = future.result()
# yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
# except Exception as exc:
# logger.error(f"Error processing {key}: {exc}")
# yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
#
# except Exception as e:
# logger.error(f"Error uploading to knowledge base: {e}")
# yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False)
# 删除知识索引
# deleteKnowledge(index)
#TODO:目前的无效标这块的键值都删去空格了,所有的键名都删去空格
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
if __name__ == "__main__":
    import time

    start_time = time.time()

    # Configure the logger used by the pipeline functions.
    unique_id = "uuidzyzy11"
    logger = get_global_logger(unique_id)

    output_folder = "flask_app/static/output/zytest1"
    file_type = 1  # 1: docx, 2: pdf, 3: other
    # NOTE(review): input_file ends in .pdf while file_type is 1 (docx) - confirm which is intended.
    input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf"

    # goods_bid_main is a generator function: calling it runs nothing until
    # it is iterated, so consume it here and print each yielded result.
    generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
    for output in generator:
        print(output)

    end_time = time.time()
    elapsed_time = end_time - start_time  # elapsed wall-clock seconds
    print(f"Function execution took {elapsed_time:.2f} seconds.")