9.6部分文件引用了招标公告中的内容,添加了对二次跳转的处理
This commit is contained in:
parent
cac8ea4136
commit
3f79900ed4
@ -1,15 +1,28 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
content="递交投标文件截止之日后120日内"
|
from PyPDF2 import PdfReader
|
||||||
def extract_content_after_special_chars(content):
    """Return the text that follows the first "ticked option" marker.

    The markers are the control characters U+0001 / U+0002 and the glyphs
    ``☑``, ``√`` and ``团`` (the last one is presumably an extraction
    artefact for a ticked box -- TODO confirm).  Everything after the first
    marker, up to but not including the next ``□`` (or the end of the
    string), is returned with surrounding whitespace stripped.

    Args:
        content: Text to scan.

    Returns:
        The stripped text after the first marker.  ``content`` itself is
        returned unchanged when it contains no marker or is not a string.
    """
    # Non-string input (e.g. None) cannot be scanned; hand it back untouched
    # instead of letting re.search raise TypeError.
    if not isinstance(content, str):
        return content

    pattern = r'[\x01\x02☑√团]([^□]+)'
    match = re.search(pattern, content)
    if match:
        return match.group(1).strip()
    return content
|
|
||||||
|
|
||||||
res=extract_content_after_special_chars(content)
|
|
||||||
|
def extract_common_header(pdf_path):
    """Guess the running page header shared by the first pages of a PDF.

    The first text line of each of the first three pages is collected and the
    whitespace-separated tokens present in all of those lines are returned,
    joined by single spaces.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The common tokens in the order they appear in the first page's
        header line, or ``""`` when fewer than two of the sampled pages
        yielded text or the lines share no token.
    """
    pdf_document = PdfReader(pdf_path)
    headers = []
    num_pages_to_read = 3  # number of leading pages to sample

    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
        page = pdf_document.pages[i]
        text = page.extract_text()
        if text:  # only pages that actually produced text
            first_line = text.strip().split('\n')[0]
            headers.append(first_line)

    if len(headers) < 2:
        return ""  # not enough pages to compare

    # Keep the token order of the first header.  Joining a plain set would
    # give an arbitrary order that can differ between runs, so the result
    # could not be relied on to match the text on the page.
    other_token_sets = [set(header.split()) for header in headers[1:]]
    common_tokens = [
        token
        for token in dict.fromkeys(headers[0].split())
        if all(token in tokens for tokens in other_token_sets)
    ]
    return ' '.join(common_tokens)
|
||||||
|
|
||||||
|
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
|
||||||
|
res=extract_common_header(input_path)
|
||||||
print(res)
|
print(res)
|
@ -81,8 +81,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
|
|||||||
# 调用大模型回答项目基础信息
|
# 调用大模型回答项目基础信息
|
||||||
print("starting基础信息...")
|
print("starting基础信息...")
|
||||||
baseinfo_list = []
|
baseinfo_list = []
|
||||||
# baseinfo_file_path='../static/提示词/前两章提问总结.txt'
|
baseinfo_file_path='../static/提示词/前两章提问总结.txt'
|
||||||
baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
|
# baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
|
||||||
questions = read_questions_from_file(baseinfo_file_path)
|
questions = read_questions_from_file(baseinfo_file_path)
|
||||||
res1 = multi_threading(questions, knowledge_name)
|
res1 = multi_threading(questions, knowledge_name)
|
||||||
for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
||||||
@ -97,8 +97,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
|
|||||||
# 判断是否分包、是否需要递交投标保证金等
|
# 判断是否分包、是否需要递交投标保证金等
|
||||||
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
|
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
|
||||||
baseinfo_list.append(merged)
|
baseinfo_list.append(merged)
|
||||||
# judge_file_path = '../static/提示词/是否相关问题.txt'
|
judge_file_path = '../static/提示词/是否相关问题.txt'
|
||||||
judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
|
# judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
|
||||||
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,8 +4,10 @@ import json
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
from flask_app.main.多线程提问 import multi_threading
|
from flask_app.main.多线程提问 import multi_threading
|
||||||
from flask_app.main.根据条款号整合json import process_and_merge_entries
|
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||||
from flask_app.main.json_utils import extract_content_from_json
|
from flask_app.main.json_utils import extract_content_from_json
|
||||||
|
from flask_app.main.截取pdf import truncate_pdf_main
|
||||||
|
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||||||
prompt = """
|
prompt = """
|
||||||
# 角色
|
# 角色
|
||||||
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
||||||
@ -28,7 +30,7 @@ prompt = """
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def update_json_data(original_data, updates, second_response_list):
|
def update_json_data(original_data, updates1, updates2,second_response_list):
|
||||||
"""
|
"""
|
||||||
根据提供的更新字典覆盖原始JSON数据中对应的键值,支持点分隔的键来表示嵌套结构。
|
根据提供的更新字典覆盖原始JSON数据中对应的键值,支持点分隔的键来表示嵌套结构。
|
||||||
参数:
|
参数:
|
||||||
@ -49,7 +51,9 @@ def update_json_data(original_data, updates, second_response_list):
|
|||||||
data[keys[-1]] = value
|
data[keys[-1]] = value
|
||||||
|
|
||||||
# 合并 updates 到 original_data 中
|
# 合并 updates 到 original_data 中
|
||||||
for key, value in updates.items():
|
for key, value in updates1.items():
|
||||||
|
recursive_update(original_data, key, value)
|
||||||
|
for key,value in updates2.items():
|
||||||
recursive_update(original_data, key, value)
|
recursive_update(original_data, key, value)
|
||||||
|
|
||||||
# 遍历 second_response_list 中的每个字典,并合并到 original_data 中
|
# 遍历 second_response_list 中的每个字典,并合并到 original_data 中
|
||||||
@ -59,8 +63,22 @@ def update_json_data(original_data, updates, second_response_list):
|
|||||||
|
|
||||||
return original_data
|
return original_data
|
||||||
|
|
||||||
|
def judge_second_jump(input_dict):
    """Split out entries whose value points at chapter one / the tender notice.

    Each entry whose string value contains ``招标公告`` or ``第一章`` is turned
    into a follow-up question and removed from ``input_dict``.

    Args:
        input_dict: Mapping of entry key to resolved content.  It is modified
            in place: referring entries are deleted.

    Returns:
        A ``(input_dict, formatted_questions)`` tuple: the same dict object
        without the referring entries, and one question string per removed
        entry.
    """
    formatted_questions = []
    keys_to_remove = []
    for key, value in input_dict.items():
        # Only plain strings can carry a textual reference.  A substring test
        # on a number raises TypeError, and on a dict/list it would test
        # keys/elements instead of text, so such values are left in place.
        if not isinstance(value, str):
            continue
        if '招标公告' in value or '第一章' in value:
            formatted_entry = f"关于'{key}',第一章招标公告中的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"
            formatted_questions.append(formatted_entry)
            keys_to_remove.append(key)  # deleted after the loop, not during iteration

    # Drop the entries that were converted into questions.
    for key in keys_to_remove:
        del input_dict[key]

    return input_dict, formatted_questions
|
||||||
|
|
||||||
def extract_matching_keys(json_data):
|
def extract_matching_keys(json_data):
|
||||||
|
#'投标人名称': '符合招标公告第3.2条规定'
|
||||||
# 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。
|
# 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。
|
||||||
if isinstance(json_data, str):
|
if isinstance(json_data, str):
|
||||||
data = json.loads(json_data)
|
data = json.loads(json_data)
|
||||||
@ -68,7 +86,7 @@ def extract_matching_keys(json_data):
|
|||||||
data = json_data
|
data = json_data
|
||||||
|
|
||||||
# 正则表达式匹配
|
# 正则表达式匹配
|
||||||
include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目")]
|
include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"第.*?条")]
|
||||||
additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
|
additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
|
||||||
exclude_patterns = ["投标文件格式", "权利义务", "技术标准","工程量清单"]
|
exclude_patterns = ["投标文件格式", "权利义务", "技术标准","工程量清单"]
|
||||||
additional_exclude_patterns = ["按要求", "按规定"]
|
additional_exclude_patterns = ["按要求", "按规定"]
|
||||||
@ -108,51 +126,67 @@ def extract_matching_keys(json_data):
|
|||||||
return final_matching
|
return final_matching
|
||||||
|
|
||||||
#TODO:如果要引用到招标公告中的内容,考虑提取 或者qianwen-long
|
#TODO:如果要引用到招标公告中的内容,考虑提取 或者qianwen-long
|
||||||
def _build_clause_question(key, value):
    """Build the question asking for the content that ``value`` refers to."""
    revised_value = value.replace('符合', '')
    return f"关于'{key}',{revised_value}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"


def reformat_questions(match_keys, input_path, output_folder):
    """Sort matched review entries into clause-number lookups and questions.

    An entry whose value contains a dotted clause number (e.g. ``3.7.4`` or
    ``3.7.4(5)``, with half-width or full-width brackets) is resolved by that
    number; every other entry becomes a question string.  Entries that
    mention ``招标公告`` or ``第一章`` are handled separately: the first such
    entry triggers ``truncate_pdf_main(input_path, output_folder, 5)`` and,
    if that returns files, ``convert_clause_to_json(..., 2)`` on the first
    one.

    Example element of ``match_keys``:
        {'形式评审标准.投标保证金': '符合第二章“投标人须知”第 3.4.1项规定'}

    Args:
        match_keys: List of single-item dicts ``{key: reference_text}``.
        input_path: Path passed to ``truncate_pdf_main``.
        output_folder: Folder passed to ``truncate_pdf_main`` and
            ``convert_clause_to_json``.

    Returns:
        A 4-tuple ``(entries_with_numbers, formatted_questions,
        entries_with_numbers2, clause2_path)``:
        ``entries_with_numbers`` holds ``{key: number}`` for ordinary
        references, ``formatted_questions`` the generated question strings,
        ``entries_with_numbers2`` holds ``{key: number}`` for chapter-one
        references, and ``clause2_path`` is the value returned by
        ``convert_clause_to_json`` (``""`` if it was never called).
    """
    entries_with_numbers = []
    entries_with_numbers2 = []  # entries pointing at chapter one / the tender notice
    formatted_questions = []
    clause2_path = ""

    # Dotted number with an optional bracketed sub-item.  Both bracket classes
    # list the half-width and the full-width (U+FF08 / U+FF09) character;
    # naming the ASCII bracket twice silently drops sub-items written as （5）.
    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[(\uff08](\d+)[)\uff09])?')

    generated_files = None
    file_processed = False  # the PDF is cut at most once per call

    for entry in match_keys:
        key, value = next(iter(entry.items()))
        match = pattern.search(value)
        # The result is always rendered with half-width brackets.
        num = None
        if match:
            num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")

        if '招标公告' in value or '第一章' in value:
            if not file_processed:
                generated_files = truncate_pdf_main(input_path, output_folder, 5)
                file_processed = True
                if generated_files:
                    clause2_path = convert_clause_to_json(generated_files[0], output_folder, 2)

            if match and generated_files:
                entries_with_numbers2.append({key: num})
            else:
                # No clause number, or no extracted section to look it up in.
                formatted_questions.append(_build_clause_question(key, value))
        elif match:
            entries_with_numbers.append({key: num})
        else:
            formatted_questions.append(_build_clause_question(key, value))

    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
|
||||||
|
|
||||||
|
|
||||||
def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path):
|
def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path,input_file,output_folder):
|
||||||
matched_keys = extract_matching_keys(original_dict_data) #[{'形式评审标准.投标文件签字盖章': '符合第二章“投标人须知”第 3.7.3(4)目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典
|
combined_results2={}
|
||||||
entries_with_numbers, formatted_questions = reformat_questions(matched_keys)
|
matched_keys = extract_matching_keys(original_dict_data) #[{'形式评审标准.投标文件签字盖章': '符合第一章“投标人须知”第 3.7.3(4)目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典
|
||||||
results_2 = multi_threading(formatted_questions, knowledge_name, True) #无序号的直接问大模型
|
entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder)
|
||||||
second_response_list = []
|
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath,
|
||||||
|
clause_json_path) # 调用根据条款号整合json.py
|
||||||
|
combined_results1, formatted_questions2 = judge_second_jump(combined_results)
|
||||||
|
if entries_with_numbers2: #跳转第一章招标公告
|
||||||
|
combined_results2=process_and_merge2(entries_with_numbers2,clause2_path)
|
||||||
|
|
||||||
|
results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name, True) #无序号的直接问大模型
|
||||||
|
first_response_list = []
|
||||||
for _, response in results_2:
|
for _, response in results_2:
|
||||||
try:
|
try:
|
||||||
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
||||||
temp = extract_content_from_json(response[1])
|
temp = extract_content_from_json(response[1])
|
||||||
second_response_list.append(temp)
|
first_response_list.append(temp)
|
||||||
else:
|
else:
|
||||||
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -160,18 +194,20 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
|
|||||||
|
|
||||||
# Assume JSON file paths are defined or configured correctly
|
# Assume JSON file paths are defined or configured correctly
|
||||||
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}]
|
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}]
|
||||||
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path) #调用根据条款号整合json.py
|
updated_json = update_json_data(original_dict_data, combined_results1, combined_results2,first_response_list)
|
||||||
updated_json = update_json_data(original_dict_data, combined_results, second_response_list)
|
|
||||||
return updated_json
|
return updated_json
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
start_time=time.time()
|
start_time=time.time()
|
||||||
knowledge_name="zbtest20"
|
knowledge_name="zbfile"
|
||||||
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\truncate_output.json"
|
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
|
||||||
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause.json"
|
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
|
||||||
original_dict_data={'形式评审标准': {'多标段投标': '符合第一章“招标公告”第 3.3款规定', '投标文件': '投标文件能正常打开', '投标人名称': '与营业执照、资质证书一致', '投标文件的签署': '符合第二章“投标人须知”第 3.6.3(5)目规定', '投标保证金': '符合第二章“投标人须知”规定', '投标文件的格式、内容': '符合第八章“投标文件格式”的格式规定、实质性内容齐全,关键字迹清晰可辨', '投标报价': '①一份投标文件应只有一个投标报价,未提交选择性报价; ②投标函中报价与监理服务费报价汇总表中的报价保持一致; ③投标人未提交调价函。', '联合体投标人(如有)': '提交联合体协议书,并明确联合体牵头人', '工程分包(如有)': '符合第二章“投标人须知”第 1.11款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第 1.3款的规定', '监理服务阶段': '符合第二章“投标人须知”第 1.3款的规定', '监理工作范围': '符合第二章“投标人须知”第 1.3款的规定', '投标报价': '①投标报价不高于最高投标限价(如果有); ②投标报价不低于成本价。', '监理服务期': '符合第二章“投标人须知”第 1.3款的规定', '投标有效期': '符合第二章“投标人须知”第 3.3.1项的规定', '投标保证金': '符合第二章“投标人须知”第 3.4.1项的规定', '权利义务': '符合或优于第四章“合同条款及格式”规定的权利义务', '技术标准': '符合第七章“技术标准”规定', '招标人不能接受的条件': '未附有招标人不能接受的条件', '重大偏差': '见第二章“投标人须知”第 1.12款规定'}}
|
original_dict_data={'形式评审标准': {'投标文件': '符合第一章规定', '投标人名称': '符合招标公告第3.2条规定', '投标文件签字': '符合第二章“投标人须知”第3.7.3(4)目规定', '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨', '联合体投标人': '提交联合体协议书,并明确联合体牵头人', '报价唯一': '只能有一个有效报价', '多标段投标': '符合第二章“投标人须知”第 10.1款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第1.3.1项规定', '工期': '符合第二章“投标人须知”第1.3.2项规定', '工程质量': '符合第二章“投标人须知”第1.3.3项规定'}}
|
||||||
formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path)
|
|
||||||
|
input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.pdf"
|
||||||
|
output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
|
||||||
|
formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path,input_file,output_folder)
|
||||||
data = json.dumps(formal_json, ensure_ascii=False, indent=4)
|
data = json.dumps(formal_json, ensure_ascii=False, indent=4)
|
||||||
end_time=time.time()
|
end_time=time.time()
|
||||||
elapsed_time = end_time - start_time
|
elapsed_time = end_time - start_time
|
||||||
|
@ -2,14 +2,39 @@ from PyPDF2 import PdfReader, PdfWriter
|
|||||||
import re # 导入正则表达式库
|
import re # 导入正则表达式库
|
||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
|
|
||||||
def extract_common_header(pdf_path):
    """Guess the running page header shared by the first pages of a PDF.

    The first text line of each of the first three pages is collected and the
    whitespace-separated tokens present in all of those lines are returned,
    joined by single spaces.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The common tokens in the order they appear in the first page's
        header line, or ``""`` when fewer than two of the sampled pages
        yielded text or the lines share no token.
    """
    pdf_document = PdfReader(pdf_path)
    headers = []
    num_pages_to_read = 3  # number of leading pages to sample

    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
        page = pdf_document.pages[i]
        text = page.extract_text()
        if text:  # only pages that actually produced text
            first_line = text.strip().split('\n')[0]
            headers.append(first_line)

    if len(headers) < 2:
        return ""  # not enough pages to compare

    # Keep the token order of the first header.  Joining a plain set would
    # give an arbitrary order that can differ between runs, so the
    # str.replace() in clean_page_content could not be relied on to match.
    other_token_sets = [set(header.split()) for header in headers[1:]]
    common_tokens = [
        token
        for token in dict.fromkeys(headers[0].split())
        if all(token in tokens for tokens in other_token_sets)
    ]
    return ' '.join(common_tokens)


def clean_page_content(text, common_header):
    """Strip the shared page header and page-number residue from page text.

    Args:
        text: Text extracted from one PDF page.
        common_header: Header string to remove (for example the value
            returned by ``extract_common_header``); falsy to skip this step.

    Returns:
        ``text`` with the first occurrence of ``common_header`` removed,
        followed by removal of a leading number, a trailing number and
        ``/<digits>`` sequences.
    """
    # Remove the shared header first.  It is assumed to sit at the start of
    # the page, so only the first occurrence is replaced.
    if common_header:
        cleaned_text = text.replace(common_header, '', 1)
    else:
        cleaned_text = text

    # Leading page number, only when a non-digit character follows it.
    cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', cleaned_text)
    # Trailing page number.
    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
    # Page numbers of the form "/129".
    # NOTE(review): this pattern is not anchored, so every "/<digits>" in the
    # page is removed (dates such as 2024/09/06 included) -- confirm that is
    # acceptable for the pattern searches done on the cleaned text.
    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)

    return cleaned_text


def clean_page_numbers(text):
    """Strip page-number residue from page text (no header removal).

    Kept as a thin wrapper for existing callers.  Delegating also replaces
    the former ``\\\\s*`` typo in the "/129" pattern, which required a literal
    backslash and therefore never matched an ordinary "/129" footer.
    """
    return clean_page_content(text, '')
|
||||||
|
|
||||||
|
|
||||||
@ -39,18 +64,19 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
|
|||||||
print("提供的页码范围无效。")
|
print("提供的页码范围无效。")
|
||||||
return output_pdf_path
|
return output_pdf_path
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
||||||
|
common_header = extract_common_header(pdf_path)
|
||||||
last_begin_index = 0
|
last_begin_index = 0
|
||||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
|
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
for i, page in enumerate(pdf_document.pages):
|
for i, page in enumerate(pdf_document.pages):
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
if text:
|
if text:
|
||||||
cleaned_text = clean_page_numbers(text)
|
cleaned_text = clean_page_content(text,common_header)
|
||||||
# 检查“第一章”开始的位置
|
# 检查“第一章”开始的位置
|
||||||
if begin_pattern.search(cleaned_text):
|
if begin_pattern.search(cleaned_text):
|
||||||
last_begin_index = i # 更新最后匹配的索引,页码从0开始
|
last_begin_index = i # 更新最后匹配的索引,页码从0开始
|
||||||
print(last_begin_index)
|
|
||||||
if output_suffix == "qualification":
|
if output_suffix == "qualification":
|
||||||
common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])'
|
common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])'
|
||||||
# end_pattern = r'^(第[一二三四五六七八九十]+章\s*投标人须知|评标办法|评标办法前附表)'
|
# end_pattern = r'^(第[一二三四五六七八九十]+章\s*投标人须知|评标办法|评标办法前附表)'
|
||||||
@ -66,7 +92,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
|||||||
for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
|
for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
if text:
|
if text:
|
||||||
cleaned_text = clean_page_numbers(text)
|
cleaned_text = clean_page_content(text,common_header)
|
||||||
# 确定起始页,需在last_begin_index之后
|
# 确定起始页,需在last_begin_index之后
|
||||||
if ("资格审查" in cleaned_text or "资质条件" in cleaned_text):
|
if ("资格审查" in cleaned_text or "资质条件" in cleaned_text):
|
||||||
if re.search(common_pattern, cleaned_text, re.MULTILINE):
|
if re.search(common_pattern, cleaned_text, re.MULTILINE):
|
||||||
@ -94,6 +120,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
|||||||
|
|
||||||
|
|
||||||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||||
|
common_header=extract_common_header(pdf_path)
|
||||||
# 打开PDF文件
|
# 打开PDF文件
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
start_page = None
|
start_page = None
|
||||||
@ -103,10 +130,10 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
page = pdf_document.pages[i]
|
page = pdf_document.pages[i]
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
if text:
|
if text:
|
||||||
cleaned_text = clean_page_numbers(text)
|
cleaned_text = clean_page_content(text,common_header)
|
||||||
# print(cleaned_text)
|
# print(cleaned_text)
|
||||||
if re.search(begin_pattern, cleaned_text) and i > begin_page:
|
if re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||||
if output_suffix == "invalid" and start_page: #仅当提取Invalid的时候,判断初始页码是第一个匹配到的页码,因为招标编号可能存在多个,后面的覆盖前面
|
if output_suffix == "invalid" and start_page: # 仅当提取Invalid的时候,判断初始页码是第一个匹配到的页码,因为招标编号可能存在多个,后面的覆盖前面
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
start_page = i
|
start_page = i
|
||||||
@ -117,14 +144,14 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
else:
|
else:
|
||||||
condition = i > (start_page + 1)
|
condition = i > (start_page + 1)
|
||||||
if condition:
|
if condition:
|
||||||
is_invalid_condition = output_suffix == "invalid" and i > 30 #这边默认无效投标至少有30页
|
is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页
|
||||||
if is_invalid_condition or output_suffix != "invalid":
|
if is_invalid_condition or output_suffix != "invalid":
|
||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
|
|
||||||
# 确保找到了起始和结束页面
|
# 确保找到了起始和结束页面
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
if output_suffix == "qualification" or "invalid":
|
if output_suffix == "qualification" or output_suffix =="invalid":
|
||||||
extract_pages_twice(pdf_path, output_folder, output_suffix)
|
extract_pages_twice(pdf_path, output_folder, output_suffix)
|
||||||
else:
|
else:
|
||||||
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
@ -132,6 +159,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
else:
|
else:
|
||||||
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
||||||
|
|
||||||
|
|
||||||
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||||
# 确保输出文件夹存在
|
# 确保输出文件夹存在
|
||||||
if not os.path.exists(output_folder):
|
if not os.path.exists(output_folder):
|
||||||
@ -142,13 +170,15 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
|||||||
for file in os.listdir(input_path):
|
for file in os.listdir(input_path):
|
||||||
if file.endswith(".pdf"):
|
if file.endswith(".pdf"):
|
||||||
pdf_path = os.path.join(input_path, file)
|
pdf_path = os.path.join(input_path, file)
|
||||||
output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern,
|
||||||
|
output_suffix)
|
||||||
if output_pdf_path and os.path.isfile(output_pdf_path):
|
if output_pdf_path and os.path.isfile(output_pdf_path):
|
||||||
generated_files.append(output_pdf_path)
|
generated_files.append(output_pdf_path)
|
||||||
return generated_files
|
return generated_files
|
||||||
elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
|
elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
|
||||||
# 处理单个PDF文件
|
# 处理单个PDF文件
|
||||||
output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern,
|
||||||
|
output_suffix)
|
||||||
if output_pdf_path and os.path.isfile(output_pdf_path):
|
if output_pdf_path and os.path.isfile(output_pdf_path):
|
||||||
return [output_pdf_path] # 以列表形式返回,以保持一致性
|
return [output_pdf_path] # 以列表形式返回,以保持一致性
|
||||||
else:
|
else:
|
||||||
@ -165,7 +195,8 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
output_suffix = "tobidders_notice_table"
|
output_suffix = "tobidders_notice_table"
|
||||||
elif selection == 2:
|
elif selection == 2:
|
||||||
# Configure patterns and phrases for "评标办法"
|
# Configure patterns and phrases for "评标办法"
|
||||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*评标办法') #考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法'
|
begin_pattern = re.compile(
|
||||||
|
r'第[一二三四五六七八九十]+章\s*评标办法') # 考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法'
|
||||||
begin_page = 10
|
begin_page = 10
|
||||||
end_pattern = re.compile(r'评标办法正文|评标办法')
|
end_pattern = re.compile(r'评标办法正文|评标办法')
|
||||||
output_suffix = "evaluation_method"
|
output_suffix = "evaluation_method"
|
||||||
@ -173,9 +204,11 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
# Configure patterns and phrases for "投标人须知正文"
|
# Configure patterns and phrases for "投标人须知正文"
|
||||||
begin_pattern = re.compile(r'投标人须知正文')
|
begin_pattern = re.compile(r'投标人须知正文')
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
end_pattern = re.compile(r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', re.MULTILINE)
|
end_pattern = re.compile(
|
||||||
|
r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||||
|
re.MULTILINE)
|
||||||
output_suffix = "tobidders_notice"
|
output_suffix = "tobidders_notice"
|
||||||
elif selection==4:
|
elif selection == 4:
|
||||||
# 配置用于 "资格审查条件" 的正则表达式模式和短语
|
# 配置用于 "资格审查条件" 的正则表达式模式和短语
|
||||||
common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])'
|
common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])'
|
||||||
begin_pattern = re.compile(common_pattern + r'.*(?:资质|能力|信誉).*$', re.MULTILINE)
|
begin_pattern = re.compile(common_pattern + r'.*(?:资质|能力|信誉).*$', re.MULTILINE)
|
||||||
@ -187,6 +220,12 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
)
|
)
|
||||||
output_suffix = "qualification"
|
output_suffix = "qualification"
|
||||||
elif selection == 5:
|
elif selection == 5:
|
||||||
|
# 配置用于 "招标公告" 的正则表达式模式和短语
|
||||||
|
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
|
||||||
|
begin_page = 0
|
||||||
|
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE)
|
||||||
|
output_suffix = "notice"
|
||||||
|
elif selection == 6:
|
||||||
# 配置用于 "无效标" 的正则表达式模式和短语
|
# 配置用于 "无效标" 的正则表达式模式和短语
|
||||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
|
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
|
||||||
begin_page = 0
|
begin_page = 0
|
||||||
@ -199,6 +238,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
# Process the selected input
|
# Process the selected input
|
||||||
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||||
|
|
||||||
|
|
||||||
def truncate_pdf_multiple(input_path, output_folder):
|
def truncate_pdf_multiple(input_path, output_folder):
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
for selection in range(1, 5):
|
for selection in range(1, 5):
|
||||||
@ -206,11 +246,12 @@ def truncate_pdf_multiple(input_path, output_folder):
|
|||||||
truncate_files.extend(files)
|
truncate_files.extend(files)
|
||||||
return truncate_files
|
return truncate_files
|
||||||
|
|
||||||
#TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
|
||||||
|
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标02.pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest18.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
|
||||||
# truncate_pdf_multiple(input_path,output_folder)
|
# truncate_pdf_multiple(input_path,output_folder)
|
||||||
selection = 4 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-无效标
|
selection = 5 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
# # print("生成的文件:", generated_files)
|
# # print("生成的文件:", generated_files)
|
||||||
|
@ -26,11 +26,11 @@ def extract_text_from_pdf(file_path):
|
|||||||
text += page_text
|
text += page_text
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def extract_section(text, start_keyword, end_phrases):
|
def extract_section(text, start_pattern, end_phrases):
|
||||||
start_index = text.find(start_keyword)
|
start_match = re.search(start_pattern, text)
|
||||||
if start_index == -1:
|
if not start_match:
|
||||||
return ""
|
return "" # 如果没有找到匹配的开始模式,返回空字符串
|
||||||
|
start_index = start_match.start()
|
||||||
end_index = len(text)
|
end_index = len(text)
|
||||||
for phrase in end_phrases:
|
for phrase in end_phrases:
|
||||||
# Use multiline mode with `re.MULTILINE`
|
# Use multiline mode with `re.MULTILINE`
|
||||||
@ -124,26 +124,31 @@ def convert_to_json(file_path, start_word, end_phrases):
|
|||||||
parsed_data = parse_text_by_heading(text)
|
parsed_data = parse_text_by_heading(text)
|
||||||
return parsed_data
|
return parsed_data
|
||||||
|
|
||||||
def convert_clause_to_json(input_path, output_folder, type=1):
    """Extract one section of a PDF via ``convert_to_json`` and save it as JSON.

    Args:
        input_path: PDF path passed to ``convert_to_json``.
        output_folder: Folder in which the JSON file is written.
        type: ``1`` selects the "投标人须知正文" section and writes
            ``clause1.json``; any other value selects the tender-notice
            section and writes ``clause2.json``.  (The name shadows the
            builtin but is kept so existing callers keep working.)

    Returns:
        Path of the JSON file that was written.
    """
    if type == 1:
        start_word = "投标人须知正文"
        # Each appendix heading is matched with either a half-width or a
        # full-width (U+FF1A) colon.  Listing the ASCII-colon pattern twice
        # never matched headings written with the full-width colon.
        end_phrases = [
            r'^第[一二三四五六七八九十]+章\s+评标办法',
            r'^评标办法前附表',
            r'^附录(?:一)?[:\uff1a]',
            r'^附件(?:一)?[:\uff1a]',
            r'^附表(?:一)?[:\uff1a]',
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号[:\uff1a]'
        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']

    result = convert_to_json(input_path, start_word, end_phrases)

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    return output_path
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66\\ztbfile_invalid.pdf'
|
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest16_tobidders_notice.pdf'
|
||||||
start_word = "投标人须知正文"
|
start_word = "投标人须知正文"
|
||||||
end_phrases = [
|
end_phrases = [
|
||||||
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
||||||
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
||||||
]
|
]
|
||||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66'
|
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
|
||||||
try:
|
try:
|
||||||
output_path = convert_clause_to_json(file_path,output_folder)
|
output_path = convert_clause_to_json(file_path,output_folder)
|
||||||
print(f"Final JSON result saved to: {output_path}")
|
print(f"Final JSON result saved to: {output_path}")
|
||||||
|
File diff suppressed because one or more lines are too long
@ -1,23 +1,12 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def extract_content_after_special_chars(content):
|
|
||||||
"""
|
|
||||||
提取特定符号后的内容,直到遇到结束符号。
|
|
||||||
"""
|
|
||||||
pattern = r'[\x01\x02☑√团]([^□]+)'
|
|
||||||
match = re.search(pattern, content)
|
|
||||||
if match:
|
|
||||||
return match.group(1).strip() # 提取匹配的内容,并去除多余空格
|
|
||||||
return content # 如果没有找到匹配,返回原内容
|
|
||||||
|
|
||||||
def load_json(file_path):
    """Read a UTF-8 JSON file and return its bracket-normalized content.

    The parsed data is passed through ``standardize_brackets_in_json``; per
    the original docstring this unifies brackets to their full-width form.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)  # presumably a dict -- verify against callers
    normalized = standardize_brackets_in_json(parsed)
    return normalized
|
||||||
|
|
||||||
def standardize_brackets_in_json(data):
|
def standardize_brackets_in_json(data):
|
||||||
@ -146,10 +135,17 @@ def process_and_merge_entries(entries_with_numbers, primary_json_path, secondary
|
|||||||
combined_results = find_entries_in_jsons(entries_with_numbers, primary_json_data, secondary_json_data)
|
combined_results = find_entries_in_jsons(entries_with_numbers, primary_json_data, secondary_json_data)
|
||||||
return combined_results
|
return combined_results
|
||||||
|
|
||||||
|
def process_and_merge2(entries_with_numbers,json_path):
    """Look up entries in a single JSON file.

    Loads ``json_path`` with ``load_json`` and calls ``find_entries_in_jsons``
    with that data as the primary source and an empty dict as the secondary
    source.

    Args:
        entries_with_numbers: Entries forwarded unchanged to
            ``find_entries_in_jsons``.
        json_path: Path of the JSON file to search.

    Returns:
        Whatever ``find_entries_in_jsons`` returns for these inputs.
    """
    primary_data = load_json(json_path)
    # No secondary source in this variant, so an empty mapping stands in.
    return find_entries_in_jsons(entries_with_numbers, primary_data, {})
|
||||||
|
|
||||||
|
|
||||||
|
# FIXME: clause 3.4.1 (bid security, 投标保证金) -- the extraction end boundary is wrong.
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Hypothetical entries and file paths for testing
|
# Hypothetical entries and file paths for testing
|
||||||
# entries_with_numbers = [{'形式评审标准.投标文件签字盖章': '3.7.3(3)'}, {'形式评审标准.多标段投标': '10.1'}, {'形式评审标准.“技术暗标”': '3.7.4(5)'}, {'响应性评审标准.投标内容': '1.3.1'}, {'响应性评审标准.工期': '1.3.2'}, {'响应性评审标准.工程质量': '1.3.3'}, {'响应性评审标准.投标有效期': '3.3.1'}, {'响应性评审标准.投标保证金': '3.4.1'}, {'响应性评审标准.分包计划': '1.11'}]
|
# entries_with_numbers = [{'形式评审标准.投标文件签字盖章': '3.7.3(3)'}, {'形式评审标准.多标段投标': '10.1'}, {'形式评审标准.“技术暗标”': '3.7.4(5)'}, {'响应性评审标准.投标内容': '1.3.1'}, {'响应性评审标准.工期': '1.3.2'}, {'响应性评审标准.工程质量': '1.3.3'}, {'响应性评审标准.投标有效期': '3.3.1'}, {'响应性评审标准.投标保证金': '3.4.1'}, {'响应性评审标准.分包计划': '1.11'}]
|
||||||
entries_with_numbers=[{'xxx': '3.6.3(5)'}]
|
entries_with_numbers=[{'xxx': '4.2.3'}]
|
||||||
primary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\truncate_output.json'
|
primary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\truncate_output.json'
|
||||||
secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause.json'
|
secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause.json'
|
||||||
|
|
||||||
|
@ -125,12 +125,12 @@ def process_string_list(string_list):
|
|||||||
else:
|
else:
|
||||||
# 如果没有匹配到内容,返回空列表
|
# 如果没有匹配到内容,返回空列表
|
||||||
return []
|
return []
|
||||||
def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须知前附表 条款 评分前附表和资格审查表中
|
def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须知前附表 条款 评分前附表和资格审查表中
|
||||||
# output_filename="merged.pdf"
|
# output_filename="merged.pdf"
|
||||||
# paths=[truncate1,truncate4]
|
# paths=[truncate1,truncate4]
|
||||||
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
|
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
|
||||||
|
|
||||||
file_id=upload_file(truncate4)
|
file_id=upload_file(truncate3)
|
||||||
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
|
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
|
||||||
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
|
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
|
||||||
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
|
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
|
||||||
|
@ -8,7 +8,7 @@ from flask_app.main.通义千问long import upload_file, qianwen_long
|
|||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
|
||||||
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path): #评标办法前附表
|
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表
|
||||||
# 形式评审、响应评审:千问
|
# 形式评审、响应评审:千问
|
||||||
print("starting形式响应评审...")
|
print("starting形式响应评审...")
|
||||||
file_id=upload_file(truncate1) #评标办法前附表
|
file_id=upload_file(truncate1) #评标办法前附表
|
||||||
@ -21,7 +21,7 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
|
|||||||
# 创建Future对象
|
# 创建Future对象
|
||||||
future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
|
future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
|
||||||
future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath,
|
future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath,
|
||||||
clause_path)
|
clause_path,input_file,output_folder)
|
||||||
|
|
||||||
# 等待执行结果
|
# 等待执行结果
|
||||||
final_qualify_json = future_qualification.result()
|
final_qualify_json = future_qualification.result()
|
||||||
@ -32,6 +32,7 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
input_file="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
||||||
# truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
|
# truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
|
||||||
truncate2=os.path.join(output_folder,"zbtest20_tobidders_notice.pdf")
|
truncate2=os.path.join(output_folder,"zbtest20_tobidders_notice.pdf")
|
||||||
@ -40,5 +41,5 @@ if __name__ == "__main__":
|
|||||||
truncate3=os.path.join(output_folder,"zbtest20_qualification.pdf")
|
truncate3=os.path.join(output_folder,"zbtest20_qualification.pdf")
|
||||||
clause_path = convert_clause_to_json(truncate2, output_folder)
|
clause_path = convert_clause_to_json(truncate2, output_folder)
|
||||||
truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
|
truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
|
||||||
res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path)
|
res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder)
|
||||||
print(res)
|
print(res)
|
Loading…
x
Reference in New Issue
Block a user