9.6部分文件引用了招标公告中的内容,添加了对二次跳转的处理
This commit is contained in:
parent
cac8ea4136
commit
3f79900ed4
@ -1,15 +1,28 @@
|
||||
import re
|
||||
|
||||
content="递交投标文件截止之日后120日内"
|
||||
def extract_content_after_special_chars(content):
    """
    Return the text that follows a "checked" marker character.

    The first marker (control character U+0001 or U+0002, '☑', '√' or '团')
    that is followed by at least one character other than '□' is located;
    the run of non-'□' characters after it is returned with surrounding
    whitespace stripped. If no such marker exists, ``content`` is returned
    unchanged.
    """
    found = re.search(r'[\x01\x02☑√团]([^□]+)', content)
    if found is None:
        # Nothing to extract: hand the input back untouched.
        return content
    return found.group(1).strip()
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
res=extract_content_after_special_chars(content)
|
||||
|
||||
def extract_common_header(pdf_path):
    """
    Guess the running header shared by the first pages of a PDF.

    The first text line of up to three leading pages is collected, and the
    whitespace-separated tokens that occur in every one of those lines are
    joined with single spaces.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``PdfReader``.

    Returns:
        The shared tokens, each once, in the order they first appear in the
        first sampled header line. An empty string is returned when fewer
        than two of the sampled pages yield any text.
    """
    pdf_document = PdfReader(pdf_path)
    headers = []
    num_pages_to_read = 3  # number of leading pages to sample

    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
        text = pdf_document.pages[i].extract_text()
        if text:  # skip pages without extractable text
            headers.append(text.strip().split('\n')[0])

    if len(headers) < 2:
        return ""  # not enough pages to compare

    first_tokens = headers[0].split()
    shared = set(first_tokens).intersection(*(set(h.split()) for h in headers[1:]))

    # Joining the set directly would emit the tokens in arbitrary order
    # (str hashing is randomised per process), so a multi-word header could
    # come out scrambled and never match the page text again. Walk the first
    # header instead to keep the original left-to-right order.
    ordered = [token for token in dict.fromkeys(first_tokens) if token in shared]
    return ' '.join(ordered)
|
||||
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
|
||||
res=extract_common_header(input_path)
|
||||
print(res)
|
@ -81,8 +81,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
|
||||
# 调用大模型回答项目基础信息
|
||||
print("starting基础信息...")
|
||||
baseinfo_list = []
|
||||
# baseinfo_file_path='../static/提示词/前两章提问总结.txt'
|
||||
baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
|
||||
baseinfo_file_path='../static/提示词/前两章提问总结.txt'
|
||||
# baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
|
||||
questions = read_questions_from_file(baseinfo_file_path)
|
||||
res1 = multi_threading(questions, knowledge_name)
|
||||
for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
||||
@ -97,8 +97,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
|
||||
# 判断是否分包、是否需要递交投标保证金等
|
||||
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
|
||||
baseinfo_list.append(merged)
|
||||
# judge_file_path = '../static/提示词/是否相关问题.txt'
|
||||
judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
|
||||
judge_file_path = '../static/提示词/是否相关问题.txt'
|
||||
# judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
|
||||
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
||||
|
||||
|
||||
|
@ -4,8 +4,10 @@ import json
|
||||
import time
|
||||
|
||||
from flask_app.main.多线程提问 import multi_threading
|
||||
from flask_app.main.根据条款号整合json import process_and_merge_entries
|
||||
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.main.json_utils import extract_content_from_json
|
||||
from flask_app.main.截取pdf import truncate_pdf_main
|
||||
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||||
prompt = """
|
||||
# 角色
|
||||
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
||||
@ -28,7 +30,7 @@ prompt = """
|
||||
"""
|
||||
|
||||
|
||||
def update_json_data(original_data, updates, second_response_list):
|
||||
def update_json_data(original_data, updates1, updates2,second_response_list):
|
||||
"""
|
||||
根据提供的更新字典覆盖原始JSON数据中对应的键值,支持点分隔的键来表示嵌套结构。
|
||||
参数:
|
||||
@ -49,7 +51,9 @@ def update_json_data(original_data, updates, second_response_list):
|
||||
data[keys[-1]] = value
|
||||
|
||||
# 合并 updates 到 original_data 中
|
||||
for key, value in updates.items():
|
||||
for key, value in updates1.items():
|
||||
recursive_update(original_data, key, value)
|
||||
for key,value in updates2.items():
|
||||
recursive_update(original_data, key, value)
|
||||
|
||||
# 遍历 second_response_list 中的每个字典,并合并到 original_data 中
|
||||
@ -59,8 +63,22 @@ def update_json_data(original_data, updates, second_response_list):
|
||||
|
||||
return original_data
|
||||
|
||||
def judge_second_jump(input_dict):
    """
    Split off entries whose value points back to chapter one / the tender notice.

    Every key whose value contains '招标公告' or '第一章' is turned into a
    question string asking for the chapter-one content of that key, and the
    key is then deleted from ``input_dict`` (the dict is modified in place).

    Returns:
        A tuple ``(input_dict, formatted_questions)``: the same dict object
        without the redirected keys, and the generated questions in the
        dict's iteration order.
    """
    # Collect the keys first: a dict must not shrink while it is iterated.
    jump_keys = [
        name for name, text in input_dict.items()
        if '招标公告' in text or '第一章' in text
    ]

    formatted_questions = [
        f"关于'{name}',第一章招标公告中的内容是怎样的?请按json格式给我提供信息,键名为'{name}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"
        for name in jump_keys
    ]

    # Drop the entries that will be answered through the follow-up questions.
    for name in jump_keys:
        del input_dict[name]

    return input_dict, formatted_questions
|
||||
|
||||
def extract_matching_keys(json_data):
|
||||
#'投标人名称': '符合招标公告第3.2条规定'
|
||||
# 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。
|
||||
if isinstance(json_data, str):
|
||||
data = json.loads(json_data)
|
||||
@ -68,7 +86,7 @@ def extract_matching_keys(json_data):
|
||||
data = json_data
|
||||
|
||||
# 正则表达式匹配
|
||||
include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目")]
|
||||
include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"第.*?条")]
|
||||
additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
|
||||
exclude_patterns = ["投标文件格式", "权利义务", "技术标准","工程量清单"]
|
||||
additional_exclude_patterns = ["按要求", "按规定"]
|
||||
@ -108,51 +126,67 @@ def extract_matching_keys(json_data):
|
||||
return final_matching
|
||||
|
||||
#TODO:如果要引用到招标公告中的内容,考虑提取 或者qianwen-long
|
||||
def reformat_questions(match_keys):
|
||||
"""
|
||||
[{'形式评审标准.多标段投标': '符合第一章“招标公告”第 3.3款规定'}, {'形式评审标准.投标文件的签署': '符合第二章“投标人须知”第 3.6.3(5)目规定'}, {'形式评审标准.投标保证金': '符合第二章“投标人须知”第 3.4.1项规定'}, {'形式评审标准.工程分包(如有)': '符合第二章“投标人须知”第 1.11款规定'}, {'响应性评审标准.投标内容': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.监理服务阶段': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.监理工作范围': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.监理服务期': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.投标有效期': '符合第二章“投标人须知”第 3.3.1项的规定'}, {'响应性评审标准.投标保证金': '符合第二章“投标人须知”第 3.4.1项的规定'}, {'响应性评审标准.重大偏差': '见第二章“投标人须知”第 1.12款规定'}]
|
||||
"""
|
||||
"""
|
||||
根据是否包含特定序号格式(如3.7.4或3.7.4(5)或3.7.4(5)),重新格式化匹配到的评审条目。
|
||||
若包含序号,则提取出来;若不包含,则生成格式化的问题字符串。
|
||||
"""
|
||||
def reformat_questions(match_keys, input_path, output_folder):
|
||||
entries_with_numbers = []
|
||||
entries_with_numbers2 = [] # 用于保存特别处理的条目
|
||||
formatted_questions = []
|
||||
clause2_path = ""
|
||||
|
||||
# 正则表达式,同时匹配全角和半角括号
|
||||
pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\(](\d+)[\)\)])?') #识别包含数字序列的特定格式 eg:3.7.4(5) 3.4.1
|
||||
pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\(](\d+)[\)\)])?') # 识别包含数字序列的特定格式
|
||||
|
||||
# 初始化文件处理
|
||||
generated_files = None
|
||||
file_processed = False
|
||||
|
||||
for entry in match_keys:
|
||||
key, value = next(iter(entry.items()))
|
||||
if '招标公告' in value or '第一章' in value:
|
||||
formatted_entry = f"关于‘{key}’,{value.replace('符合', '')}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',如果存在未知信息,请在对应键值处填'未知'。"
|
||||
formatted_questions.append(formatted_entry)
|
||||
continue # 继续处理下一个条目
|
||||
|
||||
match = pattern.search(value)
|
||||
if match:
|
||||
# 如果存在序号,保存序号与对应的键值对,包括括号内的数字(如果存在)
|
||||
|
||||
if '招标公告' in value or '第一章' in value:
|
||||
if not file_processed:
|
||||
# 只处理一次文件操作
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, 5)
|
||||
file_processed = True # 设置标志位,避免重复处理文件
|
||||
if generated_files:
|
||||
clause2_path = convert_clause_to_json(generated_files[0], output_folder, 2)
|
||||
|
||||
if match and generated_files:
|
||||
# 提取序号并添加到entries_with_numbers2
|
||||
num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
|
||||
entries_with_numbers2.append({key: num})
|
||||
else:
|
||||
revised_value = value.replace('符合', '')
|
||||
formatted_entry = f"关于'{key}',{revised_value}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"
|
||||
formatted_questions.append(formatted_entry)
|
||||
elif match:
|
||||
num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
|
||||
entries_with_numbers.append({key: num})
|
||||
else:
|
||||
# 如果不存在序号,删除“符合”并格式化文本
|
||||
revised_standard = re.sub(r'符合', '', value)
|
||||
formatted_entry = f"关于‘{key}’,{revised_standard}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',如果存在未知信息,请在对应键值处填'未知'。"
|
||||
revised_value = value.replace('符合', '')
|
||||
formatted_entry = f"关于'{key}',{revised_value}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"
|
||||
formatted_questions.append(formatted_entry)
|
||||
print(formatted_questions)
|
||||
return entries_with_numbers, formatted_questions
|
||||
|
||||
return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
|
||||
|
||||
|
||||
def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path):
|
||||
matched_keys = extract_matching_keys(original_dict_data) #[{'形式评审标准.投标文件签字盖章': '符合第二章“投标人须知”第 3.7.3(4)目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典
|
||||
entries_with_numbers, formatted_questions = reformat_questions(matched_keys)
|
||||
results_2 = multi_threading(formatted_questions, knowledge_name, True) #无序号的直接问大模型
|
||||
second_response_list = []
|
||||
def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path,input_file,output_folder):
|
||||
combined_results2={}
|
||||
matched_keys = extract_matching_keys(original_dict_data) #[{'形式评审标准.投标文件签字盖章': '符合第一章“投标人须知”第 3.7.3(4)目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典
|
||||
entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder)
|
||||
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath,
|
||||
clause_json_path) # 调用根据条款号整合json.py
|
||||
combined_results1, formatted_questions2 = judge_second_jump(combined_results)
|
||||
if entries_with_numbers2: #跳转第一章招标公告
|
||||
combined_results2=process_and_merge2(entries_with_numbers2,clause2_path)
|
||||
|
||||
results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name, True) #无序号的直接问大模型
|
||||
first_response_list = []
|
||||
for _, response in results_2:
|
||||
try:
|
||||
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
||||
temp = extract_content_from_json(response[1])
|
||||
second_response_list.append(temp)
|
||||
first_response_list.append(temp)
|
||||
else:
|
||||
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
||||
except Exception as e:
|
||||
@ -160,18 +194,20 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
|
||||
|
||||
# Assume JSON file paths are defined or configured correctly
|
||||
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}]
|
||||
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path) #调用根据条款号整合json.py
|
||||
updated_json = update_json_data(original_dict_data, combined_results, second_response_list)
|
||||
updated_json = update_json_data(original_dict_data, combined_results1, combined_results2,first_response_list)
|
||||
return updated_json
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
start_time=time.time()
|
||||
knowledge_name="zbtest20"
|
||||
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\truncate_output.json"
|
||||
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause.json"
|
||||
original_dict_data={'形式评审标准': {'多标段投标': '符合第一章“招标公告”第 3.3款规定', '投标文件': '投标文件能正常打开', '投标人名称': '与营业执照、资质证书一致', '投标文件的签署': '符合第二章“投标人须知”第 3.6.3(5)目规定', '投标保证金': '符合第二章“投标人须知”规定', '投标文件的格式、内容': '符合第八章“投标文件格式”的格式规定、实质性内容齐全,关键字迹清晰可辨', '投标报价': '①一份投标文件应只有一个投标报价,未提交选择性报价; ②投标函中报价与监理服务费报价汇总表中的报价保持一致; ③投标人未提交调价函。', '联合体投标人(如有)': '提交联合体协议书,并明确联合体牵头人', '工程分包(如有)': '符合第二章“投标人须知”第 1.11款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第 1.3款的规定', '监理服务阶段': '符合第二章“投标人须知”第 1.3款的规定', '监理工作范围': '符合第二章“投标人须知”第 1.3款的规定', '投标报价': '①投标报价不高于最高投标限价(如果有); ②投标报价不低于成本价。', '监理服务期': '符合第二章“投标人须知”第 1.3款的规定', '投标有效期': '符合第二章“投标人须知”第 3.3.1项的规定', '投标保证金': '符合第二章“投标人须知”第 3.4.1项的规定', '权利义务': '符合或优于第四章“合同条款及格式”规定的权利义务', '技术标准': '符合第七章“技术标准”规定', '招标人不能接受的条件': '未附有招标人不能接受的条件', '重大偏差': '见第二章“投标人须知”第 1.12款规定'}}
|
||||
formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path)
|
||||
knowledge_name="zbfile"
|
||||
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
|
||||
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
|
||||
original_dict_data={'形式评审标准': {'投标文件': '符合第一章规定', '投标人名称': '符合招标公告第3.2条规定', '投标文件签字': '符合第二章“投标人须知”第3.7.3(4)目规定', '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨', '联合体投标人': '提交联合体协议书,并明确联合体牵头人', '报价唯一': '只能有一个有效报价', '多标段投标': '符合第二章“投标人须知”第 10.1款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第1.3.1项规定', '工期': '符合第二章“投标人须知”第1.3.2项规定', '工程质量': '符合第二章“投标人须知”第1.3.3项规定'}}
|
||||
|
||||
input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.pdf"
|
||||
output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
|
||||
formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path,input_file,output_folder)
|
||||
data = json.dumps(formal_json, ensure_ascii=False, indent=4)
|
||||
end_time=time.time()
|
||||
elapsed_time = end_time - start_time
|
||||
|
@ -2,14 +2,39 @@ from PyPDF2 import PdfReader, PdfWriter
|
||||
import re # 导入正则表达式库
|
||||
import os # 用于文件和文件夹操作
|
||||
|
||||
def clean_page_numbers(text):
|
||||
# 使用正则表达式删除页码
|
||||
# 假设页码在文本的最开始,紧跟着文字且无空格分隔
|
||||
cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
|
||||
# 删除结尾的页码
|
||||
cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
|
||||
# 删除形如 /129 的页码
|
||||
cleaned_text = re.sub(r'\s*\/\s*\d+\\s*', '', cleaned_text)
|
||||
def extract_common_header(pdf_path):
    """
    Guess the running header shared by the first pages of a PDF.

    The first text line of up to three leading pages is collected, and the
    whitespace-separated tokens that occur in every one of those lines are
    joined with single spaces.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``PdfReader``.

    Returns:
        The shared tokens, each once, in the order they first appear in the
        first sampled header line. An empty string is returned when fewer
        than two of the sampled pages yield any text.
    """
    pdf_document = PdfReader(pdf_path)
    headers = []
    num_pages_to_read = 3  # number of leading pages to sample

    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
        text = pdf_document.pages[i].extract_text()
        if text:  # skip pages without extractable text
            headers.append(text.strip().split('\n')[0])

    if len(headers) < 2:
        return ""  # not enough pages to compare

    first_tokens = headers[0].split()
    shared = set(first_tokens).intersection(*(set(h.split()) for h in headers[1:]))

    # Joining the set directly would emit the tokens in arbitrary order
    # (str hashing is randomised per process). The result is later used for
    # a literal substring replacement on page text, which a scrambled
    # multi-word header would never match, so keep the left-to-right order
    # of the first header line.
    ordered = [token for token in dict.fromkeys(first_tokens) if token in shared]
    return ' '.join(ordered)
|
||||
|
||||
|
||||
def clean_page_content(text, common_header):
    """
    Strip the shared page header and page-number artefacts from page text.

    Args:
        text: Raw text of one page.
        common_header: Header string to remove; a falsy value skips this step.

    Returns:
        ``text`` with the first occurrence of ``common_header`` removed,
        followed by removal of a leading number, a trailing number and every
        "/<digits>" fragment.
    """
    # Only the first occurrence is dropped; the header is assumed to sit at
    # the top of the page.
    body = text.replace(common_header, '', 1) if common_header else text

    page_number_patterns = (
        r'^\s*\d+\s*(?=\D)',  # leading page number, only when a non-digit follows
        r'\s+\d+\s*$',        # trailing page number
        r'\s*\/\s*\d+\s*',    # page counters shaped like "/129"
    )
    for regex in page_number_patterns:
        body = re.sub(regex, '', body)

    return body
|
||||
|
||||
|
||||
@ -39,18 +64,19 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
|
||||
print("提供的页码范围无效。")
|
||||
return output_pdf_path
|
||||
|
||||
|
||||
def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
||||
common_header = extract_common_header(pdf_path)
|
||||
last_begin_index = 0
|
||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
for i, page in enumerate(pdf_document.pages):
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_numbers(text)
|
||||
cleaned_text = clean_page_content(text,common_header)
|
||||
# 检查“第一章”开始的位置
|
||||
if begin_pattern.search(cleaned_text):
|
||||
last_begin_index = i # 更新最后匹配的索引,页码从0开始
|
||||
print(last_begin_index)
|
||||
if output_suffix == "qualification":
|
||||
common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])'
|
||||
# end_pattern = r'^(第[一二三四五六七八九十]+章\s*投标人须知|评标办法|评标办法前附表)'
|
||||
@ -66,7 +92,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
||||
for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_numbers(text)
|
||||
cleaned_text = clean_page_content(text,common_header)
|
||||
# 确定起始页,需在last_begin_index之后
|
||||
if ("资格审查" in cleaned_text or "资质条件" in cleaned_text):
|
||||
if re.search(common_pattern, cleaned_text, re.MULTILINE):
|
||||
@ -94,6 +120,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
||||
|
||||
|
||||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||
common_header=extract_common_header(pdf_path)
|
||||
# 打开PDF文件
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
start_page = None
|
||||
@ -103,10 +130,10 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
page = pdf_document.pages[i]
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_numbers(text)
|
||||
cleaned_text = clean_page_content(text,common_header)
|
||||
# print(cleaned_text)
|
||||
if re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||
if output_suffix == "invalid" and start_page: #仅当提取Invalid的时候,判断初始页码是第一个匹配到的页码,因为招标编号可能存在多个,后面的覆盖前面
|
||||
if output_suffix == "invalid" and start_page: # 仅当提取Invalid的时候,判断初始页码是第一个匹配到的页码,因为招标编号可能存在多个,后面的覆盖前面
|
||||
continue
|
||||
else:
|
||||
start_page = i
|
||||
@ -117,14 +144,14 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
else:
|
||||
condition = i > (start_page + 1)
|
||||
if condition:
|
||||
is_invalid_condition = output_suffix == "invalid" and i > 30 #这边默认无效投标至少有30页
|
||||
is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页
|
||||
if is_invalid_condition or output_suffix != "invalid":
|
||||
end_page = i
|
||||
break
|
||||
|
||||
# 确保找到了起始和结束页面
|
||||
if start_page is None or end_page is None:
|
||||
if output_suffix == "qualification" or "invalid":
|
||||
if output_suffix == "qualification" or output_suffix =="invalid":
|
||||
extract_pages_twice(pdf_path, output_folder, output_suffix)
|
||||
else:
|
||||
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
||||
@ -132,6 +159,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
else:
|
||||
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
||||
|
||||
|
||||
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||
# 确保输出文件夹存在
|
||||
if not os.path.exists(output_folder):
|
||||
@ -142,13 +170,15 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
||||
for file in os.listdir(input_path):
|
||||
if file.endswith(".pdf"):
|
||||
pdf_path = os.path.join(input_path, file)
|
||||
output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern,
|
||||
output_suffix)
|
||||
if output_pdf_path and os.path.isfile(output_pdf_path):
|
||||
generated_files.append(output_pdf_path)
|
||||
return generated_files
|
||||
elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
|
||||
# 处理单个PDF文件
|
||||
output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern,
|
||||
output_suffix)
|
||||
if output_pdf_path and os.path.isfile(output_pdf_path):
|
||||
return [output_pdf_path] # 以列表形式返回,以保持一致性
|
||||
else:
|
||||
@ -165,7 +195,8 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
output_suffix = "tobidders_notice_table"
|
||||
elif selection == 2:
|
||||
# Configure patterns and phrases for "评标办法"
|
||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*评标办法') #考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法'
|
||||
begin_pattern = re.compile(
|
||||
r'第[一二三四五六七八九十]+章\s*评标办法') # 考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法'
|
||||
begin_page = 10
|
||||
end_pattern = re.compile(r'评标办法正文|评标办法')
|
||||
output_suffix = "evaluation_method"
|
||||
@ -173,9 +204,11 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Configure patterns and phrases for "投标人须知正文"
|
||||
begin_pattern = re.compile(r'投标人须知正文')
|
||||
begin_page = 5
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', re.MULTILINE)
|
||||
end_pattern = re.compile(
|
||||
r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
re.MULTILINE)
|
||||
output_suffix = "tobidders_notice"
|
||||
elif selection==4:
|
||||
elif selection == 4:
|
||||
# 配置用于 "资格审查条件" 的正则表达式模式和短语
|
||||
common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])'
|
||||
begin_pattern = re.compile(common_pattern + r'.*(?:资质|能力|信誉).*$', re.MULTILINE)
|
||||
@ -187,6 +220,12 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
)
|
||||
output_suffix = "qualification"
|
||||
elif selection == 5:
|
||||
# 配置用于 "招标公告" 的正则表达式模式和短语
|
||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
|
||||
begin_page = 0
|
||||
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE)
|
||||
output_suffix = "notice"
|
||||
elif selection == 6:
|
||||
# 配置用于 "无效标" 的正则表达式模式和短语
|
||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
|
||||
begin_page = 0
|
||||
@ -199,6 +238,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Process the selected input
|
||||
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
|
||||
|
||||
def truncate_pdf_multiple(input_path, output_folder):
|
||||
truncate_files = []
|
||||
for selection in range(1, 5):
|
||||
@ -206,11 +246,12 @@ def truncate_pdf_multiple(input_path, output_folder):
|
||||
truncate_files.extend(files)
|
||||
return truncate_files
|
||||
|
||||
#TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||
|
||||
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||
if __name__ == "__main__":
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标02.pdf"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件"
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest18.pdf"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
|
||||
# truncate_pdf_multiple(input_path,output_folder)
|
||||
selection = 4 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-无效标
|
||||
selection = 5 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
# # print("生成的文件:", generated_files)
|
||||
|
@ -26,11 +26,11 @@ def extract_text_from_pdf(file_path):
|
||||
text += page_text
|
||||
return text
|
||||
|
||||
def extract_section(text, start_keyword, end_phrases):
|
||||
start_index = text.find(start_keyword)
|
||||
if start_index == -1:
|
||||
return ""
|
||||
|
||||
def extract_section(text, start_pattern, end_phrases):
|
||||
start_match = re.search(start_pattern, text)
|
||||
if not start_match:
|
||||
return "" # 如果没有找到匹配的开始模式,返回空字符串
|
||||
start_index = start_match.start()
|
||||
end_index = len(text)
|
||||
for phrase in end_phrases:
|
||||
# Use multiline mode with `re.MULTILINE`
|
||||
@ -124,26 +124,31 @@ def convert_to_json(file_path, start_word, end_phrases):
|
||||
parsed_data = parse_text_by_heading(text)
|
||||
return parsed_data
|
||||
|
||||
def convert_clause_to_json(input_path,output_folder):
|
||||
def convert_clause_to_json(input_path,output_folder,type=1):
|
||||
if type==1:
|
||||
start_word = "投标人须知正文"
|
||||
end_phrases = [
|
||||
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
||||
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
||||
]
|
||||
else:
|
||||
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
|
||||
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
|
||||
result = convert_to_json(input_path, start_word, end_phrases)
|
||||
output_path = os.path.join(output_folder, "clause.json")
|
||||
file_name = "clause1.json" if type == 1 else "clause2.json"
|
||||
output_path = os.path.join(output_folder, file_name)
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(result, f, indent=4, ensure_ascii=False)
|
||||
return output_path
|
||||
|
||||
if __name__ == "__main__":
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66\\ztbfile_invalid.pdf'
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest16_tobidders_notice.pdf'
|
||||
start_word = "投标人须知正文"
|
||||
end_phrases = [
|
||||
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
||||
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
||||
]
|
||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66'
|
||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
|
||||
try:
|
||||
output_path = convert_clause_to_json(file_path,output_folder)
|
||||
print(f"Final JSON result saved to: {output_path}")
|
||||
|
File diff suppressed because one or more lines are too long
@ -1,23 +1,12 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def extract_content_after_special_chars(content):
    """
    Extract the text that comes after a "checked" marker, up to the next '□'.

    Markers are the control characters U+0001 / U+0002 and '☑', '√', '团'.
    The captured text is stripped of surrounding whitespace. When no marker
    followed by at least one non-'□' character is present, the original
    ``content`` is returned as is.
    """
    marker_then_text = r'[\x01\x02☑√团]([^□]+)'
    hit = re.search(marker_then_text, content)
    return hit.group(1).strip() if hit else content
|
||||
|
||||
def load_json(file_path):
    """
    Load a JSON file and normalise the brackets in it to full-width brackets.

    The file is read as UTF-8, parsed with ``json.load`` and the parsed data
    is passed through ``standardize_brackets_in_json`` before being returned.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)  # presumably a dict for the files used here
    return standardize_brackets_in_json(parsed)
|
||||
|
||||
def standardize_brackets_in_json(data):
|
||||
@ -146,10 +135,17 @@ def process_and_merge_entries(entries_with_numbers, primary_json_path, secondary
|
||||
combined_results = find_entries_in_jsons(entries_with_numbers, primary_json_data, secondary_json_data)
|
||||
return combined_results
|
||||
|
||||
def process_and_merge2(entries_with_numbers, json_path):
    """
    Look up numbered entries in a single JSON file.

    Loads ``json_path`` with ``load_json`` and calls ``find_entries_in_jsons``
    with that data as the first JSON source and an empty dict as the second.

    Returns:
        Whatever ``find_entries_in_jsons`` produces for the given entries.
    """
    primary_data = load_json(json_path)
    # There is only one source here, so the secondary lookup table is empty.
    return find_entries_in_jsons(entries_with_numbers, primary_data, {})
|
||||
|
||||
|
||||
#3.4.1 投标保证金,提取截止有问题。
|
||||
if __name__ == "__main__":
|
||||
# Hypothetical entries and file paths for testing
|
||||
# entries_with_numbers = [{'形式评审标准.投标文件签字盖章': '3.7.3(3)'}, {'形式评审标准.多标段投标': '10.1'}, {'形式评审标准.“技术暗标”': '3.7.4(5)'}, {'响应性评审标准.投标内容': '1.3.1'}, {'响应性评审标准.工期': '1.3.2'}, {'响应性评审标准.工程质量': '1.3.3'}, {'响应性评审标准.投标有效期': '3.3.1'}, {'响应性评审标准.投标保证金': '3.4.1'}, {'响应性评审标准.分包计划': '1.11'}]
|
||||
entries_with_numbers=[{'xxx': '3.6.3(5)'}]
|
||||
entries_with_numbers=[{'xxx': '4.2.3'}]
|
||||
primary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\truncate_output.json'
|
||||
secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause.json'
|
||||
|
||||
|
@ -125,12 +125,12 @@ def process_string_list(string_list):
|
||||
else:
|
||||
# 如果没有匹配到内容,返回空列表
|
||||
return []
|
||||
def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须知前附表 条款 评分前附表和资格审查表中
|
||||
def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须知前附表 条款 评分前附表和资格审查表中
|
||||
# output_filename="merged.pdf"
|
||||
# paths=[truncate1,truncate4]
|
||||
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
|
||||
|
||||
file_id=upload_file(truncate4)
|
||||
file_id=upload_file(truncate3)
|
||||
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
|
||||
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
|
||||
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
|
||||
|
@ -8,7 +8,7 @@ from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
|
||||
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path): #评标办法前附表
|
||||
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表
|
||||
# 形式评审、响应评审:千问
|
||||
print("starting形式响应评审...")
|
||||
file_id=upload_file(truncate1) #评标办法前附表
|
||||
@ -21,7 +21,7 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
|
||||
# 创建Future对象
|
||||
future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
|
||||
future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath,
|
||||
clause_path)
|
||||
clause_path,input_file,output_folder)
|
||||
|
||||
# 等待执行结果
|
||||
final_qualify_json = future_qualification.result()
|
||||
@ -32,6 +32,7 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
input_file="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
||||
# truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
|
||||
truncate2=os.path.join(output_folder,"zbtest20_tobidders_notice.pdf")
|
||||
@ -40,5 +41,5 @@ if __name__ == "__main__":
|
||||
truncate3=os.path.join(output_folder,"zbtest20_qualification.pdf")
|
||||
clause_path = convert_clause_to_json(truncate2, output_folder)
|
||||
truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
|
||||
res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path)
|
||||
res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder)
|
||||
print(res)
|
Loading…
x
Reference in New Issue
Block a user