9.6部分文件引用了招标公告中的内容,添加了对二次跳转的处理

This commit is contained in:
zy123 2024-09-06 15:50:52 +08:00
parent cac8ea4136
commit 3f79900ed4
9 changed files with 258 additions and 162 deletions

View File

@ -1,15 +1,28 @@
import re
content="递交投标文件截止之日后120日内"
def extract_content_after_special_chars(content):
    """Return the text that follows a "checked" marker character.

    The first occurrence of one of the marker characters (``\\x01``,
    ``\\x02``, ``☑``, ``√`` or ``团``) is located and the run of characters
    after it, up to but excluding the next ``□``, is returned with
    surrounding whitespace stripped.

    Args:
        content: The string to search.

    Returns:
        The stripped text after the marker, or ``content`` unchanged when
        no marker followed by at least one non-``□`` character is found.
    """
    marker_regex = re.compile(r'[\x01\x02☑√团]([^□]+)')
    found = marker_regex.search(content)
    if found is None:
        # No marker present: hand the input back untouched.
        return content
    selected = found.group(1)
    return selected.strip()
from PyPDF2 import PdfReader
res=extract_content_after_special_chars(content)
def extract_common_header(pdf_path):
    """Return the words shared by the first text line of the leading pages.

    The first line of extracted text is collected from up to the first
    three pages of the PDF. Every whitespace-separated word of the first
    collected line that also occurs in all the other collected lines is
    kept, and the kept words are joined with single spaces.

    The words are emitted in the order (and multiplicity) in which they
    appear in the first collected line. Joining the result of a ``set``
    intersection instead yields an arbitrary word order, which makes the
    returned header unstable and unlikely to match the page text verbatim.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        The common header string, or ``""`` when fewer than two of the
        inspected pages yield any text.
    """
    pdf_document = PdfReader(pdf_path)
    headers = []
    num_pages_to_read = 3  # number of leading pages to sample
    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
        text = pdf_document.pages[i].extract_text()
        if text:  # only pages that actually produced text take part
            headers.append(text.strip().split('\n')[0])

    if len(headers) < 2:
        # Not enough pages to compare against each other.
        return ""

    # Membership sets for every header except the first one.
    other_word_sets = [set(header.split()) for header in headers[1:]]
    # Walk the first header in reading order so the result is deterministic.
    common_words = [
        word for word in headers[0].split()
        if all(word in words for words in other_word_sets)
    ]
    return ' '.join(common_words)
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
res=extract_common_header(input_path)
print(res)

View File

@ -81,8 +81,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
# 调用大模型回答项目基础信息
print("starting基础信息...")
baseinfo_list = []
# baseinfo_file_path='../static/提示词/前两章提问总结.txt'
baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
baseinfo_file_path='../static/提示词/前两章提问总结.txt'
# baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
questions = read_questions_from_file(baseinfo_file_path)
res1 = multi_threading(questions, knowledge_name)
for _, response in res1: # _占位代表ques;response[0]也是ques;response[1]是ans
@ -97,8 +97,8 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
# 判断是否分包、是否需要递交投标保证金等
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
baseinfo_list.append(merged)
# judge_file_path = '../static/提示词/是否相关问题.txt'
judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
judge_file_path = '../static/提示词/是否相关问题.txt'
# judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)

View File

@ -4,8 +4,10 @@ import json
import time
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.main.json_utils import extract_content_from_json
from flask_app.main.截取pdf import truncate_pdf_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
@ -28,7 +30,7 @@ prompt = """
"""
def update_json_data(original_data, updates, second_response_list):
def update_json_data(original_data, updates1, updates2,second_response_list):
"""
根据提供的更新字典覆盖原始JSON数据中对应的键值支持点分隔的键来表示嵌套结构
参数:
@ -49,7 +51,9 @@ def update_json_data(original_data, updates, second_response_list):
data[keys[-1]] = value
# 合并 updates 到 original_data 中
for key, value in updates.items():
for key, value in updates1.items():
recursive_update(original_data, key, value)
for key,value in updates2.items():
recursive_update(original_data, key, value)
# 遍历 second_response_list 中的每个字典,并合并到 original_data 中
@ -59,8 +63,22 @@ def update_json_data(original_data, updates, second_response_list):
return original_data
def judge_second_jump(input_dict):
    """Split off entries whose value refers to the tender announcement.

    Every entry whose value contains '招标公告' or '第一章' is removed from
    ``input_dict`` (the dictionary is modified in place) and replaced by a
    question string built from its key.

    Args:
        input_dict: Mapping of key to value; values are tested with ``in``.

    Returns:
        A tuple ``(input_dict, formatted_questions)``: the same dictionary
        object without the removed entries, and the list of generated
        question strings in the original iteration order.
    """
    markers = ('招标公告', '第一章')
    # Collect the keys first; the dictionary must not change size while iterating.
    jump_keys = [
        key for key, value in input_dict.items()
        if any(marker in value for marker in markers)
    ]
    formatted_questions = [
        f"关于'{key}'第一章招标公告中的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
        for key in jump_keys
    ]
    # Drop the handled entries from the caller's dictionary.
    for key in jump_keys:
        input_dict.pop(key)
    return input_dict, formatted_questions
def extract_matching_keys(json_data):
#'投标人名称': '符合招标公告第3.2条规定'
# 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。
if isinstance(json_data, str):
data = json.loads(json_data)
@ -68,7 +86,7 @@ def extract_matching_keys(json_data):
data = json_data
# 正则表达式匹配
include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目")]
include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"第.*?条")]
additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
exclude_patterns = ["投标文件格式", "权利义务", "技术标准","工程量清单"]
additional_exclude_patterns = ["按要求", "按规定"]
@ -108,51 +126,67 @@ def extract_matching_keys(json_data):
return final_matching
#TODO:如果要引用到招标公告中的内容,考虑提取 或者qianwen-long
def reformat_questions(match_keys):
"""
[{'形式评审标准.多标段投标': '符合第一章“招标公告”第 3.3款规定'}, {'形式评审标准.投标文件的签署': '符合第二章“投标人须知”第 3.6.3(5)目规定'}, {'形式评审标准.投标保证金': '符合第二章“投标人须知”第 3.4.1项规定'}, {'形式评审标准.工程分包(如有)': '符合第二章“投标人须知”第 1.11款规定'}, {'响应性评审标准.投标内容': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.监理服务阶段': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.监理工作范围': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.监理服务期': '符合第二章“投标人须知”第 1.3款的规定'}, {'响应性评审标准.投标有效期': '符合第二章“投标人须知”第 3.3.1项的规定'}, {'响应性评审标准.投标保证金': '符合第二章“投标人须知”第 3.4.1项的规定'}, {'响应性评审标准.重大偏差': '见第二章“投标人须知”第 1.12款规定'}]
"""
"""
根据是否包含特定序号格式如3.7.4或3.7.4(5)或3.7.45重新格式化匹配到的评审条目
若包含序号则提取出来若不包含则生成格式化的问题字符串
"""
def reformat_questions(match_keys, input_path, output_folder):
entries_with_numbers = []
entries_with_numbers2 = [] # 用于保存特别处理的条目
formatted_questions = []
clause2_path = ""
# 正则表达式,同时匹配全角和半角括号
pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\](\d+)[\)\])?') #识别包含数字序列的特定格式 eg:3.7.4(5) 3.4.1
pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\](\d+)[\)\])?') # 识别包含数字序列的特定格式
# 初始化文件处理
generated_files = None
file_processed = False
for entry in match_keys:
key, value = next(iter(entry.items()))
if '招标公告' in value or '第一章' in value:
formatted_entry = f"关于‘{key}{value.replace('符合', '')}的内容是怎样的请按json格式给我提供信息键名为'{key}',如果存在未知信息,请在对应键值处填'未知'"
formatted_questions.append(formatted_entry)
continue # 继续处理下一个条目
match = pattern.search(value)
if match:
# 如果存在序号,保存序号与对应的键值对,包括括号内的数字(如果存在)
if '招标公告' in value or '第一章' in value:
if not file_processed:
# 只处理一次文件操作
generated_files = truncate_pdf_main(input_path, output_folder, 5)
file_processed = True # 设置标志位,避免重复处理文件
if generated_files:
clause2_path = convert_clause_to_json(generated_files[0], output_folder, 2)
if match and generated_files:
# 提取序号并添加到entries_with_numbers2
num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
entries_with_numbers2.append({key: num})
else:
revised_value = value.replace('符合', '')
formatted_entry = f"关于'{key}'{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
formatted_questions.append(formatted_entry)
elif match:
num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
entries_with_numbers.append({key: num})
else:
# 如果不存在序号,删除“符合”并格式化文本
revised_standard = re.sub(r'符合', '', value)
formatted_entry = f"关于‘{key}{revised_standard}的内容是怎样的请按json格式给我提供信息键名为'{key}',如果存在未知信息,请在对应键值处填'未知'"
revised_value = value.replace('符合', '')
formatted_entry = f"关于'{key}'{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
formatted_questions.append(formatted_entry)
print(formatted_questions)
return entries_with_numbers, formatted_questions
return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path):
matched_keys = extract_matching_keys(original_dict_data) #[{'形式评审标准.投标文件签字盖章': '符合第二章“投标人须知”第 3.7.34目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典
entries_with_numbers, formatted_questions = reformat_questions(matched_keys)
results_2 = multi_threading(formatted_questions, knowledge_name, True) #无序号的直接问大模型
second_response_list = []
def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path,input_file,output_folder):
combined_results2={}
matched_keys = extract_matching_keys(original_dict_data) #[{'形式评审标准.投标文件签字盖章': '符合第一章“投标人须知”第 3.7.34目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] 提取值中包含"符合"的字典
entries_with_numbers, formatted_questions1,entries_with_numbers2, clause2_path = reformat_questions(matched_keys,input_file,output_folder)
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath,
clause_json_path) # 调用根据条款号整合json.py
combined_results1, formatted_questions2 = judge_second_jump(combined_results)
if entries_with_numbers2: #跳转第一章招标公告
combined_results2=process_and_merge2(entries_with_numbers2,clause2_path)
results_2 = multi_threading(formatted_questions1+formatted_questions2, knowledge_name, True) #无序号的直接问大模型
first_response_list = []
for _, response in results_2:
try:
if response and len(response) > 1: # 检查response存在且有至少两个元素
temp = extract_content_from_json(response[1])
second_response_list.append(temp)
first_response_list.append(temp)
else:
print(f"Warning: Missing or incomplete response data for query index {_}.")
except Exception as e:
@ -160,18 +194,20 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
# Assume JSON file paths are defined or configured correctly
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.45'}]
combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path) #调用根据条款号整合json.py
updated_json = update_json_data(original_dict_data, combined_results, second_response_list)
updated_json = update_json_data(original_dict_data, combined_results1, combined_results2,first_response_list)
return updated_json
if __name__ == "__main__":
start_time=time.time()
knowledge_name="zbtest20"
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\truncate_output.json"
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause.json"
original_dict_data={'形式评审标准': {'多标段投标': '符合第一章“招标公告”第 3.3款规定', '投标文件': '投标文件能正常打开', '投标人名称': '与营业执照、资质证书一致', '投标文件的签署': '符合第二章“投标人须知”第 3.6.3(5)目规定', '投标保证金': '符合第二章“投标人须知”规定', '投标文件的格式、内容': '符合第八章“投标文件格式”的格式规定、实质性内容齐全,关键字迹清晰可辨', '投标报价': '①一份投标文件应只有一个投标报价,未提交选择性报价; ②投标函中报价与监理服务费报价汇总表中的报价保持一致; ③投标人未提交调价函。', '联合体投标人(如有)': '提交联合体协议书,并明确联合体牵头人', '工程分包(如有)': '符合第二章“投标人须知”第 1.11款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第 1.3款的规定', '监理服务阶段': '符合第二章“投标人须知”第 1.3款的规定', '监理工作范围': '符合第二章“投标人须知”第 1.3款的规定', '投标报价': '①投标报价不高于最高投标限价(如果有); ②投标报价不低于成本价。', '监理服务期': '符合第二章“投标人须知”第 1.3款的规定', '投标有效期': '符合第二章“投标人须知”第 3.3.1项的规定', '投标保证金': '符合第二章“投标人须知”第 3.4.1项的规定', '权利义务': '符合或优于第四章“合同条款及格式”规定的权利义务', '技术标准': '符合第七章“技术标准”规定', '招标人不能接受的条件': '未附有招标人不能接受的条件', '重大偏差': '见第二章“投标人须知”第 1.12款规定'}}
formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path)
knowledge_name="zbfile"
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
original_dict_data={'形式评审标准': {'投标文件': '符合第一章规定', '投标人名称': '符合招标公告第3.2条规定', '投标文件签字': '符合第二章“投标人须知”第3.7.34目规定', '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨', '联合体投标人': '提交联合体协议书,并明确联合体牵头人', '报价唯一': '只能有一个有效报价', '多标段投标': '符合第二章“投标人须知”第 10.1款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第1.3.1项规定', '工期': '符合第二章“投标人须知”第1.3.2项规定', '工程质量': '符合第二章“投标人须知”第1.3.3项规定'}}
input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path,input_file,output_folder)
data = json.dumps(formal_json, ensure_ascii=False, indent=4)
end_time=time.time()
elapsed_time = end_time - start_time

View File

@ -2,14 +2,39 @@ from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
def clean_page_numbers(text):
# 使用正则表达式删除页码
# 假设页码在文本的最开始,紧跟着文字且无空格分隔
cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
# 删除结尾的页码
cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
# 删除形如 /129 的页码
cleaned_text = re.sub(r'\s*\/\s*\d+\\s*', '', cleaned_text)
def extract_common_header(pdf_path):
    """Return the words shared by the first text line of the leading pages.

    The first line of extracted text is collected from up to the first
    three pages of the PDF. Every whitespace-separated word of the first
    collected line that also occurs in all the other collected lines is
    kept, and the kept words are joined with single spaces.

    The words are emitted in the order (and multiplicity) in which they
    appear in the first collected line. Joining the result of a ``set``
    intersection instead yields an arbitrary word order, which makes the
    returned header unstable and unlikely to match the page text verbatim.

    Args:
        pdf_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        The common header string, or ``""`` when fewer than two of the
        inspected pages yield any text.
    """
    pdf_document = PdfReader(pdf_path)
    headers = []
    num_pages_to_read = 3  # number of leading pages to sample
    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
        text = pdf_document.pages[i].extract_text()
        if text:  # only pages that actually produced text take part
            headers.append(text.strip().split('\n')[0])

    if len(headers) < 2:
        # Not enough pages to compare against each other.
        return ""

    # Membership sets for every header except the first one.
    other_word_sets = [set(header.split()) for header in headers[1:]]
    # Walk the first header in reading order so the result is deterministic.
    common_words = [
        word for word in headers[0].split()
        if all(word in words for words in other_word_sets)
    ]
    return ' '.join(common_words)
def clean_page_content(text, common_header):
    """Strip the shared page header and page-number artefacts from page text.

    Args:
        text: Extracted text of one page.
        common_header: Header string to remove; a falsy value disables the
            header removal step.

    Returns:
        The text with the first occurrence of ``common_header`` removed,
        followed by three regex substitutions applied in order: a leading
        number directly followed by a non-digit, a trailing number, and
        every "/ <digits>" fragment.
    """
    # Only the first occurrence of the header is dropped.
    cleaned_text = text.replace(common_header, '', 1) if common_header else text

    page_number_patterns = (
        r'^\s*\d+\s*(?=\D)',  # number at the very start, followed by a non-digit
        r'\s+\d+\s*$',        # number at the very end
        r'\s*\/\s*\d+\s*',    # fragments such as "/129"
    )
    for page_number_pattern in page_number_patterns:
        cleaned_text = re.sub(page_number_pattern, '', cleaned_text)
    return cleaned_text
@ -39,18 +64,19 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
print("提供的页码范围无效。")
return output_pdf_path
def extract_pages_twice(pdf_path, output_folder, output_suffix):
common_header = extract_common_header(pdf_path)
last_begin_index = 0
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
pdf_document = PdfReader(pdf_path)
for i, page in enumerate(pdf_document.pages):
text = page.extract_text()
if text:
cleaned_text = clean_page_numbers(text)
cleaned_text = clean_page_content(text,common_header)
# 检查“第一章”开始的位置
if begin_pattern.search(cleaned_text):
last_begin_index = i # 更新最后匹配的索引页码从0开始
print(last_begin_index)
if output_suffix == "qualification":
common_pattern = r'^(?:附录(?:一)?[:]|附件(?:一)?[:]|附表(?:一)?[:])'
# end_pattern = r'^(第[一二三四五六七八九十]+章\s*投标人须知|评标办法|评标办法前附表)'
@ -66,7 +92,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
text = page.extract_text()
if text:
cleaned_text = clean_page_numbers(text)
cleaned_text = clean_page_content(text,common_header)
# 确定起始页需在last_begin_index之后
if ("资格审查" in cleaned_text or "资质条件" in cleaned_text):
if re.search(common_pattern, cleaned_text, re.MULTILINE):
@ -94,6 +120,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
common_header=extract_common_header(pdf_path)
# 打开PDF文件
pdf_document = PdfReader(pdf_path)
start_page = None
@ -103,10 +130,10 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
page = pdf_document.pages[i]
text = page.extract_text()
if text:
cleaned_text = clean_page_numbers(text)
cleaned_text = clean_page_content(text,common_header)
# print(cleaned_text)
if re.search(begin_pattern, cleaned_text) and i > begin_page:
if output_suffix == "invalid" and start_page: #仅当提取Invalid的时候判断初始页码是第一个匹配到的页码因为招标编号可能存在多个后面的覆盖前面
if output_suffix == "invalid" and start_page: # 仅当提取Invalid的时候判断初始页码是第一个匹配到的页码因为招标编号可能存在多个后面的覆盖前面
continue
else:
start_page = i
@ -117,14 +144,14 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
else:
condition = i > (start_page + 1)
if condition:
is_invalid_condition = output_suffix == "invalid" and i > 30 #这边默认无效投标至少有30页
is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页
if is_invalid_condition or output_suffix != "invalid":
end_page = i
break
# 确保找到了起始和结束页面
if start_page is None or end_page is None:
if output_suffix == "qualification" or "invalid":
if output_suffix == "qualification" or output_suffix =="invalid":
extract_pages_twice(pdf_path, output_folder, output_suffix)
else:
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
@ -132,6 +159,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
else:
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
# 确保输出文件夹存在
if not os.path.exists(output_folder):
@ -142,13 +170,15 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
for file in os.listdir(input_path):
if file.endswith(".pdf"):
pdf_path = os.path.join(input_path, file)
output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern,
output_suffix)
if output_pdf_path and os.path.isfile(output_pdf_path):
generated_files.append(output_pdf_path)
return generated_files
elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
# 处理单个PDF文件
output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern,
output_suffix)
if output_pdf_path and os.path.isfile(output_pdf_path):
return [output_pdf_path] # 以列表形式返回,以保持一致性
else:
@ -165,7 +195,8 @@ def truncate_pdf_main(input_path, output_folder, selection):
output_suffix = "tobidders_notice_table"
elif selection == 2:
# Configure patterns and phrases for "评标办法"
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*评标办法') #考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法'
begin_pattern = re.compile(
r'第[一二三四五六七八九十]+章\s*评标办法') # 考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法'
begin_page = 10
end_pattern = re.compile(r'评标办法正文|评标办法')
output_suffix = "evaluation_method"
@ -173,9 +204,11 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Configure patterns and phrases for "投标人须知正文"
begin_pattern = re.compile(r'投标人须知正文')
begin_page = 5
end_pattern = re.compile(r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]', re.MULTILINE)
end_pattern = re.compile(
r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
re.MULTILINE)
output_suffix = "tobidders_notice"
elif selection==4:
elif selection == 4:
# 配置用于 "资格审查条件" 的正则表达式模式和短语
common_pattern = r'^(?:附录(?:一)?[:]|附件(?:一)?[:]|附表(?:一)?[:])'
begin_pattern = re.compile(common_pattern + r'.*(?:资质|能力|信誉).*$', re.MULTILINE)
@ -187,6 +220,12 @@ def truncate_pdf_main(input_path, output_folder, selection):
)
output_suffix = "qualification"
elif selection == 5:
# 配置用于 "招标公告" 的正则表达式模式和短语
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
begin_page = 0
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE)
output_suffix = "notice"
elif selection == 6:
# 配置用于 "无效标" 的正则表达式模式和短语
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
begin_page = 0
@ -199,6 +238,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Process the selected input
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
def truncate_pdf_multiple(input_path, output_folder):
truncate_files = []
for selection in range(1, 5):
@ -206,11 +246,12 @@ def truncate_pdf_multiple(input_path, output_folder):
truncate_files.extend(files)
return truncate_files
#TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下比如说就不进行跳转见xx表 开评定标这里也要考虑 如果评分表为空,也要处理。
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下比如说就不进行跳转见xx表 开评定标这里也要考虑 如果评分表为空,也要处理。
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标02.pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件"
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest18.pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
# truncate_pdf_multiple(input_path,output_folder)
selection = 4 # 例如1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-无效标
selection = 5 # 例如1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
generated_files = truncate_pdf_main(input_path, output_folder, selection)
# # print("生成的文件:", generated_files)

View File

@ -26,11 +26,11 @@ def extract_text_from_pdf(file_path):
text += page_text
return text
def extract_section(text, start_keyword, end_phrases):
start_index = text.find(start_keyword)
if start_index == -1:
return ""
def extract_section(text, start_pattern, end_phrases):
start_match = re.search(start_pattern, text)
if not start_match:
return "" # 如果没有找到匹配的开始模式,返回空字符串
start_index = start_match.start()
end_index = len(text)
for phrase in end_phrases:
# Use multiline mode with `re.MULTILINE`
@ -124,26 +124,31 @@ def convert_to_json(file_path, start_word, end_phrases):
parsed_data = parse_text_by_heading(text)
return parsed_data
def convert_clause_to_json(input_path,output_folder):
start_word = "投标人须知正文"
end_phrases = [
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
]
def convert_clause_to_json(input_path,output_folder,type=1):
if type==1:
start_word = "投标人须知正文"
end_phrases = [
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
]
else:
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
result = convert_to_json(input_path, start_word, end_phrases)
output_path = os.path.join(output_folder, "clause.json")
file_name = "clause1.json" if type == 1 else "clause2.json"
output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False)
return output_path
if __name__ == "__main__":
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66\\ztbfile_invalid.pdf'
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest16_tobidders_notice.pdf'
start_word = "投标人须知正文"
end_phrases = [
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
]
output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66'
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
try:
output_path = convert_clause_to_json(file_path,output_folder)
print(f"Final JSON result saved to: {output_path}")

File diff suppressed because one or more lines are too long

View File

@ -1,23 +1,12 @@
import json
import re
def extract_content_after_special_chars(content):
    """Return the text that follows a "checked" marker character.

    The first occurrence of one of the marker characters (``\\x01``,
    ``\\x02``, ``☑``, ``√`` or ``团``) is located and the run of characters
    after it, up to but excluding the next ``□``, is returned with
    surrounding whitespace stripped.

    Args:
        content: The string to search.

    Returns:
        The stripped text after the marker, or ``content`` unchanged when
        no marker followed by at least one non-``□`` character is found.
    """
    marker_regex = re.compile(r'[\x01\x02☑√团]([^□]+)')
    found = marker_regex.search(content)
    if found is None:
        # No marker present: hand the input back untouched.
        return content
    selected = found.group(1)
    return selected.strip()
def load_json(file_path):
"""
加载JSON文件并统一其中的括号为全角括号
"""
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
data = json.load(f) #字典
return standardize_brackets_in_json(data)
def standardize_brackets_in_json(data):
@ -146,10 +135,17 @@ def process_and_merge_entries(entries_with_numbers, primary_json_path, secondary
combined_results = find_entries_in_jsons(entries_with_numbers, primary_json_data, secondary_json_data)
return combined_results
def process_and_merge2(entries_with_numbers,json_path):
    """Look up numbered entries in a single JSON file.

    The file at ``json_path`` is read with ``load_json`` and searched with
    ``find_entries_in_jsons``, passing an empty dict as the secondary data
    source.

    Args:
        entries_with_numbers: Entries forwarded to ``find_entries_in_jsons``.
        json_path: Path of the JSON file to load.

    Returns:
        Whatever ``find_entries_in_jsons`` returns for the loaded data.
    """
    primary_data = load_json(json_path)
    # No secondary source for this lookup, hence the empty dict.
    return find_entries_in_jsons(entries_with_numbers, primary_data, {})
#3.4.1 投标保证金,提取截止有问题。
if __name__ == "__main__":
# Hypothetical entries and file paths for testing
# entries_with_numbers = [{'形式评审标准.投标文件签字盖章': '3.7.3(3)'}, {'形式评审标准.多标段投标': '10.1'}, {'形式评审标准.“技术暗标”': '3.7.4(5)'}, {'响应性评审标准.投标内容': '1.3.1'}, {'响应性评审标准.工期': '1.3.2'}, {'响应性评审标准.工程质量': '1.3.3'}, {'响应性评审标准.投标有效期': '3.3.1'}, {'响应性评审标准.投标保证金': '3.4.1'}, {'响应性评审标准.分包计划': '1.11'}]
entries_with_numbers=[{'xxx': '3.6.3(5)'}]
entries_with_numbers=[{'xxx': '4.2.3'}]
primary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\truncate_output.json'
secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause.json'

View File

@ -125,12 +125,12 @@ def process_string_list(string_list):
else:
# 如果没有匹配到内容,返回空列表
return []
def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须知前附表 条款 评分前附表和资格审查表中
def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须知前附表 条款 评分前附表和资格审查表中
# output_filename="merged.pdf"
# paths=[truncate1,truncate4]
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
file_id=upload_file(truncate4)
file_id=upload_file(truncate3)
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些请按json列表格式给我提供信息键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)

View File

@ -8,7 +8,7 @@ from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path): #评标办法前附表
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表
# 形式评审、响应评审:千问
print("starting形式响应评审...")
file_id=upload_file(truncate1) #评标办法前附表
@ -21,7 +21,7 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
# 创建Future对象
future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath,
clause_path)
clause_path,input_file,output_folder)
# 等待执行结果
final_qualify_json = future_qualification.result()
@ -32,6 +32,7 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
if __name__ == "__main__":
input_file="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
# truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
truncate2=os.path.join(output_folder,"zbtest20_tobidders_notice.pdf")
@ -40,5 +41,5 @@ if __name__ == "__main__":
truncate3=os.path.join(output_folder,"zbtest20_qualification.pdf")
clause_path = convert_clause_to_json(truncate2, output_folder)
truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path)
res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder)
print(res)