9.4截取pdf提升健壮性,考虑截取失败的情况
This commit is contained in:
parent
777ef682cb
commit
349c52f2c7
@ -1,23 +1,18 @@
|
|||||||
def add_entry_level1(data, combined_value):
|
import re
|
||||||
# 假设这里我们基于某些逻辑添加了一个条目
|
|
||||||
combined_value.append("Level 1 data: " + data)
|
|
||||||
add_entry_level2("Data from Level 1", combined_value)
|
|
||||||
|
|
||||||
def add_entry_level2(data, combined_value):
|
|
||||||
# 在第二层,我们再添加一个条目
|
|
||||||
combined_value.append("Level 2 data: " + data)
|
|
||||||
add_entry_level3("Data from Level 2", combined_value)
|
|
||||||
|
|
||||||
def add_entry_level3(data, combined_value):
|
test_text = """
|
||||||
# 在第三层,我们添加最后一个条目
|
xxxxxxxxxxxxxxxxx
|
||||||
combined_value.append("Level 3 data: " + data)
|
附录: 投标人资质条件、能力和信誉(资格审查标准)
|
||||||
|
xxxxxx
|
||||||
|
附表一:招标文件澄清申请函
|
||||||
|
xxxxxxxxxxxxxxx
|
||||||
|
"""
|
||||||
|
pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::]).*(?:资质|能力|信誉).*$'
|
||||||
|
|
||||||
# 主函数或测试代码
|
match = re.search(pattern, test_text, re.MULTILINE)
|
||||||
def main():
|
|
||||||
combined_value = []
|
|
||||||
initial_data = "Initial Entry"
|
|
||||||
add_entry_level1(initial_data, combined_value)
|
|
||||||
print("Combined values after processing:", combined_value)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if match:
|
||||||
main()
|
print("匹配到的行:", match.group())
|
||||||
|
else:
|
||||||
|
print("没有匹配到符合条件的行")
|
@ -158,23 +158,23 @@ def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
start_time=time.time()
|
# start_time=time.time()
|
||||||
# 读取问题列表
|
# # 读取问题列表
|
||||||
questions =read_questions_from_file('../static/提示词/前两章提问总结.txt')
|
# questions =read_questions_from_file('../static/提示词/前两章提问总结.txt')
|
||||||
for i in questions:
|
# for i in questions:
|
||||||
print(i)
|
# print(i)
|
||||||
knowledge_name = "招标解析5word"
|
# knowledge_name = "招标解析5word"
|
||||||
llm_type=1
|
# llm_type=1
|
||||||
results = multi_threading(questions, knowledge_name)
|
# results = multi_threading(questions, knowledge_name)
|
||||||
end_time = time.time()
|
# end_time = time.time()
|
||||||
if not results:
|
# if not results:
|
||||||
print("errror!")
|
# print("errror!")
|
||||||
else:
|
# else:
|
||||||
print("elapsed time:"+str(end_time-start_time))
|
# print("elapsed time:"+str(end_time-start_time))
|
||||||
# 打印结果
|
# # 打印结果
|
||||||
for question, response in results:
|
# for question, response in results:
|
||||||
print(f"Question: {question}")
|
# print(f"Question: {question}")
|
||||||
print(f"Response: {response}")
|
# print(f"Response: {response}")
|
||||||
|
|
||||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf"
|
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf"
|
||||||
# file_id = upload_file(file_path)
|
# file_id = upload_file(file_path)
|
||||||
@ -186,4 +186,13 @@ if __name__ == "__main__":
|
|||||||
# # 打印结果
|
# # 打印结果
|
||||||
# for question, response in results:
|
# for question, response in results:
|
||||||
# print(f"Question: {question}")
|
# print(f"Question: {question}")
|
||||||
# print(f"Response: {response}")
|
# print(f"Response: {response}")
|
||||||
|
ques=[]
|
||||||
|
results = multi_threading(ques, "招标解析5word")
|
||||||
|
if not results:
|
||||||
|
print("errror!")
|
||||||
|
else:
|
||||||
|
# 打印结果
|
||||||
|
for question, response in results:
|
||||||
|
print(f"Question: {question}")
|
||||||
|
print(f"Response: {response}")
|
@ -168,3 +168,5 @@ if __name__ == "__main__":
|
|||||||
end_time=time.time()
|
end_time=time.time()
|
||||||
elapsed_time = end_time - start_time
|
elapsed_time = end_time - start_time
|
||||||
print(f"Function execution took {elapsed_time} seconds.")
|
print(f"Function execution took {elapsed_time} seconds.")
|
||||||
|
|
||||||
|
#关于'技术暗标',第二章“投标人须知”规定的内容是怎样的?请按json格式给我提供信息,键名为'技术暗标',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。
|
@ -12,76 +12,127 @@ def clean_page_numbers(text):
|
|||||||
cleaned_text = re.sub(r'\s*\/\s*\d+\\s*', '', cleaned_text)
|
cleaned_text = re.sub(r'\s*\/\s*\d+\\s*', '', cleaned_text)
|
||||||
return cleaned_text
|
return cleaned_text
|
||||||
|
|
||||||
def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix):
|
|
||||||
|
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
    """Copy an inclusive, 0-based page range of a PDF into a new PDF file.

    The output file is named ``<source base name>_<output_suffix>.pdf`` and is
    written into ``output_folder``.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory that receives the new PDF.
        output_suffix: Suffix appended to the source base name.
        start_page: 0-based index of the first page to copy (may be None).
        end_page: 0-based index of the last page to copy, inclusive (may be None).

    Returns:
        The path of the written PDF, or ``""`` when the page range is invalid
        and nothing was written.
    """
    # Validate before touching the file system: an unusable range must not
    # yield a path to a file that was never created.
    if start_page is None or end_page is None or start_page < 0 or start_page > end_page:
        print("提供的页码范围无效。")
        return ""

    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

    pdf_document = PdfReader(pdf_path)
    last_page = len(pdf_document.pages) - 1
    if start_page > last_page:
        print("提供的页码范围无效。")
        return ""
    # Clamp instead of raising IndexError when the requested end lies past
    # the last page of the document.
    end_page = min(end_page, last_page)

    output_doc = PdfWriter()
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])

    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)

    # Page numbers are reported 1-based for the human reader.
    print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
    return output_pdf_path
|
||||||
|
|
||||||
|
def extract_pages_twice(pdf_path, output_folder, output_suffix):
    """Second-chance extraction, used when the first pass found no page range.

    Two fallbacks are implemented, selected by ``output_suffix``:

    * ``"qualification"``: the start is the first page that contains the text
      "资格审查" and a line beginning with an appendix/attachment heading; the
      end is the first later page with a line beginning with one of the
      end headings.
    * ``"invalid"``: no text search; pages from index 1 up to
      ``min(90, two thirds of the page count)`` are taken.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory that receives the extracted PDF.
        output_suffix: Selects the fallback and names the output file.

    Returns:
        The path returned by ``save_pages_to_new_pdf``, or ``""`` when no
        range could be determined or the suffix has no fallback.
    """
    if output_suffix == "qualification":
        # Compile once instead of re-resolving the patterns for every page.
        start_heading = re.compile(
            r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])', re.MULTILINE)
        end_heading = re.compile(
            r'^(第[一二三四五六七八九十]+章\s*投标人须知|评标办法|评标办法前附表)', re.MULTILINE)
        pdf_document = PdfReader(pdf_path)
        start_page = None
        end_page = None

        for i, page in enumerate(pdf_document.pages):
            text = page.extract_text()
            if not text:
                continue
            cleaned_text = clean_page_numbers(text)

            # Page indices are 0-based throughout.
            if start_page is None and "资格审查" in cleaned_text:
                if start_heading.search(cleaned_text):
                    start_page = i

            # The end page must come strictly after the start page, so a page
            # that was just chosen as the start can never also be the end.
            if start_page is not None and i > start_page and end_heading.search(cleaned_text):
                end_page = i
                break

        if start_page is None or end_page is None:
            print(f"未找到起始或结束页在文件 {pdf_path} 中!")
            return ""
        return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)

    if output_suffix == "invalid":
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        # Two thirds of the page count, capped at 90.
        end_page = min(90, int(total_pages * 2 / 3))
        start_page = 1
        if end_page < start_page:
            # Very short document: the computed range is empty.
            print(f"未找到起始或结束页在文件 {pdf_path} 中!")
            return ""
        return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)

    # No fallback exists for other suffixes; report failure the same way as
    # the branches above instead of implicitly returning None.
    return ""
|
||||||
|
|
||||||
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Locate a section of a PDF by start/end patterns and save it as a new PDF.

    Every page's text is cleaned with ``clean_page_numbers`` and searched.
    A page matching ``begin_pattern`` whose index is greater than
    ``begin_page`` becomes the start page; a later page matching
    ``end_pattern`` becomes the end page. If no range is found, the
    "qualification" and "invalid" sections are retried with
    ``extract_pages_twice``.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory that receives the extracted PDF.
        begin_pattern: Regex (string or compiled) marking the start page.
        begin_page: Only pages with an index greater than this may start.
        end_pattern: Regex (string or compiled) marking the end page.
        output_suffix: Section identifier; also names the output file.

    Returns:
        The path of the extracted PDF as returned by the save/fallback
        helpers, or ``""`` when nothing could be extracted.
    """
    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None

    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_numbers(text)

        if re.search(begin_pattern, cleaned_text) and i > begin_page:
            # For "invalid" the FIRST match is the start: the bid number can
            # appear on many pages and later matches must not overwrite it.
            # Compare against None so that page index 0 counts as "found".
            if output_suffix == "invalid" and start_page is not None:
                continue
            start_page = i

        if start_page is not None and re.search(end_pattern, cleaned_text):
            # "qualification" may end on the page right after the start;
            # every other section needs at least one page in between.
            if output_suffix == "qualification":
                far_enough = i > start_page
            else:
                far_enough = i > (start_page + 1)
            if far_enough:
                # The "invalid" section is assumed to span more than 30 pages.
                if output_suffix != "invalid" or i > 30:
                    end_page = i
                    break

    if start_page is None or end_page is None:
        # Membership test, not `== "qualification" or "invalid"` (which is
        # always true), and the fallback result must be returned.
        if output_suffix in ("qualification", "invalid"):
            return extract_pages_twice(pdf_path, output_folder, output_suffix)
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
        return ""
    return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
||||||
|
|
||||||
# 创建一个新的PDF文档保存截取的页面
|
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # Get the base name without extension
|
|
||||||
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
|
|
||||||
output_doc = PdfWriter()
|
|
||||||
|
|
||||||
# 添加需要的页面,从 start_page 开始,包括 end_page
|
|
||||||
for page_num in range(start_page, end_page + 1):
|
|
||||||
output_doc.add_page(pdf_document.pages[page_num])
|
|
||||||
# 保存新的PDF文件
|
|
||||||
with open(output_pdf_path, 'wb') as f:
|
|
||||||
output_doc.write(f)
|
|
||||||
|
|
||||||
print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
|
|
||||||
|
|
||||||
return output_pdf_path
|
|
||||||
|
|
||||||
def process_input(input_path, output_folder, chapter_pattern, begin_page, end_phrases, output_suffix,selection):
|
|
||||||
# 确保输出文件夹存在
|
# 确保输出文件夹存在
|
||||||
if not os.path.exists(output_folder):
|
if not os.path.exists(output_folder):
|
||||||
os.makedirs(output_folder)
|
os.makedirs(output_folder)
|
||||||
if(selection==3 or selection==4 or selection==5):
|
|
||||||
end_phrase_pattern = re.compile('|'.join([phrase for phrase in end_phrases]), re.MULTILINE)
|
|
||||||
else:
|
|
||||||
end_phrase_pattern = re.compile('|'.join([re.escape(phrase) for phrase in end_phrases]))
|
|
||||||
|
|
||||||
if os.path.isdir(input_path):
|
if os.path.isdir(input_path):
|
||||||
generated_files = []
|
generated_files = []
|
||||||
# 遍历文件夹内的所有PDF文件
|
# 遍历文件夹内的所有PDF文件
|
||||||
for file in os.listdir(input_path):
|
for file in os.listdir(input_path):
|
||||||
if file.endswith(".pdf"):
|
if file.endswith(".pdf"):
|
||||||
pdf_path = os.path.join(input_path, file)
|
pdf_path = os.path.join(input_path, file)
|
||||||
output_pdf_path = extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix)
|
output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||||
if output_pdf_path and os.path.isfile(output_pdf_path):
|
if output_pdf_path and os.path.isfile(output_pdf_path):
|
||||||
generated_files.append(output_pdf_path)
|
generated_files.append(output_pdf_path)
|
||||||
return generated_files
|
return generated_files
|
||||||
elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
|
elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
|
||||||
# 处理单个PDF文件
|
# 处理单个PDF文件
|
||||||
output_pdf_path = extract_pages(input_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix)
|
output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||||
if output_pdf_path and os.path.isfile(output_pdf_path):
|
if output_pdf_path and os.path.isfile(output_pdf_path):
|
||||||
return [output_pdf_path] # 以列表形式返回,以保持一致性
|
return [output_pdf_path] # 以列表形式返回,以保持一致性
|
||||||
else:
|
else:
|
||||||
@ -92,47 +143,45 @@ def process_input(input_path, output_folder, chapter_pattern, begin_page, end_ph
|
|||||||
def truncate_pdf_main(input_path, output_folder, selection):
|
def truncate_pdf_main(input_path, output_folder, selection):
|
||||||
if selection == 1:
|
if selection == 1:
|
||||||
# Configure patterns and phrases for "投标人须知前附表"
|
# Configure patterns and phrases for "投标人须知前附表"
|
||||||
pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知')
|
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知')
|
||||||
begin_page = 3
|
begin_page = 3
|
||||||
end_phrases = ["投标人须知正文"]
|
end_pattern = re.compile(r'投标人须知正文')
|
||||||
output_suffix = "tobidders_notice_table"
|
output_suffix = "tobidders_notice_table"
|
||||||
elif selection == 2:
|
elif selection == 2:
|
||||||
# Configure patterns and phrases for "评标办法"
|
# Configure patterns and phrases for "评标办法"
|
||||||
pattern = re.compile(r'第[一二三四五六七八九十]+章\s*评标办法')
|
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*评标办法') #考虑到这种情况 '第三章 第三章 第三章 第三章 评标办法 评标办法 评标办法 评标办法'
|
||||||
begin_page = 10
|
begin_page = 10
|
||||||
end_phrases = ["评标办法正文", "评标办法"]
|
end_pattern = re.compile(r'评标办法正文|评标办法')
|
||||||
output_suffix = "evaluation_method"
|
output_suffix = "evaluation_method"
|
||||||
elif selection == 3:
|
elif selection == 3:
|
||||||
# Configure patterns and phrases for "投标人须知正文"
|
# Configure patterns and phrases for "投标人须知正文"
|
||||||
pattern = re.compile(r'投标人须知正文')
|
begin_pattern = re.compile(r'投标人须知正文')
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
end_phrases = [
|
end_pattern = re.compile(r'^第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', re.MULTILINE)
|
||||||
r'^第[一二三四五六七八九十]+章\s*评标办法',r'^评标办法前附表',r'^附录:', r'^附录一:',r'^附件:', r'^附件一:',r'^附表:', r'^附表一:',r'^附录:', r'^附录一:',r'^附件:', r'^附件一:',r'^附表:', r'^附表一:',
|
|
||||||
]
|
|
||||||
output_suffix = "tobidders_notice"
|
output_suffix = "tobidders_notice"
|
||||||
elif selection==4:
|
elif selection==4:
|
||||||
# 配置用于 "资格审查条件" 的正则表达式模式和短语
|
# 配置用于 "资格审查条件" 的正则表达式模式和短语
|
||||||
appendix_pattern = r'^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]'
|
common_pattern = r'^(?:附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::])'
|
||||||
pattern = re.compile(appendix_pattern)
|
begin_pattern = re.compile(common_pattern + r'.*(?:资质|能力|信誉).*$', re.MULTILINE)
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
end_phrases = [r'评标办法正文', r'评标办法', appendix_pattern]
|
end_pattern = re.compile(
|
||||||
|
common_pattern + r'(?!.*(?:资质|能力|信誉)).*$|' # 原有的模式
|
||||||
|
r'^(第[一二三四五六七八九十]+章\s*评标办法|评标办法前附表)', # 新增的匹配项
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
output_suffix = "qualification"
|
output_suffix = "qualification"
|
||||||
elif selection == 5:
|
elif selection == 5:
|
||||||
# 配置用于 "无效标" 的正则表达式模式和短语
|
# 配置用于 "无效标" 的正则表达式模式和短语
|
||||||
pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
|
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
|
||||||
begin_page = 0
|
begin_page = 0
|
||||||
end_phrases = [
|
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE)
|
||||||
r'第[一二三四五六七八九十]+章\s*合同',
|
|
||||||
r':清标报告',# 添加了新的匹配项
|
|
||||||
r':清标报告'
|
|
||||||
]
|
|
||||||
output_suffix = "invalid"
|
output_suffix = "invalid"
|
||||||
else:
|
else:
|
||||||
print("无效的选择")
|
print("无效的选择")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Process the selected input
|
# Process the selected input
|
||||||
return process_input(input_path, output_folder, pattern, begin_page, end_phrases, output_suffix,selection)
|
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||||
|
|
||||||
def truncate_pdf_multiple(input_path, output_folder):
|
def truncate_pdf_multiple(input_path, output_folder):
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
@ -141,10 +190,11 @@ def truncate_pdf_multiple(input_path, output_folder):
|
|||||||
truncate_files.extend(files)
|
truncate_files.extend(files)
|
||||||
return truncate_files
|
return truncate_files
|
||||||
|
|
||||||
|
#TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
||||||
# truncate_pdf_multiple(input_path,output_folder)
|
# truncate_pdf_multiple(input_path,output_folder)
|
||||||
selection = 4 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-无效标
|
selection = 5 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
# print("生成的文件:", generated_files)
|
# # print("生成的文件:", generated_files)
|
||||||
|
@ -137,7 +137,7 @@ def convert_clause_to_json(input_path,output_folder):
|
|||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66\\ztbfile_tobidders_notice.pdf'
|
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66\\ztbfile_invalid.pdf'
|
||||||
start_word = "投标人须知正文"
|
start_word = "投标人须知正文"
|
||||||
end_phrases = [
|
end_phrases = [
|
||||||
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import json
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||||||
@ -24,12 +23,12 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
output_folder = "C:\\Users\\Administrator\Desktop\\fsdownload\\temp1"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
||||||
truncate1 = os.path.join(output_folder, "ztbfile_tobidders_notice_table.pdf")
|
truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
|
||||||
knowledge_name="zbfile"
|
knowledge_name="zbtest20"
|
||||||
truncate2=os.path.join(output_folder,"ztbfile_evaluation_method.pdf")
|
truncate1=os.path.join(output_folder,"zbtest20_evaluation_method.pdf")
|
||||||
truncate4=os.path.join(output_folder,"ztbfile_qualification.pdf")
|
truncate3=os.path.join(output_folder,"zbtest20_qualification.pdf")
|
||||||
clause_path = convert_clause_to_json(truncate2, output_folder)
|
clause_path = convert_clause_to_json(truncate1, output_folder)
|
||||||
truncate1_jsonpath = os.path.join(output_folder, "truncate_output.json")
|
truncate1_jsonpath = os.path.join(output_folder, "truncate_output.json")
|
||||||
res=combine_review_standards(truncate2,truncate4, knowledge_name,truncate1_jsonpath,clause_path)
|
res=combine_review_standards(truncate1,truncate3, knowledge_name,truncate1_jsonpath,clause_path)
|
||||||
print(res)
|
print(res)
|
@ -1,8 +1,9 @@
|
|||||||
|
# -*- encoding:utf-8 -*-
|
||||||
#资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典
|
#资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json
|
from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json
|
||||||
from flask_app.main.多线程提问 import multi_threading, read_questions_from_file
|
from flask_app.main.多线程提问 import multi_threading
|
||||||
from flask_app.main.通义千问long import upload_file
|
from flask_app.main.通义千问long import upload_file
|
||||||
|
|
||||||
def merge_dictionaries_under_common_key(dicts, common_key):
|
def merge_dictionaries_under_common_key(dicts, common_key):
|
||||||
@ -18,19 +19,22 @@ def merge_dictionaries_under_common_key(dicts, common_key):
|
|||||||
print(f"Warning: Dictionary does not contain the key {common_key}")
|
print(f"Warning: Dictionary does not contain the key {common_key}")
|
||||||
|
|
||||||
return merged_dict
|
return merged_dict
|
||||||
def generate_qual_question(matching_keys_list):
|
def generate_qual_question(matching_keys_list): #这里假设资质、信誉与人员要求 要不都有、要不都没
|
||||||
questions=[]
|
if not matching_keys_list:
|
||||||
# 将列表转换为单引号包裹的格式,并用逗号和空格分隔
|
return []
|
||||||
formatted_keys = ["'{}'".format(key) for key in matching_keys_list]
|
else:
|
||||||
# 将格式化后的关键词列表连接成字符串
|
questions=[]
|
||||||
keys_string = ", ".join(formatted_keys)
|
# 将列表转换为单引号包裹的格式,并用逗号和空格分隔
|
||||||
# 构造完整的问题语句
|
formatted_keys = ["'{}'".format(key) for key in matching_keys_list]
|
||||||
question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string},"
|
# 将格式化后的关键词列表连接成字符串
|
||||||
"请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。")
|
keys_string = "、".join(formatted_keys)
|
||||||
question2="该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则以“未知”填充。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。"
|
# 构造完整的问题语句
|
||||||
questions.append(question1)
|
question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string},"
|
||||||
questions.append(question2)
|
"请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。")
|
||||||
return questions
|
question2="该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则以“未知”填充。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。"
|
||||||
|
questions.append(question1)
|
||||||
|
questions.append(question2)
|
||||||
|
return questions
|
||||||
def extract_matching_keys_qual(dict_data):
|
def extract_matching_keys_qual(dict_data):
|
||||||
# 定义包含模式的列表
|
# 定义包含模式的列表
|
||||||
include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"符合")]
|
include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"符合")]
|
||||||
@ -38,41 +42,27 @@ def extract_matching_keys_qual(dict_data):
|
|||||||
matching_keys = []
|
matching_keys = []
|
||||||
non_matching_keys = {}
|
non_matching_keys = {}
|
||||||
# 定义排除项
|
# 定义排除项
|
||||||
excludes = ['联合体', '禁止投标', '不存在', '不得存在','资格','管理机构','负责人']
|
excludes = ['联合体', '禁止投标', '不存在', '不得存在','资格','管理机构','负责人','人员'] #联合体、禁止投标的情况、人员需要额外问
|
||||||
# 遍历字典中的每个键值对
|
# 遍历字典中的每个键值对
|
||||||
for key, value in dict_data.items():
|
for key, value in dict_data.items():
|
||||||
# 检查键是否包含任何排除项
|
|
||||||
if any(ex in key for ex in excludes):
|
|
||||||
continue # 如果包含排除项,则跳过当前键值对
|
|
||||||
# 检查值是否符合任何一个包含模式
|
# 检查值是否符合任何一个包含模式
|
||||||
if any(pattern.search(value) for pattern in include_patterns):
|
if any(pattern.search(value) for pattern in include_patterns):
|
||||||
# 如果匹配,将键添加到列表中
|
# 如果值符合包含模式,再检查键是否包含任何排除项
|
||||||
matching_keys.append(key)
|
if not any(ex in key for ex in excludes):
|
||||||
|
# 如果键不包含排除项,则添加到匹配键列表中
|
||||||
|
matching_keys.append(key)
|
||||||
else:
|
else:
|
||||||
# 如果不匹配,将键值对添加到不匹配字典中
|
# value中有实质的内容,不需要额外问。
|
||||||
non_matching_keys[key] = value
|
non_matching_keys[key] = value
|
||||||
return matching_keys,non_matching_keys #matching:['资质条件', '财务状况'] non_matching_keys:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'}
|
|
||||||
|
|
||||||
def process_qualification(qualification_review,truncate3,knowledge_name):
|
return matching_keys, non_matching_keys #matching:['资质条件', '财务状况'] non_matching_keys:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'}
|
||||||
# 资格评审
|
|
||||||
matching_keys_list, non_matching_dict = extract_matching_keys_qual(qualification_review)
|
#获取联合体投标的要求,由于它不一定在资格审查表中,故调用rag
|
||||||
user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’
|
def get_consortium_dict(knowledge_name):
|
||||||
file_id2 = upload_file(truncate3)
|
|
||||||
results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表
|
|
||||||
res_list = []
|
|
||||||
if not results2:
|
|
||||||
print("errror!")
|
|
||||||
else:
|
|
||||||
# 打印结果
|
|
||||||
for question, response in results2:
|
|
||||||
cleaned_res = clean_json_string(response)
|
|
||||||
res_list.append(cleaned_res) # 都是问资格评审表得出的
|
|
||||||
merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
|
|
||||||
qualify_list = []
|
qualify_list = []
|
||||||
# qualification_review_file_path = '../static/提示词/资格评审问题.txt' # 替换为你的txt文件路径
|
consortium_questions = [
|
||||||
qualification_review_file_path='flask_app/static/提示词/资格评审问题.txt'
|
"该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)'"]
|
||||||
qualification_review_questions = read_questions_from_file(qualification_review_file_path) # 联合体投标
|
results1 = multi_threading(consortium_questions, knowledge_name)
|
||||||
results1 = multi_threading(qualification_review_questions, knowledge_name)
|
|
||||||
for _, response in results1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
for _, response in results1: # _占位,代表ques;response[0]也是ques;response[1]是ans
|
||||||
try:
|
try:
|
||||||
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
||||||
@ -81,7 +71,54 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
|
|||||||
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing response for query index {_}: {e}")
|
print(f"Error processing response for query index {_}: {e}")
|
||||||
qualify_combined_dict = combine_json_results(qualify_list)
|
consortium_dict = combine_json_results(qualify_list)
|
||||||
updated_qualify_json = add_keys_to_json(merged_dict, qualify_combined_dict) # 合并字典
|
return consortium_dict
|
||||||
final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict)
|
|
||||||
return final_qualify_json
|
def process_qualification(qualification_review, truncate3, knowledge_name):
    """Build the qualification-review result from the evaluation table.

    ``extract_matching_keys_qual`` splits ``qualification_review`` into keys
    whose values only point elsewhere (``matching_keys_list``) and entries
    kept as they are (``non_matching_dict``). Three cases follow:

    1. No matching keys: the kept entries are returned, with the consortium
       requirements added when no key mentions '联合体'.
    2. Matching keys but no extracted qualification PDF: the referenced
       requirements cannot be looked up, so only the kept entries are
       returned.
    3. Otherwise: questions built from the matching keys are asked against
       the uploaded qualification PDF and merged with the consortium
       requirements and the kept entries.

    Args:
        qualification_review: Dict of qualification-review items.
        truncate3: Path of the extracted qualification PDF; empty or None
            when the extraction failed.
        knowledge_name: Knowledge-base name passed to the consortium query.

    Returns:
        The combined qualification data. NOTE(review): the exact type of the
        merged result depends on ``add_keys_to_json`` — confirm with callers.
    """
    matching_keys_list, non_matching_dict = extract_matching_keys_qual(qualification_review)

    if not matching_keys_list:
        # Every requirement is written out in the evaluation table itself.
        if not non_matching_dict:
            print("error!")
            # Previously fell through and returned None.
            return {}
        has_consortium = any('联合体' in key for key in non_matching_dict)
        if has_consortium:
            return non_matching_dict
        consortium_dict = get_consortium_dict(knowledge_name)
        return add_keys_to_json(consortium_dict, non_matching_dict)

    if not truncate3:
        # The table refers to the qualification-review form, but that form
        # was not extracted. Return what is already known instead of None.
        # TODO: query the knowledge base for the referenced requirements.
        print("未截取到资格审查表,仅返回评标办法前附表中的资格评审内容!")
        return non_matching_dict

    # Common case: ask the model about the referenced keys.
    user_querys = generate_qual_question(matching_keys_list)
    file_id2 = upload_file(truncate3)
    results2 = multi_threading(user_querys, "", file_id2, 2)
    res_list = []
    if not results2:
        print("未调用大模型询问资格评审文件要求!")
    else:
        for _question, response in results2:
            res_list.append(clean_json_string(response))

    merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
    consortium_dict = get_consortium_dict(knowledge_name)
    updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)
    return add_keys_to_json(updated_qualify_json, non_matching_dict)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年(2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年(2017 年至今)须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专'}
|
||||||
|
truncate3=""
|
||||||
|
knowledge_name="测试"
|
||||||
|
res=process_qualification(qualification_review,truncate3,knowledge_name)
|
||||||
|
print(res)
|
||||||
|
|
||||||
|
#该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。
|
||||||
|
@ -1,19 +1,19 @@
|
|||||||
#资质要求:
|
#资质要求:
|
||||||
#1.该招标文件对于投标人的资质条件(等级)是怎样的,要求给出完整资质要求内容、需要提交的证明材料,并按json格式给我提供信息,外层键名为'资质条件',对于各个'资质条件',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答有关业绩要求、财务要求、主要人员要求、信誉要求的内容。
|
1.该招标文件对于投标人的资质条件(等级)是怎样的,要求给出完整资质要求内容、需要提交的证明材料,并按json格式给我提供信息,外层键名为'资质条件',对于各个'资质条件',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答有关业绩要求、财务要求、主要人员要求、信誉要求的内容。
|
||||||
|
|
||||||
#业绩要求:
|
#业绩要求:
|
||||||
#2.该招标文件对于投标人的业绩要求是怎样的,至少需要包含业绩时间范围、业绩内容(如有)、金额要求(如有)、需要提交的证明材料、备注(其他关于业绩要求的内容,如有),请按json格式给我提供信息,最外层键名为'业绩要求',若相关要求不存在,则在对应的键值中填'未知'。
|
2.该招标文件对于投标人的业绩要求是怎样的,至少需要包含业绩时间范围、业绩内容(如有)、金额要求(如有)、需要提交的证明材料、备注(其他关于业绩要求的内容,如有),请按json格式给我提供信息,最外层键名为'业绩要求',若相关要求不存在,则在对应的键值中填'未知'。
|
||||||
|
|
||||||
#财务要求:
|
#财务要求:
|
||||||
#3.该招标文件对于投标人的财务要求是怎样的,要求给出财务报告的时间范围、营收(若利润)要求、需要提交的证明材料、备注(其他关于财务要求的内容,如有),请按json格式给我提供信息,最外层键名为'财务要求',若相关要求不存在,则在对应的键值中填'未知'。
|
3.该招标文件对于投标人的财务要求是怎样的,要求给出财务报告的时间范围、营收(若利润)要求、需要提交的证明材料、备注(其他关于财务要求的内容,如有),请按json格式给我提供信息,最外层键名为'财务要求',若相关要求不存在,则在对应的键值中填'未知'。
|
||||||
|
|
||||||
#信誉要求:
|
#信誉要求:
|
||||||
#4.该招标文件对于投标人的信誉要求是怎样的,请按json格式给我提供信息,键名为'信誉要求',对于各个'信誉要求',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答无关信誉要求的内容。
|
4.该招标文件对于投标人的信誉要求是怎样的,请按json格式给我提供信息,键名为'信誉要求',对于各个'信誉要求',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答无关信誉要求的内容。
|
||||||
|
|
||||||
#(存在问题)主要人员要求:
|
#(存在问题)主要人员要求:
|
||||||
#5.该招标文件对于投标人的项目经理(监理)和技术负责人的要求是怎样的,请依次给出需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于项目经理(监理)和技术负责人要求的信息),以json的形式给出,键名分别是"项目经理"和"技术负责人",若相关要求不存在,则以“未知”填充。
|
5.该招标文件对于投标人的项目经理(监理)和技术负责人的要求是怎样的,请依次给出需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于项目经理(监理)和技术负责人要求的信息),以json的形式给出,键名分别是"项目经理"和"技术负责人",若相关要求不存在,则以“未知”填充。
|
||||||
|
|
||||||
#6.该招标文件对于项目管理机构中除项目经理和技术负责人外的其他人员要求是怎样的,请依次给出所需的岗位、对应的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于人员要求的信息),以json的形式给出,最外层键名为"其他人员",嵌套的键名为为具体的岗位名称,若相关要求不存在,则以“未知”填充。
|
6.该招标文件对于项目管理机构中除项目经理和技术负责人外的其他人员要求是怎样的,请依次给出所需的岗位、对应的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于人员要求的信息),以json的形式给出,最外层键名为"其他人员",嵌套的键名为为具体的岗位名称,若相关要求不存在,则以“未知”填充。
|
||||||
|
|
||||||
#(需要与第一章对应)联合体投标:
|
#(需要与第一章对应)联合体投标:
|
||||||
#该招标文件是否接受联合体投标?
|
#该招标文件是否接受联合体投标?
|
||||||
|
Loading…
x
Reference in New Issue
Block a user