11.5货物标截取优化

This commit is contained in:
zy123 2024-11-06 09:17:40 +08:00
parent f9f17757f3
commit d4d1a14c06
4 changed files with 52 additions and 48 deletions

View File

@ -0,0 +1,23 @@
import os
from openai import OpenAI
import json
# Maintenance script: remove every file returned by the file-listing call of
# the OpenAI-compatible DashScope endpoint. The API key is read from the
# DASHSCOPE_API_KEY environment variable.
client = OpenAI(
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    api_key=os.getenv("DASHSCOPE_API_KEY"),
)

# Ask the service for its file listing, then convert the response object
# into plain dicts by round-tripping through its JSON dump.
listing_response = client.files.list()
listing_payload = json.loads(listing_response.model_dump_json())

# Gather all ids up front, so a malformed entry (one without an "id" key)
# aborts the run before anything has been deleted.
pending_ids = [entry["id"] for entry in listing_payload["data"]]

# Delete the files one at a time and echo each deletion response as JSON.
for pending_id in pending_ids:
    deletion_result = client.files.delete(pending_id)
    print(deletion_result.model_dump_json())

View File

@ -337,17 +337,17 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 2: 评标办法 # Selection 2: 评标办法
pattern_pairs = [ pattern_pairs = [
( (
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(磋商|谈判|评标|评定|评审)\s*(办法|方法)'), re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
# Alternative begin pattern # Alternative begin pattern
re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+') re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
# Alternative end pattern # Alternative end pattern
), ),
( (
re.compile( re.compile(
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$|\s*评标(办法|方法)前附表\s*$', r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()]*\s*$|\s*评标(办法|方法)前附表\s*$',
re.MULTILINE re.MULTILINE
), ),
re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
) )
] ]
output_suffix = "evaluation_method" output_suffix = "evaluation_method"
@ -559,22 +559,23 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下比如说就不进行跳转见xx表 开评定标这里也要考虑 如果评分表为空,也要处理。 # TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下比如说就不进行跳转见xx表 开评定标这里也要考虑 如果评分表为空,也要处理。
# TODO:zbtest8 zbtest18有问题 后期需要完善,截取需要截两次,第一次严格第二次宽松
# TODO:目前merged_baseinfo没有包含投标人须知正文。 # TODO:目前merged_baseinfo没有包含投标人须知正文。
#TODO:zbtest20有问题
# 投标人须知前附表改为货物标一样的 # 投标人须知前附表改为货物标一样的
if __name__ == "__main__": if __name__ == "__main__":
start_time = time.time() start_time = time.time()
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf"
input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf" input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\tmp" output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\tmp"
# files=truncate_pdf_multiple(input_path,output_folder) # files=truncate_pdf_multiple(input_path,output_folder)
selections = [4, 1] # 仅处理 selection 4、1 # selections = [4, 1] # 仅处理 selection 4、1
files=truncate_pdf_specific_engineering(input_path,output_folder,selections) # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
# print(files) # print(files)
# selection = 2 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 selection = 2 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
# generated_files = truncate_pdf_main(input_path, output_folder, selection) generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files) # print(generated_files)
# print("生成的文件:", generated_files) # print("生成的文件:", generated_files)
end_time = time.time() end_time = time.time()

View File

@ -151,11 +151,12 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
def get_patterns_for_procurement(): def get_patterns_for_procurement():
begin_pattern = re.compile( begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|' r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*',
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|' re.MULTILINE)
r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE) end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
end_pattern = re.compile( # begin_pattern=re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) # r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(?:服务|项目|商务|技术)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$'
# )
return begin_pattern, end_pattern return begin_pattern, end_pattern
@ -176,21 +177,6 @@ def get_patterns_for_evaluation_method():
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
return begin_pattern, end_pattern return begin_pattern, end_pattern
# def get_patterns_for_qualification():
# # # 原始匹配逻辑
# # begin_pattern_original = re.compile(
# # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
# # end_pattern_original = re.compile(
# # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
#
# # 新匹配逻辑
# begin_pattern_new = re.compile(
# r'^资格性检查', re.MULTILINE)
# end_pattern_new = re.compile(
# r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
#
# return begin_pattern_new, end_pattern_new
def get_patterns_for_qualification(): def get_patterns_for_qualification():
# begin_pattern = re.compile( # begin_pattern = re.compile(
# r'^(?:附录(?:一|1)?[:]|附件(?:一|1)?[:]|附表(?:一|1)?[:]|资格性检查|资格审查表).*(?:资格|符合性)?.*$', # r'^(?:附录(?:一|1)?[:]|附件(?:一|1)?[:]|附表(?:一|1)?[:]|资格性检查|资格审查表).*(?:资格|符合性)?.*$',
@ -570,20 +556,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
return "" return ""
# def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
# if output_suffix=='notice':
# print(start_page)
# base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
# output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
# output_doc = PdfWriter()
# for page_num in range(start_page, end_page + 1):
# output_doc.add_page(pdf_document.pages[page_num])
# with open(output_pdf_path, 'wb') as f:
# output_doc.write(f)
# print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
# return output_pdf_path
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix): def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
try: try:
# 检查 start_page 和 end_page 是否为 None # 检查 start_page 和 end_page 是否为 None
@ -701,9 +673,17 @@ def process_input(input_path, output_folder, selection, output_suffix):
local_output_suffix = "tobidders_notice" local_output_suffix = "tobidders_notice"
elif selection == 5: elif selection == 5:
begin_pattern = re.compile( begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*|^第[一二三四五六七八九十百千]+(?:章|部分).*?需求.*') r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "procurement" local_output_suffix = "procurement"
# begin_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
# r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
# end_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
elif selection == 6: elif selection == 6:
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*') begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
@ -825,11 +805,11 @@ if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp" # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹" output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1"
# files = truncate_pdf_multiple(input_path, output_folder) # files = truncate_pdf_multiple(input_path, output_folder)
# selections = [1,4] # selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections) # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files) # print(files)
selection = 3# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 selection = 5# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection) generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files) print(generated_files)

View File

@ -434,7 +434,7 @@ def combine_qualification_review(invalid_path, output_folder, qualification_path
else: else:
file_to_process = qualification_path file_to_process = qualification_path
combined_res = process_file(file_to_process) combined_res = process_file(file_to_process,invalid_path)
match_keys = find_chapter_clause_references(combined_res) match_keys = find_chapter_clause_references(combined_res)
if not match_keys: if not match_keys: