11.5货物标截取优化
This commit is contained in:
parent
f9f17757f3
commit
d4d1a14c06
23
flask_app/general/清除file_id.py
Normal file
23
flask_app/general/清除file_id.py
Normal file
@ -0,0 +1,23 @@
|
||||
import os
|
||||
from openai import OpenAI
|
||||
import json
|
||||
|
||||
# Build an OpenAI-compatible client pointed at the DashScope endpoint.
# The API key is read from the DASHSCOPE_API_KEY environment variable.
client = OpenAI(
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    api_key=os.getenv("DASHSCOPE_API_KEY"),
)

# Request the file listing from the service.
file_stk = client.files.list()

# Round-trip the listing through JSON so it can be handled as plain dicts.
file_data = json.loads(file_stk.model_dump_json())

# Gather the id of every file present in the returned listing.
file_ids = []
for entry in file_data["data"]:
    file_ids.append(entry["id"])

# Delete the files one by one, printing each deletion response as JSON.
for file_id in file_ids:
    file_object = client.files.delete(file_id)
    print(file_object.model_dump_json())
|
@ -337,17 +337,17 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Selection 2: 评标办法
|
||||
pattern_pairs = [
|
||||
(
|
||||
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(磋商|谈判|评标|评定|评审)\s*(办法|方法)'),
|
||||
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
|
||||
# Alternative begin pattern
|
||||
re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
# Alternative end pattern
|
||||
),
|
||||
(
|
||||
re.compile(
|
||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$|\s*评标(办法|方法)前附表\s*$',
|
||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()()]*\s*$|\s*评标(办法|方法)前附表\s*$',
|
||||
re.MULTILINE
|
||||
),
|
||||
re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
||||
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
||||
)
|
||||
]
|
||||
output_suffix = "evaluation_method"
|
||||
@ -559,22 +559,23 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
|
||||
|
||||
|
||||
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||
# TODO:zbtest8 zbtest18有问题 后期需要完善,截取需要截两次,第一次严格第二次宽松
|
||||
# TODO:目前merged_baseinfo没有包含投标人须知正文。
|
||||
|
||||
#TODO:zbtest20有问题
|
||||
# 投标人须知前附表改为货物标一样的
|
||||
if __name__ == "__main__":
|
||||
start_time = time.time()
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf"
|
||||
input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf"
|
||||
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\tmp"
|
||||
# files=truncate_pdf_multiple(input_path,output_folder)
|
||||
selections = [4, 1] # 仅处理 selection 4、1
|
||||
files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||
# selections = [4, 1] # 仅处理 selection 4、1
|
||||
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||
# print(files)
|
||||
# selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
# print(generated_files)
|
||||
# print("生成的文件:", generated_files)
|
||||
end_time = time.time()
|
||||
|
@ -151,11 +151,12 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
|
||||
def get_patterns_for_procurement():
    """Return the compiled (begin, end) regex pair for the procurement section.

    Both patterns are compiled with ``re.MULTILINE`` so ``^`` anchors at the
    start of every line of the searched text.

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` where

        * ``begin_pattern`` matches a line starting with a chapter/part
          heading ("第…章" / "第…部分") that either mentions one of
          服务/项目/商务/技术 followed later by "要求", or mentions
          采购/需求 anywhere after the heading marker;
        * ``end_pattern`` matches a line starting with any chapter/part
          heading followed by at least one CJK character.
    """
    # NOTE(review): presumably callers use these to locate where the
    # procurement-requirements chapter starts and where the next chapter
    # begins; confirm against the call sites.
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*',
        re.MULTILINE)
    end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
    return begin_pattern, end_pattern
|
||||
|
||||
|
||||
@ -176,21 +177,6 @@ def get_patterns_for_evaluation_method():
|
||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
||||
return begin_pattern, end_pattern
|
||||
|
||||
|
||||
# def get_patterns_for_qualification():
|
||||
# # # 原始匹配逻辑
|
||||
# # begin_pattern_original = re.compile(
|
||||
# # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
|
||||
# # end_pattern_original = re.compile(
|
||||
# # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
#
|
||||
# # 新匹配逻辑
|
||||
# begin_pattern_new = re.compile(
|
||||
# r'^资格性检查', re.MULTILINE)
|
||||
# end_pattern_new = re.compile(
|
||||
# r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
#
|
||||
# return begin_pattern_new, end_pattern_new
|
||||
def get_patterns_for_qualification():
|
||||
# begin_pattern = re.compile(
|
||||
# r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]|资格性检查|资格审查表).*(?:资格|符合性)?.*$',
|
||||
@ -570,20 +556,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
|
||||
return ""
|
||||
|
||||
|
||||
# def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
|
||||
# if output_suffix=='notice':
|
||||
# print(start_page)
|
||||
# base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
# output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
|
||||
# output_doc = PdfWriter()
|
||||
# for page_num in range(start_page, end_page + 1):
|
||||
# output_doc.add_page(pdf_document.pages[page_num])
|
||||
# with open(output_pdf_path, 'wb') as f:
|
||||
# output_doc.write(f)
|
||||
# print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
|
||||
# return output_pdf_path
|
||||
|
||||
|
||||
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
|
||||
try:
|
||||
# 检查 start_page 和 end_page 是否为 None
|
||||
@ -701,9 +673,17 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
||||
local_output_suffix = "tobidders_notice"
|
||||
elif selection == 5:
|
||||
begin_pattern = re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*|^第[一二三四五六七八九十百千]+(?:章|部分).*?需求.*')
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*')
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
local_output_suffix = "procurement"
|
||||
|
||||
# begin_pattern = re.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
|
||||
# r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
|
||||
# end_pattern = re.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
|
||||
elif selection == 6:
|
||||
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
@ -825,11 +805,11 @@ if __name__ == "__main__":
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1"
|
||||
# files = truncate_pdf_multiple(input_path, output_folder)
|
||||
# selections = [1,4]
|
||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||
# print(files)
|
||||
selection = 3# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
selection = 5# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
print(generated_files)
|
@ -434,7 +434,7 @@ def combine_qualification_review(invalid_path, output_folder, qualification_path
|
||||
else:
|
||||
file_to_process = qualification_path
|
||||
|
||||
combined_res = process_file(file_to_process)
|
||||
combined_res = process_file(file_to_process,invalid_path)
|
||||
match_keys = find_chapter_clause_references(combined_res)
|
||||
|
||||
if not match_keys:
|
||||
|
Loading…
x
Reference in New Issue
Block a user