1.13
This commit is contained in:
parent
a99923bea8
commit
1f0d65e904
@ -95,11 +95,11 @@ def delete_mark(docx_path):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
|
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
|
||||||
# input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
|
input=r'C:\Users\Administrator\Downloads\国网湖北电力荆州供电公司2024年第四次服务授权竞争性谈判采购-采购文件(15DJ04).pdf'
|
||||||
# output=insert_mark(input)
|
output=insert_mark(input)
|
||||||
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
|
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
|
||||||
res=delete_mark(doc_path)
|
# res=delete_mark(doc_path)
|
||||||
if res:
|
# if res:
|
||||||
print(res)
|
# print(res)
|
||||||
else:
|
# else:
|
||||||
print("No")
|
# print("No")
|
@ -294,29 +294,107 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
|||||||
break
|
break
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
|
|
||||||
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header):
|
|
||||||
|
def generate_end_pattern(extraction_stage, chapter_type=None):
    """
    Build the compiled end-of-section pattern for one extraction stage.

    Args:
        extraction_stage (str): Stage marker. 'third' selects the
            third-attempt template; any other value selects the
            first-attempt template.
        chapter_type (str, optional): Heading unit substituted into the
            template, e.g. '章' or '部分'. A falsy value substitutes the
            alternation '章|部分'.

    Returns:
        regex.Pattern: The selected template compiled with regex.MULTILINE.
    """
    # First-attempt template: a "第N<unit> <title>" heading at the start of a
    # line, or a line ending in "评标(方法|办法)前附表" / "评标(方法|办法)正文".
    # Each alternative carries negative lookbehinds for 见 / 与 / quote marks.
    first_stage_template = (
        r'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff]+'
        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$'
        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$'
    )

    # Third-attempt template: a "第N<unit> <title>" heading that ends its line;
    # it has no start-of-line anchor and no lookbehinds.
    third_stage_template = (
        r'第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()()]+\s*$'
    )

    chosen_template = (
        third_stage_template if extraction_stage == 'third' else first_stage_template
    )

    # With no usable chapter type, accept either heading unit.
    heading_unit = chapter_type or '章|部分'

    return regex.compile(
        chosen_template.format(chapter_type=heading_unit),
        regex.MULTILINE,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def generate_mid_pattern(chapter_type=None):
    """
    Build the compiled mid-page pattern for the given chapter type.

    Args:
        chapter_type (str, optional): Heading unit, e.g. '章' or '部分'.
            When it is '章' the pattern additionally accepts a line starting
            with a "第N部分" heading; when it is '部分' it additionally
            accepts a line starting with a "第N章" heading. Any other value
            adds nothing.

    Returns:
        regex.Pattern: The combined pattern compiled with regex.MULTILINE.
    """
    # Base pattern: an optionally numbered "说明 / 总则 / 名词解释" heading at
    # the start of a line, or a line ending in "...须知正文" that is not
    # preceded by 见 / 与 / a quote mark.
    core_pattern = (
        r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|'
        r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
    )

    # The extra alternative uses the heading unit opposite to chapter_type.
    if chapter_type == '章':
        opposite_heading = r'^第[一二三四五六七八九十百千]+?(?:部分)'
    elif chapter_type == '部分':
        opposite_heading = r'^第[一二三四五六七八九十百千]+?(?:章)'
    else:
        opposite_heading = ''

    pattern_source = (
        rf'(?:{core_pattern})|(?:{opposite_heading})'
        if opposite_heading
        else core_pattern
    )

    return regex.compile(pattern_source, regex.MULTILINE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header):
|
||||||
"""
|
"""
|
||||||
从PDF文档中提取起始页、中间页和结束页。
|
从PDF文档中提取起始页、中间页和结束页。
|
||||||
如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取。
|
如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取。
|
||||||
"""
|
"""
|
||||||
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
|
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
|
||||||
def run_extraction(local_begin_pattern, local_end_pattern=None):
|
|
||||||
|
def run_extraction(begin_pattern, extraction_stage='first'):
|
||||||
"""
|
"""
|
||||||
使用提供的 begin 和 end 模式运行提取过程。
|
使用提供的 begin_pattern 运行提取过程。
|
||||||
如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。
|
根据 extraction_stage 判断是“第一次提取”还是“第三次提取”,
|
||||||
|
从而选择对应的默认 end_pattern(动态生成)。
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
|
begin_pattern (regex.Pattern): 用于识别起始的正则表达式模式。
|
||||||
local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。
|
extraction_stage (str): 提取阶段标记,如 'first' 表示第一次,'third' 表示第三次。
|
||||||
|
|
||||||
返回:
|
返回:
|
||||||
tuple: (start_page, mid_page, end_page)
|
tuple: (start_page, mid_page, end_page)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# ===== 函数内部变量 =====
|
||||||
start_page = None
|
start_page = None
|
||||||
mid_page = None
|
mid_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
combined_mid_pattern = None # 中间页的组合模式
|
combined_mid_pattern = None # 中间页的组合模式
|
||||||
|
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
||||||
|
|
||||||
|
# ===== 遍历每一页,执行提取逻辑 =====
|
||||||
for i, page in enumerate(pdf_document.pages):
|
for i, page in enumerate(pdf_document.pages):
|
||||||
text = page.extract_text() or ""
|
text = page.extract_text() or ""
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
@ -324,90 +402,41 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
|||||||
# 如果已经找到中间页,且当前页匹配排除模式,则跳过
|
# 如果已经找到中间页,且当前页匹配排除模式,则跳过
|
||||||
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||||
continue
|
continue
|
||||||
if begin_page==0 and catalog_pattern.search(cleaned_text):
|
|
||||||
continue # 如果存在目录,跳过当前页面
|
|
||||||
|
|
||||||
# 识别起始页
|
# 如果为第 0 页之后的目录,直接跳过
|
||||||
|
if start_page == 0 and catalog_pattern.search(cleaned_text):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# ========== 识别起始页 ==========
|
||||||
if start_page is None:
|
if start_page is None:
|
||||||
match = regex.search(local_begin_pattern, cleaned_text)
|
match = regex.search(begin_pattern, cleaned_text)
|
||||||
if match and i > begin_page:
|
if match and (start_page is None or i > start_page):
|
||||||
start_page = i
|
start_page = i
|
||||||
matched_text:str = match.group(0) # 获取整个匹配的文本
|
matched_text = match.group(0) # 获取整个匹配的文本
|
||||||
# 如果未提供固定的 end_pattern,则根据匹配的章节类型动态生成
|
# 根据匹配到的内容识别是“章”还是“部分”,用于后续组合 mid_pattern
|
||||||
if not local_end_pattern:
|
if '章' in matched_text:
|
||||||
if '章' in matched_text:
|
chapter_type = '章'
|
||||||
chapter_type = '章'
|
elif '部分' in matched_text:
|
||||||
elif '部分' in matched_text:
|
chapter_type = '部分'
|
||||||
chapter_type = '部分'
|
|
||||||
else:
|
|
||||||
chapter_type = None # 未匹配到“章”或“部分”
|
|
||||||
|
|
||||||
if chapter_type:
|
|
||||||
# 根据 chapter_type 动态生成 end_pattern
|
|
||||||
end_pattern_dynamic = regex.compile(
|
|
||||||
rf'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
|
|
||||||
regex.MULTILINE
|
|
||||||
)
|
|
||||||
|
|
||||||
# 根据 chapter_type 动态生成 additional_mid_pattern
|
|
||||||
if chapter_type == '章':
|
|
||||||
additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
|
|
||||||
elif chapter_type == '部分':
|
|
||||||
additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
|
|
||||||
else:
|
|
||||||
additional_mid_pattern = ''
|
|
||||||
|
|
||||||
# 定义基础的 mid_pattern
|
|
||||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
|
||||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
|
|
||||||
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
|
|
||||||
|
|
||||||
# 合并基础模式和额外模式
|
|
||||||
if additional_mid_pattern:
|
|
||||||
combined_mid_pattern = regex.compile(
|
|
||||||
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
|
|
||||||
regex.MULTILINE
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
combined_mid_pattern = regex.compile(
|
|
||||||
rf'{base_mid_pattern}',
|
|
||||||
regex.MULTILINE
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
|
|
||||||
end_pattern_dynamic = regex.compile(
|
|
||||||
r'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
|
|
||||||
regex.MULTILINE
|
|
||||||
)
|
|
||||||
|
|
||||||
# 定义基础的 mid_pattern
|
|
||||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
|
||||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
|
|
||||||
combined_mid_pattern = regex.compile(
|
|
||||||
rf'{base_mid_pattern}',
|
|
||||||
regex.MULTILINE
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# 如果提供了固定的 end_pattern,则使用默认的 mid_pattern
|
chapter_type = None # 未匹配到“章”或“部分”
|
||||||
base_mid_pattern = r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$' # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
|
|
||||||
combined_mid_pattern = regex.compile(
|
# ===== 动态生成 end_pattern_dynamic =====
|
||||||
rf'{base_mid_pattern}',
|
end_pattern_dynamic = generate_end_pattern(extraction_stage, chapter_type)
|
||||||
regex.MULTILINE
|
# ========== 动态生成 mid_pattern ==========
|
||||||
)
|
combined_mid_pattern = generate_mid_pattern(chapter_type)
|
||||||
|
# 找到 start_page 后,继续循环,进入下一页
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 识别中间页
|
# ========== 识别中间页 ==========
|
||||||
if start_page is not None and mid_page is None and combined_mid_pattern:
|
if start_page is not None and mid_page is None and combined_mid_pattern:
|
||||||
# if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
|
|
||||||
# continue
|
|
||||||
if regex.search(combined_mid_pattern, cleaned_text):
|
if regex.search(combined_mid_pattern, cleaned_text):
|
||||||
mid_page = i
|
mid_page = i
|
||||||
|
|
||||||
# 识别结束页
|
# ========== 识别结束页 ==========
|
||||||
if start_page is not None and mid_page is not None:
|
if start_page is not None and mid_page is not None:
|
||||||
# 使用提供的 end_pattern 或动态生成的 end_pattern
|
# 当前使用的 end_pattern(上面根据 extraction_stage 和 chapter_type 选择好了)
|
||||||
current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
|
if regex.search(end_pattern_dynamic, cleaned_text):
|
||||||
if regex.search(current_end_pattern, cleaned_text):
|
|
||||||
if i > mid_page:
|
if i > mid_page:
|
||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
@ -423,7 +452,12 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
|||||||
return path1, path2
|
return path1, path2
|
||||||
|
|
||||||
# 第一次提取尝试,使用初始的 begin_pattern
|
# 第一次提取尝试,使用初始的 begin_pattern
|
||||||
start_page, mid_page, end_page = run_extraction(begin_pattern)
|
begin_pattern = regex.compile(
|
||||||
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
|
||||||
|
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
|
||||||
|
regex.MULTILINE
|
||||||
|
)
|
||||||
|
start_page, mid_page, end_page = run_extraction(begin_pattern, extraction_stage='first')
|
||||||
# 如果第一次提取成功,保存并返回路径
|
# 如果第一次提取成功,保存并返回路径
|
||||||
if start_page is not None and mid_page is not None and end_page is not None:
|
if start_page is not None and mid_page is not None and end_page is not None:
|
||||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||||
@ -435,19 +469,13 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
|||||||
if start_page is not None and mid_page is not None and end_page is not None:
|
if start_page is not None and mid_page is not None and end_page is not None:
|
||||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||||
|
|
||||||
# 如果第二次提取失败,定义新的 begin_pattern 和 end_pattern 进行第三次尝试
|
print("第二次提取tobidders_notice失败,尝试第三次提取!")
|
||||||
new_begin_pattern = regex.compile(
|
new_begin_pattern = regex.compile(
|
||||||
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
|
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
|
||||||
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
|
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
new_end_pattern = regex.compile(
|
start_page, mid_page, end_page = run_extraction(new_begin_pattern, extraction_stage='third')
|
||||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
|
||||||
regex.MULTILINE
|
|
||||||
)
|
|
||||||
print("第二次提取tobidders_notice失败,尝试第三次提取!")
|
|
||||||
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
|
|
||||||
|
|
||||||
if start_page is not None and mid_page is not None and end_page is not None:
|
if start_page is not None and mid_page is not None and end_page is not None:
|
||||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||||
else:
|
else:
|
||||||
|
@ -88,14 +88,14 @@ def build_response(judge_result: JudgeResult, logger) -> Any:
|
|||||||
data=''
|
data=''
|
||||||
)
|
)
|
||||||
elif judge_result == JudgeResult.YES:
|
elif judge_result == JudgeResult.YES:
|
||||||
logger.error("判断是否为招标文件成功!YES")
|
logger.info("判断是否为招标文件成功!YES")
|
||||||
return create_response_normal(
|
return create_response_normal(
|
||||||
message='判断是否为招标文件成功!',
|
message='判断是否为招标文件成功!',
|
||||||
status='success',
|
status='success',
|
||||||
data='yes'
|
data='yes'
|
||||||
)
|
)
|
||||||
elif judge_result == JudgeResult.NO:
|
elif judge_result == JudgeResult.NO:
|
||||||
logger.error("判断是否为招标文件成功!NO")
|
logger.info("判断是否为招标文件成功!NO")
|
||||||
return create_response_normal(
|
return create_response_normal(
|
||||||
message='判断是否为招标文件成功!',
|
message='判断是否为招标文件成功!',
|
||||||
status='success',
|
status='success',
|
||||||
|
@ -248,13 +248,8 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
|||||||
)
|
)
|
||||||
local_output_suffix = "qualification1"
|
local_output_suffix = "qualification1"
|
||||||
elif selection == 4:
|
elif selection == 4:
|
||||||
begin_pattern = regex.compile(
|
path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header)
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
|
return [path1 or "", path2 or ""]
|
||||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
|
|
||||||
regex.MULTILINE
|
|
||||||
)
|
|
||||||
end_pattern = None
|
|
||||||
local_output_suffix = "tobidders_notice"
|
|
||||||
elif selection == 5:
|
elif selection == 5:
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
|
||||||
@ -266,12 +261,8 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
|||||||
)
|
)
|
||||||
local_output_suffix = "procurement"
|
local_output_suffix = "procurement"
|
||||||
elif selection == 6:
|
elif selection == 6:
|
||||||
begin_page = last_begin_index
|
|
||||||
invalid_path = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
|
invalid_path = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
|
||||||
if invalid_path:
|
return [invalid_path or ""]
|
||||||
return [invalid_path]
|
|
||||||
else:
|
|
||||||
return [""]
|
|
||||||
else:
|
else:
|
||||||
print("无效的选择:请选择1-6")
|
print("无效的选择:请选择1-6")
|
||||||
return ['']
|
return ['']
|
||||||
@ -279,11 +270,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
|||||||
# 如果传入的 output_suffix 是 'default',则使用本地生成的 output_suffix
|
# 如果传入的 output_suffix 是 'default',则使用本地生成的 output_suffix
|
||||||
if output_suffix == "default":
|
if output_suffix == "default":
|
||||||
output_suffix = local_output_suffix
|
output_suffix = local_output_suffix
|
||||||
|
result = extract_pages(
|
||||||
if selection==4:
|
|
||||||
result = extract_pages_tobidders_notice(pdf_path,output_folder,begin_pattern,begin_page,common_header)
|
|
||||||
else:
|
|
||||||
result = extract_pages(
|
|
||||||
pdf_path,
|
pdf_path,
|
||||||
output_folder,
|
output_folder,
|
||||||
begin_pattern,
|
begin_pattern,
|
||||||
@ -294,20 +281,12 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
|||||||
)
|
)
|
||||||
# 根据提取结果以及不同的 output_suffix 进行处理
|
# 根据提取结果以及不同的 output_suffix 进行处理
|
||||||
if result:
|
if result:
|
||||||
if output_suffix == "tobidders_notice":
|
if output_suffix == "qualification1":
|
||||||
# 确保返回的是元组,并将其中的 None 转换为 ""
|
|
||||||
if isinstance(result, tuple) and len(result) == 2:
|
|
||||||
path1, path2 = result
|
|
||||||
return [path1 or "", path2 or ""]
|
|
||||||
else:
|
|
||||||
logger.error("Unexpected result format for 'tobidders_notice'")
|
|
||||||
return ["",""]
|
|
||||||
elif output_suffix == "qualification1":
|
|
||||||
merge_and_cleanup(result, "qualification3")
|
merge_and_cleanup(result, "qualification3")
|
||||||
return [result or ""]
|
return [result or ""]
|
||||||
else:
|
else:
|
||||||
return [result or ""]
|
return [result or ""]
|
||||||
return [""] * (2 if selection == 4 else 1)
|
return [""]
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in processing file '{input_path}': {e}")
|
logger.error(f"Error in processing file '{input_path}': {e}")
|
||||||
@ -347,10 +326,10 @@ if __name__ == "__main__":
|
|||||||
logger = get_global_logger("123")
|
logger = get_global_logger("123")
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||||
pdf_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf"
|
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles"
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||||
output_folder = r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp"
|
output_folder = r"C:\Users\Administrator\Desktop\货物标\output4_2"
|
||||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||||
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user