This commit is contained in:
zy123 2025-01-10 17:38:55 +08:00
parent 06a3c2bbaf
commit 4617595c82
8 changed files with 72 additions and 84 deletions

View File

@ -161,7 +161,7 @@ def get_invalid_file(file_path, output_folder, common_header,begin_page):
# 定义排除模式 # 定义排除模式
exclusion_pattern = regex.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制', r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制',
regex.MULTILINE regex.MULTILINE
) )
@ -294,29 +294,16 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
break break
return start_page, end_page return start_page, end_page
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern): def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header):
""" """
从PDF文档中提取起始页中间页和结束页 从PDF文档中提取起始页中间页和结束页
如果第一次提取失败则使用新的 begin_pattern end_pattern 重新提取 如果第一次提取失败则使用新的 begin_pattern end_pattern 重新提取
参数:
pdf_document (PDFDocument): 要处理的PDF文档对象
begin_pattern (str regex.Pattern): 用于识别起始的正则表达式模式
begin_page (int): 开始搜索的页码
common_header (str): 每页需要清理的公共头部文本
exclusion_pattern (str regex.Pattern): 用于排除某些页的模式
返回:
tuple: (start_page, mid_page, end_page) 如果成功否则 (None, None, None)
""" """
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
def run_extraction(local_begin_pattern, local_end_pattern=None): def run_extraction(local_begin_pattern, local_end_pattern=None):
""" """
使用提供的 begin end 模式运行提取过程 使用提供的 begin end 模式运行提取过程
如果未提供 local_end_pattern则根据匹配的章节类型动态生成 end_pattern 如果未提供 local_end_pattern则根据匹配的章节类型动态生成 end_pattern
参数: 参数:
local_begin_pattern (str regex.Pattern): 用于识别起始的正则表达式模式 local_begin_pattern (str regex.Pattern): 用于识别起始的正则表达式模式
local_end_pattern (str regex.Pattern, 可选): 用于识别结束的正则表达式模式 local_end_pattern (str regex.Pattern, 可选): 用于识别结束的正则表达式模式
@ -427,32 +414,46 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
return start_page, mid_page, end_page return start_page, mid_page, end_page
def perform_extraction_and_save(start_page, mid_page, end_page):
    """Write the bidder-notice section out as up to two PDFs and return both paths.

    The first file is produced from (start_page, mid_page) with the suffix
    "tobidders_notice_part1"; the second from (mid_page, end_page) with the
    suffix "tobidders_notice_part2". When mid_page equals end_page no second
    file is written and the first path is returned twice.

    pdf_path, output_folder and common_header are taken from the enclosing
    function's scope.

    Returns:
        tuple: (part1_path, part2_path) as returned by save_extracted_pages.
    """
    first_part = save_extracted_pages(
        pdf_path,
        output_folder,
        start_page,
        mid_page,
        "tobidders_notice_part1",
        common_header,
    )
    if mid_page == end_page:
        # No separate second range exists, so part 2 reuses the part 1 path.
        return first_part, first_part
    second_part = save_extracted_pages(
        pdf_path,
        output_folder,
        mid_page,
        end_page,
        "tobidders_notice_part2",
        common_header,
    )
    return first_part, second_part
# 第一次提取尝试,使用初始的 begin_pattern # 第一次提取尝试,使用初始的 begin_pattern
start_page, mid_page, end_page = run_extraction(begin_pattern) start_page, mid_page, end_page = run_extraction(begin_pattern)
# 如果第一次提取成功,保存并返回路径
if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取 print(f"第一次提取tobidders_notice失败尝试第二次提取: {pdf_path}")
if not (start_page and mid_page and end_page): pdf_document = PdfReader(pdf_path)
print(f"第二次尝试 tobidders_notice!") start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
pdf_document = PdfReader(pdf_path)
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
if start_page and end_page and mid_page:
return start_page, mid_page, end_page
else:
# 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
regex.MULTILINE
)
new_end_pattern = regex.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
regex.MULTILINE
)
print("第三次尝试 tobidders_notice! ")
# 第二次提取尝试,使用新的模式
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
return start_page, mid_page, end_page if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
# 如果第二次提取失败,定义新的 begin_pattern 和 end_pattern 进行第三次尝试
new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
regex.MULTILINE
)
new_end_pattern = regex.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
regex.MULTILINE
)
print("第二次提取tobidders_notice失败尝试第三次提取!")
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
else:
# 所有提取尝试均失败
print(f"三次提取tobidders_notice均失败!! {pdf_path}")
return "", ""
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page): def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):

View File

@ -118,7 +118,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
if __name__ == '__main__': if __name__ == '__main__':
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
file_path=r"C:\Users\Administrator\Downloads\2024-福建-2024年度防汛抢险物资储备.pdf" file_path=r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf'
# file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"

View File

@ -34,6 +34,7 @@ def judge_zbfile() -> Any:
try: try:
start_time = time.time() start_time = time.time()
downloaded_filename = os.path.join(output_folder, "ztbfile") downloaded_filename = os.path.join(output_folder, "ztbfile")
logger.info(f"接收到的url:{file_url}")
downloaded_filepath, file_type = download_file(file_url, downloaded_filename) downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
if not downloaded_filepath or file_type == 4: if not downloaded_filepath or file_type == 4:

View File

@ -44,7 +44,7 @@ def judge_zbfile_exec(file_path):
if __name__ == '__main__': if __name__ == '__main__':
start_time = time.time() start_time = time.time()
pdf_path = r"C:/Users/Administrator/Downloads/094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件 - 副本.pdf" pdf_path = r"C:\Users\Administrator\Downloads\商务标null1736476867734.docx"
res = judge_zbfile_exec(pdf_path) res = judge_zbfile_exec(pdf_path)
if res: if res:
print("yes") print("yes")

View File

@ -283,8 +283,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
# 2.废标项这边,考虑大模型+正则并用 # 2.废标项这边,考虑大模型+正则并用
# 废标项,增加对表格的提取+排除重复项,按顺序处理 # 废标项,增加对表格的提取+排除重复项,按顺序处理
# 考虑将工程标和货物标的 投标人须知那块逻辑结合 # 考虑将工程标和货物标的 投标人须知那块逻辑结合
# C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472 这里的投标文件要求有点问题 # D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3排查一下
# 解决禅道 测试的bug # 解决禅道 测试的bug
# 截取:对应 按照 根据
# 评分点
if __name__ == "__main__": if __name__ == "__main__":
# 配置日志器 # 配置日志器

View File

@ -13,7 +13,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
is_secondary_match): is_secondary_match):
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
exclusion_pattern = regex.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制') r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
def run_extraction(): def run_extraction():
start_page = None start_page = None
mid_page = None mid_page = None
@ -134,7 +134,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
end_page = None end_page = None
flag = True flag = True
exclusion_pattern = regex.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制') r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
# 遍历文档的每一页,查找开始和结束短语的位置 # 遍历文档的每一页,查找开始和结束短语的位置
for i in range(len(pdf_document.pages)): for i in range(len(pdf_document.pages)):
page = pdf_document.pages[i] page = pdf_document.pages[i]

View File

@ -12,36 +12,18 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
common_header = extract_common_header(pdf_path) common_header = extract_common_header(pdf_path)
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
exclusion_pattern = None exclusion_pattern = None
# 原有的处理逻辑保持不变
if output_suffix == "tobidders_notice": if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
exclusion_pattern = regex.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制') r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
start_page, mid_page, end_page = extract_pages_tobidders_notice( start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
)
if not start_page or not mid_page or not end_page:
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
return "", ""
path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header)
if mid_page != end_page:
path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header)
else:
path2=path1
return path1, path2
else:
# 原有的处理逻辑保持不变
if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
common_header, exclusion_pattern, output_suffix) common_header, exclusion_pattern, output_suffix)
if start_page is None or end_page is None: if start_page is None or end_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger) return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger)
elif output_suffix == "qualification1": elif output_suffix == "qualification1":
truncate_pdf_main_goods(pdf_path, output_folder, 2, logger,"qualification3") # 合并'资格审查'章节和'评标办法'章节 truncate_pdf_main_goods(pdf_path, output_folder, 2, logger,"qualification3") # 合并'资格审查'章节和'评标办法'章节
return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header) return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header)
except Exception as e: except Exception as e:
print(f"Error processing {pdf_path}: {e}") print(f"Error processing {pdf_path}: {e}")
return "" return ""
@ -163,7 +145,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger): def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
try: try:
exclusion_pattern = regex.compile( exclusion_pattern = regex.compile(
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制') r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
patterns = None patterns = None
start_page = None start_page = None
@ -298,16 +280,18 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
if output_suffix == "default": if output_suffix == "default":
output_suffix = local_output_suffix output_suffix = local_output_suffix
# 将原先的 process_files 逻辑合并到此处 if selection==4:
result = extract_pages( result = extract_pages_tobidders_notice(pdf_path,output_folder,begin_pattern,begin_page,common_header)
pdf_path, else:
output_folder, result = extract_pages(
begin_pattern, pdf_path,
begin_page, output_folder,
end_pattern, begin_pattern,
output_suffix, begin_page,
logger end_pattern,
) output_suffix,
logger
)
# 根据提取结果以及不同的 output_suffix 进行处理 # 根据提取结果以及不同的 output_suffix 进行处理
if result: if result:
if output_suffix == "tobidders_notice": if output_suffix == "tobidders_notice":
@ -363,10 +347,10 @@ if __name__ == "__main__":
logger = get_global_logger("123") logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
pdf_path=r"C:\Users\Administrator\Downloads\招标文件 (4).pdf" pdf_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
output_folder = r"D:\flask_project\flask_app\static\output\output1\test" output_folder = r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)

View File

@ -297,7 +297,7 @@ def generate_prompt(judge_res, full_text=None):
**输出格式** **输出格式**
1.JSON格式外层键名为需要采购的货物设备或系统名称(一级键) 1.JSON格式外层键名为需要采购的货物设备或系统名称(一级键)
2.层次关系用嵌套键值对表示: 2.层次关系用嵌套键值对表示:
-采购活动可能将目标划分为多个系统或货物若文档通过大标题或表格层次或序号标明这种归属关系请在JSON中以嵌套形式表示 -采购活动可能将目标划分为多个系统或货物若文档通过大标题或表格结构或序号标明这种归属关系请在JSON中以嵌套形式表示
-系统作为一级键包含的货物作为二级键 -系统作为一级键包含的货物作为二级键
-例子:""" -例子:"""
1. 激光扫描系统xxx 1. 激光扫描系统xxx