From 4617595c82f216fc8bb98c705abc0632363d1046 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Fri, 10 Jan 2025 17:38:55 +0800 Subject: [PATCH] 1.10 --- flask_app/general/截取pdf通用函数.py | 77 ++++++++++++----------- flask_app/general/读取文件/按页读取pdf.py | 2 +- flask_app/routes/judge_zbfile.py | 1 + flask_app/routes/判断是否是招标文件.py | 2 +- flask_app/routes/货物标解析main.py | 4 +- flask_app/工程标/截取pdf工程标版.py | 4 +- flask_app/货物标/截取pdf货物标版.py | 64 +++++++------------ flask_app/货物标/技术参数要求提取.py | 2 +- 8 files changed, 72 insertions(+), 84 deletions(-) diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 4e4aa5b..66f2809 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -161,7 +161,7 @@ def get_invalid_file(file_path, output_folder, common_header,begin_page): # 定义排除模式 exclusion_pattern = regex.compile( - r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制', + r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制', regex.MULTILINE ) @@ -294,29 +294,16 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, break return start_page, end_page -def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern): +def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header): """ 从PDF文档中提取起始页、中间页和结束页。 - 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取。 - - 参数: - pdf_document (PDFDocument): 要处理的PDF文档对象。 - begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 - begin_page (int): 开始搜索的页码。 - common_header (str): 每页需要清理的公共头部文本。 - exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。 - - 返回: - tuple: (start_page, mid_page, end_page) 如果成功,否则 (None, None, None) """ - + exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制') def run_extraction(local_begin_pattern, local_end_pattern=None): """ 使用提供的 begin 和 end 模式运行提取过程。 - 如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。 - 参数: local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。 local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。 @@ -427,32 +414,46 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h return start_page, mid_page, end_page + def perform_extraction_and_save(start_page, mid_page, end_page): + path1 = save_extracted_pages(pdf_path,output_folder,start_page,mid_page,"tobidders_notice_part1",common_header) + if mid_page != end_page: + path2 = save_extracted_pages(pdf_path,output_folder,mid_page,end_page,"tobidders_notice_part2",common_header) + else: + path2 = path1 + return path1, path2 + # 第一次提取尝试,使用初始的 begin_pattern start_page, mid_page, end_page = run_extraction(begin_pattern) + # 如果第一次提取成功,保存并返回路径 + if start_page is not None and mid_page is not None and end_page is not None: + return perform_extraction_and_save(start_page, mid_page, end_page) - # 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取 - if not (start_page and mid_page and end_page): - print(f"第二次尝试 tobidders_notice!") - pdf_document = PdfReader(pdf_path) - start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page) - if start_page and end_page and mid_page: - return start_page, mid_page, end_page - else: - # 定义新的 begin_pattern 和 end_pattern - new_begin_pattern = regex.compile( - r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|' - r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表', - regex.MULTILINE - ) - new_end_pattern = regex.compile( - r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', - regex.MULTILINE - ) - print("第三次尝试 tobidders_notice! ") - # 第二次提取尝试,使用新的模式 - start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern) + print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}") + pdf_document = PdfReader(pdf_path) + start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page) - return start_page, mid_page, end_page + if start_page is not None and mid_page is not None and end_page is not None: + return perform_extraction_and_save(start_page, mid_page, end_page) + + # 如果第二次提取失败,定义新的 begin_pattern 和 end_pattern 进行第三次尝试 + new_begin_pattern = regex.compile( + r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|' + r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表', + regex.MULTILINE + ) + new_end_pattern = regex.compile( + r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', + regex.MULTILINE + ) + print("第二次提取tobidders_notice失败,尝试第三次提取!") + start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern) + + if start_page is not None and mid_page is not None and end_page is not None: + return perform_extraction_and_save(start_page, mid_page, end_page) + else: + # 所有提取尝试均失败 + print(f"三次提取tobidders_notice均失败!! {pdf_path}") + return "", "" def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page): diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 1378c4b..b957ae5 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -118,7 +118,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path): if __name__ == '__main__': # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' - file_path=r"C:\Users\Administrator\Downloads\2024-福建-2024年度防汛抢险物资储备.pdf" + file_path=r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf' # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" diff --git a/flask_app/routes/judge_zbfile.py b/flask_app/routes/judge_zbfile.py index 6086445..0d4f3a9 100644 --- a/flask_app/routes/judge_zbfile.py +++ b/flask_app/routes/judge_zbfile.py @@ -34,6 +34,7 @@ def judge_zbfile() -> Any: try: start_time = time.time() downloaded_filename = os.path.join(output_folder, "ztbfile") + logger.info(f"接收到的url:{file_url}") downloaded_filepath, file_type = download_file(file_url, downloaded_filename) if not downloaded_filepath or file_type == 4: diff --git a/flask_app/routes/判断是否是招标文件.py b/flask_app/routes/判断是否是招标文件.py index b8e3494..88977db 100644 --- a/flask_app/routes/判断是否是招标文件.py +++ b/flask_app/routes/判断是否是招标文件.py @@ -44,7 +44,7 @@ def judge_zbfile_exec(file_path): if __name__ == '__main__': start_time = time.time() - pdf_path = r"C:/Users/Administrator/Downloads/094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件 - 副本.pdf" + pdf_path = r"C:\Users\Administrator\Downloads\商务标null1736476867734.docx" res = judge_zbfile_exec(pdf_path) if res: print("yes") diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index d059110..7ffe836 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -283,8 +283,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): # 2.废标项这边,考虑大模型+正则并用 # 废标项,增加对表格的提取+排除重复项,按顺序处理 # 考虑将工程标和货物标的 投标人须知那块逻辑结合 -# C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472 这里的投标文件要求有点问题 +# D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3排查一下 # 解决禅道 测试的bug +# 截取:对应 按照 根据 +# 评分点 if __name__ == "__main__": # 配置日志器 diff --git a/flask_app/工程标/截取pdf工程标版.py b/flask_app/工程标/截取pdf工程标版.py index ab4771c..b12c904 100644 --- a/flask_app/工程标/截取pdf工程标版.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -13,7 +13,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin is_secondary_match): pdf_document = PdfReader(pdf_path) exclusion_pattern = regex.compile( - r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制') + r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制') def run_extraction(): start_page = None mid_page = None @@ -134,7 +134,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter end_page = None flag = True exclusion_pattern = regex.compile( - r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制') + r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制') # 遍历文档的每一页,查找开始和结束短语的位置 for i in range(len(pdf_document.pages)): page = pdf_document.pages[i] diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index ee09f4b..001f1db 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -12,36 +12,18 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter common_header = extract_common_header(pdf_path) pdf_document = PdfReader(pdf_path) exclusion_pattern = None - - if output_suffix == "tobidders_notice": + # 原有的处理逻辑保持不变 + if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method": exclusion_pattern = regex.compile( r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制') - start_page, mid_page, end_page = extract_pages_tobidders_notice( - pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern - ) - if not start_page or not mid_page or not end_page: - print(f"三次提取tobidders_notice均失败!!{pdf_path}") - return "", "" - path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header) - if mid_page != end_page: - path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header) - else: - path2=path1 - return path1, path2 - - else: - # 原有的处理逻辑保持不变 - if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method": - exclusion_pattern = regex.compile( - r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制') - start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, + start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix) - if start_page is None or end_page is None: - print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") - return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger) - elif output_suffix == "qualification1": - truncate_pdf_main_goods(pdf_path, output_folder, 2, logger,"qualification3") # 合并'资格审查'章节和'评标办法'章节 - return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header) + if start_page is None or end_page is None: + print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") + return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger) + elif output_suffix == "qualification1": + truncate_pdf_main_goods(pdf_path, output_folder, 2, logger,"qualification3") # 合并'资格审查'章节和'评标办法'章节 + return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header) except Exception as e: print(f"Error processing {pdf_path}: {e}") return "" @@ -163,7 +145,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header): def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger): try: exclusion_pattern = regex.compile( - r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制') + r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制') pdf_document = PdfReader(pdf_path) patterns = None start_page = None @@ -298,16 +280,18 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_ if output_suffix == "default": output_suffix = local_output_suffix - # 将原先的 process_files 逻辑合并到此处 - result = extract_pages( - pdf_path, - output_folder, - begin_pattern, - begin_page, - end_pattern, - output_suffix, - logger - ) + if selection==4: + result = extract_pages_tobidders_notice(pdf_path,output_folder,begin_pattern,begin_page,common_header) + else: + result = extract_pages( + pdf_path, + output_folder, + begin_pattern, + begin_page, + end_pattern, + output_suffix, + logger + ) # 根据提取结果以及不同的 output_suffix 进行处理 if result: if output_suffix == "tobidders_notice": @@ -363,10 +347,10 @@ if __name__ == "__main__": logger = get_global_logger("123") # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" - pdf_path=r"C:\Users\Administrator\Downloads\招标文件 (4).pdf" + pdf_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" - output_folder = r"D:\flask_project\flask_app\static\output\output1\test" + output_folder = r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index e6f35a6..b08c88c 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -297,7 +297,7 @@ def generate_prompt(judge_res, full_text=None): **输出格式**: 1.JSON格式,外层键名为需要采购的货物、设备或系统名称(一级键)。 2.层次关系用嵌套键值对表示: - -采购活动可能将目标划分为多个系统或货物。若文档通过大标题或表格层次或序号标明这种归属关系,请在JSON中以嵌套形式表示: + -采购活动可能将目标划分为多个系统或货物。若文档通过大标题或表格结构或序号标明这种归属关系,请在JSON中以嵌套形式表示: -系统作为一级键,包含的货物作为二级键。 -例子:""" 1. 激光扫描系统:xxx