diff --git a/flask_app/general/insert_del_pagemark.py b/flask_app/general/insert_del_pagemark.py index de3f645..061d47c 100644 --- a/flask_app/general/insert_del_pagemark.py +++ b/flask_app/general/insert_del_pagemark.py @@ -95,11 +95,11 @@ def delete_mark(docx_path): if __name__ == '__main__': # input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf' - # input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf' - # output=insert_mark(input) - doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx' - res=delete_mark(doc_path) - if res: - print(res) - else: - print("No") \ No newline at end of file + input=r'C:\Users\Administrator\Downloads\国网湖北电力荆州供电公司2024年第四次服务授权竞争性谈判采购-采购文件(15DJ04).pdf' + output=insert_mark(input) + # doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx' + # res=delete_mark(doc_path) + # if res: + # print(res) + # else: + # print("No") \ No newline at end of file diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 66f2809..443de7c 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -294,29 +294,107 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, break return start_page, end_page -def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header): + +def generate_end_pattern(extraction_stage, chapter_type=None): + """ + 根据提取阶段和章节类型动态生成 end_pattern。 + + 参数: + extraction_stage (str): 提取阶段标记,如 'first' 表示第一次,'third' 表示第三次。 + chapter_type (str, optional): 章节类型,如 '章' 或 '部分'。 + + 返回: + regex.Pattern: 编译后的正则表达式模式。 + """ + # 定义 end_pattern 模板 + end_pattern_template_first = ( + r'^(? begin_page: + match = regex.search(begin_pattern, cleaned_text) + if match and (start_page is None or i > start_page): start_page = i - matched_text:str = match.group(0) # 获取整个匹配的文本 - # 如果未提供固定的 end_pattern,则根据匹配的章节类型动态生成 - if not local_end_pattern: - if '章' in matched_text: - chapter_type = '章' - elif '部分' in matched_text: - chapter_type = '部分' - else: - chapter_type = None # 未匹配到“章”或“部分” - - if chapter_type: - # 根据 chapter_type 动态生成 end_pattern - end_pattern_dynamic = regex.compile( - rf'^(? mid_page: end_page = i break @@ -423,7 +452,12 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin return path1, path2 # 第一次提取尝试,使用初始的 begin_pattern - start_page, mid_page, end_page = run_extraction(begin_pattern) + begin_pattern = regex.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|' + r'(? Any: data='' ) elif judge_result == JudgeResult.YES: - logger.error("判断是否为招标文件成功!YES") + logger.info("判断是否为招标文件成功!YES") return create_response_normal( message='判断是否为招标文件成功!', status='success', data='yes' ) elif judge_result == JudgeResult.NO: - logger.error("判断是否为招标文件成功!NO") + logger.info("判断是否为招标文件成功!NO") return create_response_normal( message='判断是否为招标文件成功!', status='success', diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 001f1db..a75541c 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -248,13 +248,8 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_ ) local_output_suffix = "qualification1" elif selection == 4: - begin_pattern = regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|' - r'(?