diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py index 11442a7..3bb086c 100644 --- a/flask_app/general/截取pdf_main.py +++ b/flask_app/general/截取pdf_main.py @@ -119,10 +119,10 @@ if __name__ == "__main__": # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" - pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\“平安城市”三期暨雪亮工程一体化 建设项目.pdf" + pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\file1736998876340 (1).doc" # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" - output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)" + output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp" # selections = [1, 4] # 仅处理 selection 4、1 # selections = [1, 2, 3, 5] # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering diff --git a/flask_app/general/投标人须知正文提取指定内容.py b/flask_app/general/投标人须知正文提取指定内容.py index d2a875a..07b478f 100644 --- a/flask_app/general/投标人须知正文提取指定内容.py +++ b/flask_app/general/投标人须知正文提取指定内容.py @@ -406,9 +406,9 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type): if __name__ == "__main__": # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json' merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf" - clause_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp\clause1.json" + clause_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\clause1.json" try: - res = extract_from_notice(merged_baseinfo_path,clause_path, 2) # 可以改变此处的 type 参数测试不同的场景 + res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景 res2 = json.dumps(res, ensure_ascii=False, indent=4) print(res2) except ValueError as e: diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py index 58028e8..df241a6 100644 --- a/flask_app/general/投标人须知正文条款提取成json文件.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -481,7 +481,7 @@ def convert_clause_to_json(file_path, output_folder, type=1): if __name__ == "__main__": - file_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444\招标文件111_tobidders_notice_part2.pdf' + file_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf' # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf' # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf' diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index 1805ac2..81b59f8 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -344,10 +344,12 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword continue # 跳过空白行,进入下一个循环 if not found_next_number: # 修改后的正则,支持 '数字 、' 格式 - number_pattern=(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)' #A1 1.1 abc.123 - r'|([((]\s*[一二三四五六七八九十\d]+\s*[))])' #(1) (2) - r'|([一二三四五六七八九十\d]+\s*、)') #1、 2、 - next_section_number = re.match(number_pattern,next_text) + number_pattern = ( + r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|' # 第一部分:匹配 A1, 1.1, abc.123 等 + r'([((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))])|' # 第二部分:匹配 (一), (12) 等,其中中文数字1到2位或任意位数字 + r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)' # 第三部分:匹配 一、12、等,其中中文数字1到2位或任意位数字 + ) + next_section_number = re.match(number_pattern,next_text) #re.match总是从字符串开头匹配,即使正则模式内部没有 ^ 锚点。 if next_section_number: found_next_number = True if next_section_number.group(1): @@ -355,9 +357,9 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword dynamic_pattern = r'^' + r'[..]'.join( [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' elif next_section_number.group(2): - dynamic_pattern = r'^[\(\(]\s*[一二三四五六七八九十\d]+\s*[\)\)]' + dynamic_pattern = r'^[\(\(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\)]' elif next_section_number.group(3): - dynamic_pattern = r'^[一二三四五六七八九十\d]+\s*、' + dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、' current_section_pattern = re.compile(dynamic_pattern) if current_section_pattern and re.match(current_section_pattern, next_text): extracted_paragraphs[active_key].append(next_text) @@ -394,12 +396,12 @@ def split_cell_text(text): # 2. 分割句子,保证句子完整性(按标点符号和序号分割) split_sentences = regex.split( r'(?<=[。!?!?\?])|' # 在中文句号、感叹号、问号后分割 - r'(?\d+(?:[..]\d+)+)(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万台份家]))|' # 匹配多级编号,(?=(?>\d+(?:[..]\d+)+)让多级编号匹配“一口气”完成;限制后面不能是指定关键字 r'(?