diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index 476a824..1805ac2 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -344,8 +344,10 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword continue # 跳过空白行,进入下一个循环 if not found_next_number: # 修改后的正则,支持 '数字 、' 格式 - next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)', - next_text) + number_pattern=(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)' #A1 1.1 abc.123 + r'|([((]\s*[一二三四五六七八九十\d]+\s*[))])' #(1) (2) + r'|([一二三四五六七八九十\d]+\s*、)') #1、 2、 + next_section_number = re.match(number_pattern,next_text) if next_section_number: found_next_number = True if next_section_number.group(1): @@ -353,9 +355,9 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword dynamic_pattern = r'^' + r'[..]'.join( [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' elif next_section_number.group(2): - dynamic_pattern = r'^[\(\(]\s*\d+\s*[\)\)]' + dynamic_pattern = r'^[\(\(]\s*[一二三四五六七八九十\d]+\s*[\)\)]' elif next_section_number.group(3): - dynamic_pattern = r'^\d+\s*、' + dynamic_pattern = r'^[一二三四五六七八九十\d]+\s*、' current_section_pattern = re.compile(dynamic_pattern) if current_section_pattern and re.match(current_section_pattern, next_text): extracted_paragraphs[active_key].append(next_text) @@ -488,17 +490,17 @@ def handle_query(file_path, user_query, output_file, result_key, keywords): processed_paragraphs = preprocess_paragraphs(doc_contents) extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords) all_texts1,all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 + # print(all_texts2) # 1. 得到有序的 all_text1_items all_text1_items = sorted(all_texts1.items(), key=lambda x: x[0]) # 2. 得到纯内容列表 all_texts1_list = [content for (_, content) in all_text1_items] - # print(all_texts) # Proceed only if there is content to write selected_contents = {} final_list=[f"未解析到'{result_key}'!"] seen_contents = set() # 使用集合跟踪已添加的内容以去重 - if all_texts1_list: + if all_texts1_list or all_texts2: with open(output_file, 'w', encoding='utf-8') as file: counter = 1 for content in all_texts1_list: @@ -511,11 +513,14 @@ def handle_query(file_path, user_query, output_file, result_key, keywords): counter += 1 # 生成用户查询 - user_query = generate_full_user_query(output_file, user_query) - model_ans = qianwen_plus(user_query) # 豆包模型返回结果 - # file_id = upload_file(output_file) - # model_ans = qianwen_long(file_id, user_query) - num_list = process_string_list(model_ans) # 处理模型返回的序号 + if not all_texts1_list: + num_list=[] + else: + user_query = generate_full_user_query(output_file, user_query) + model_ans = qianwen_plus(user_query) # 豆包模型返回结果 + # file_id = upload_file(output_file) + # model_ans = qianwen_long(file_id, user_query) + num_list = process_string_list(model_ans) # 处理模型返回的序号 print(result_key + "选中的序号:" + str(num_list)) for index in num_list: @@ -656,7 +661,7 @@ if __name__ == '__main__': output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标" # invalid_added = insert_mark(pdf_path) # # invalid_added_docx = pdf2docx(invalid_added) - invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx' + invalid_added_docx=r'C:\Users\Administrator\Desktop\测试信号测试信号.docx' results = combine_find_invalid(invalid_added_docx, output_dir) end_time = time.time() print("Results:", json.dumps(results, ensure_ascii=False, indent=4)) diff --git a/flask_app/general/读取文件/读取docx.py b/flask_app/general/读取文件/读取docx.py index 9a66725..7b69ca6 100644 --- a/flask_app/general/读取文件/读取docx.py +++ b/flask_app/general/读取文件/读取docx.py @@ -90,7 +90,7 @@ def read_docx_by_paragraphs(file_path): return [] if __name__ == "__main__": - file_path = r'D:\flask_project\flask_app\static\output\output1\2179c978-b638-4332-8bd2-2007ad6f7c9b\ztbfile.docx' + file_path = r'C:\Users\Administrator\Desktop\测试信号测试信号.docx' read_docx(file_path) #按行读取 # paragraphs = read_docx_by_paragraphs(file_path) #按段落读取 diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 7003453..2076c47 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -315,11 +315,11 @@ if __name__ == "__main__": logger = get_global_logger("123") # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" - pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf" + pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\251WA0201146未来大厦LED屏幕采购及安装-自行组织货物类招标文件V1.2.pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" - output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标" + output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" - selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path + selection = 5 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) print(generated_files)