diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 92d5455..68e5089 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -1,5 +1,7 @@ import json import os +import time + import requests from flask_app.main.download import download_file diff --git a/flask_app/general/投标人须知正文提取指定内容.py b/flask_app/general/投标人须知正文提取指定内容.py index 4c7d372..f846927 100644 --- a/flask_app/general/投标人须知正文提取指定内容.py +++ b/flask_app/general/投标人须知正文提取指定内容.py @@ -215,7 +215,7 @@ def get_requirements_with_gpt(invalid_path, selection): } """, 2: """ - 该招标文件中开评定标要求是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下: + 该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下: { "开标":"招标文件关于项目开标的要求", "开标异议":"招标文件中关于开标异议的项", @@ -231,7 +231,7 @@ def get_requirements_with_gpt(invalid_path, selection): } """, 3: """ - 该招标文件中重新招标、不再招标、终止招标的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下: + 该招标文件中重新招标(或采购)、不再招标(或采购)、终止招标(或采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下: { "重新招标":"有下列情形之一的,招标人将重新招标:(1)投标截止时间止,投标人少于3个的;(2)经评标委员会评审后否决所有投标的;", "不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的,属于必须审批或核准的工程建设项目,经原审批或核准部门批准后不再进行招标。", diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index c411bec..6559287 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -149,12 +149,12 @@ def extract_text_by_page(file_path): if __name__ == '__main__': - # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf' + file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest18.pdf" # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" - file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" + # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" # ress = extract_common_header(file_path) # print(ress) res=extract_text_by_page(file_path) diff --git a/flask_app/main/工程标解析main.py b/flask_app/main/工程标解析main.py index af4fcfc..15f649e 100644 --- a/flask_app/main/工程标解析main.py +++ b/flask_app/main/工程标解析main.py @@ -47,13 +47,13 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): else: logger.error("Unsupported file type provided. Preprocessing halted.") return None - # 调用截取PDF多次 truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id) # 处理各个部分 tobidders_notice_table=truncate_files[0] truncate0_docpath = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx + truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json tobidders_notice = truncate_files[1] #投标人须知正文 @@ -86,26 +86,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): 'clause_path': clause_path, 'invalid_docpath': invalid_docpath } - - -def post_processing(data,includes): - # 初始化结果字典,预设'其他'分类为空字典 - result = {"其他": {}} - - # 遍历原始字典的每一个键值对 - for key, value in data.items(): - if key in includes: - # 如果键在includes列表中,直接保留这个键值对 - result[key] = value - else: - # 如果键不在includes列表中,将这个键值对加入到'其他'分类中 - result["其他"][key] = value - - # 如果'其他'分类没有任何内容,可以选择删除这个键 - if not result["其他"]: - del result["其他"] - - return result # 基本信息 def fetch_project_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path): @@ -242,8 +222,9 @@ if __name__ == "__main__": file_type = 2 #1:docx 2:pdf 3:其他 input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf" print("yes") - for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"): - print(output) + # for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"): + # print(output) + preprocess_files(output_folder,input_file,2,"121") end_time = time.time() elapsed_time = end_time - start_time # 计算耗时 print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index cb273c7..adb41e1 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -1,5 +1,7 @@ import re import os +import time + from PyPDF2 import PdfReader, PdfWriter from flask_app.general.merge_pdfs import merge_pdfs import concurrent.futures @@ -86,7 +88,6 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") before_doc = PdfWriter() toc_page = -1 - # 查找目录页 for page_num in range(min(start_page, total_pages)): page_text = pdf_document.pages[page_num].extract_text() @@ -97,12 +98,13 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en # 确定截取的页数 pages_to_extract = toc_page + 1 if toc_page != -1 else start_page - # 提取页面 for page_num in range(pages_to_extract): before_doc.add_page(pdf_document.pages[page_num]) + print(before_pdf_path) with open(before_pdf_path, 'wb') as f_before: before_doc.write(f_before) + # print(f"已保存页面从 0 到 {pages_to_extract} 为 {before_pdf_path}") # 提取指定范围的页面 @@ -113,7 +115,6 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en # 保存新的PDF文件 with open(output_pdf_path, 'wb') as f_output: output_doc.write(f_output) - print(f"{output_suffix} 已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}") return output_pdf_path @@ -246,17 +247,11 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) - if not is_secondary_match: - if re.search(begin_pattern, cleaned_text) and i >= begin_page: - if output_suffix == "invalid" and start_page: - continue - else: - start_page = i - else: - if re.search(begin_pattern, cleaned_text) and i >=begin_page: - if output_suffix == "notice" and start_page: #考虑投标人须知前附表中有' 同招标公告 '的情况 - pass - else: + if re.search(begin_pattern, cleaned_text) and i >= begin_page: + if not start_page: + if (output_suffix == "notice" or output_suffix == "invalid") and is_secondary_match: + pass # 针对 '同招标公告' 的情况 + elif output_suffix not in ["notice", "invalid"] or not is_secondary_match: start_page = i if start_page is not None and re.search(end_pattern, cleaned_text): condition = i > start_page @@ -300,7 +295,10 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter def get_start_and_common_header(input_path): common_header = extract_common_header(input_path) last_begin_index = 0 - begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$',re.MULTILINE) + begin_pattern = re.compile( + r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$', + re.MULTILINE + ) pdf_document = PdfReader(input_path) for i, page in enumerate(pdf_document.pages): if i > 25: @@ -366,11 +364,11 @@ def truncate_pdf_main(input_path, output_folder, selection): pattern_pairs = [ ( re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*|^第一卷|^投标邀请书'), + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'), re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) ), ( - re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE), + re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$',re.MULTILINE), re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE) ) ] @@ -381,11 +379,11 @@ def truncate_pdf_main(input_path, output_folder, selection): pattern_pairs = [ ( re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*|^第一卷|^投标邀请书'), + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'), re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE) ), ( - re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE), + re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷|投标邀请)\s*$', re.MULTILINE), re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE) ) ] @@ -496,6 +494,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix,common_header,las print(f"Error in extract_pages_twice: {e}") return [] + def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name): """ 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, @@ -550,28 +549,39 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na else: print(f"没有找到以 '{suffix}' 结尾的文件。") - print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}") - - if not all_pdfs_to_merge: - print("没有找到要合并的 PDF 文件。") - return "" - # 过滤掉不存在或为空的文件路径 all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)] if not all_pdfs_to_merge: - print("没有有效的 PDF 文件需要合并。") + print("没有找到要合并的 PDF 文件。") return "" # 调用 merge_pdfs 函数进行合并 try: merge_pdfs(all_pdfs_to_merge, output_path) print(f"已成功合并 PDF 文件到 '{output_path}'。") - return output_path except Exception as e: print(f"合并 PDF 文件时出错: {e}") return "" + # 检查合并后的文件是否存在且不为空 + if os.path.exists(output_path) and os.path.getsize(output_path) > 0: + # 合并成功,删除 {base_file_name}_before.pdf 文件 + before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") + if os.path.exists(before_pdf_path): + try: + os.remove(before_pdf_path) + print(f"已删除文件: {before_pdf_path}") + except Exception as e: + print(f"删除文件 {before_pdf_path} 时出错: {e}") + else: + print(f"未找到要删除的文件: {before_pdf_path}") + + return output_path + else: + print(f"合并失败,没有生成 '{output_path}'。") + return "" + def truncate_pdf_multiple(input_path, output_folder, unique_id="123"): global logger logger = get_global_logger(unique_id) @@ -675,10 +685,12 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu #TODO:目前merged_baseinfo没有包含投标人须知正文。 #投标人须知前附表改为货物标一样的 if __name__ == "__main__": - input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest7.pdf" # 可以是单个PDF文件路径或文件夹路径 #zbtest20.pdf zbtest4_evaluation_method.pdf zbtest8.pdf + start_time=time.time() + input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\HBDL-2024-0181-001-招标文件.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" - output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\新建文件夹" + # input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹" + output_folder="C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\tmp" # files=truncate_pdf_multiple(input_path,output_folder) # selections = [5, 1] # 仅处理 selection 5、1 # files=truncate_pdf_specific_engineering(input_path,output_folder,selections) @@ -686,4 +698,6 @@ if __name__ == "__main__": selection = 5 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 generated_files = truncate_pdf_main(input_path, output_folder, selection) - # print("生成的文件:", generated_files) \ No newline at end of file + print("生成的文件:", generated_files) + end_time=time.time() + print("耗时:"+str(end_time-start_time)) \ No newline at end of file diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index 01b2430..c092179 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -9,6 +9,8 @@ from flask_app.货物标.截取pdf货物标版 import extract_common_header, cle #正则表达式判断原文中是否有商务、服务、其他要求 def find_exists(truncate_file, required_keys): + if not truncate_file: + return ["商务要求", "服务要求", "其他要求"] common_header = extract_common_header(truncate_file) pdf_document = PdfReader(truncate_file) text = "" diff --git a/flask_app/货物标/基础信息解析main.py b/flask_app/货物标/基础信息解析main.py index 7b16142..5162aea 100644 --- a/flask_app/货物标/基础信息解析main.py +++ b/flask_app/货物标/基础信息解析main.py @@ -3,7 +3,7 @@ import json import threading import time import concurrent.futures -from flask_app.general.json_utils import clean_json_string +from flask_app.general.json_utils import clean_json_string, rename_outer_key from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions from flask_app.general.多线程提问 import read_questions_from_file, multi_threading from flask_app.general.通义千问long import upload_file @@ -13,6 +13,8 @@ from flask_app.货物标.提取采购需求main import fetch_procurement_reqs def aggregate_basic_info_goods(baseinfo_list): + for i in baseinfo_list: + print(json.dumps(i,ensure_ascii=False,indent=4)) """ 将基础信息列表中的数据进行合并和分类。 @@ -124,6 +126,7 @@ def get_base_info(baseinfo_file_path,clause_path): chosen_numbers, merged = merge_json_to_list(baseinfo_list.pop()) baseinfo_list.append(merged) judge_file_path = 'flask_app/static/提示词/是否相关问题货物标.txt' + # judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt' with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: # 提交两个任务 future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, baseinfo_file_path, @@ -132,8 +135,10 @@ def get_base_info(baseinfo_file_path,clause_path): # 等待两个任务完成并获取结果 future1.result() # process_judge_questions 直接修改 baseinfo_list,不需要返回值 - rebidding_situation = future2.result() # 获取提取失败的情况 - baseinfo_list.append(rebidding_situation) + rebidding_situation = future2.result() + update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标") + baseinfo_list.append(update_json) + # # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt' # judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) # # print(judge_questions) @@ -153,7 +158,7 @@ def get_base_info(baseinfo_file_path,clause_path): # baseinfo_list.append(clean_json_string(response)) return baseinfo_list -def combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path): +def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,clause_path): baseinfo_list = [] temp_list = [] procurement_reqs = {} @@ -164,7 +169,7 @@ def combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path): # 定义一个线程函数来获取采购需求 def fetch_procurement_reqs_thread(): nonlocal procurement_reqs - procurement_reqs = fetch_procurement_reqs(procurement_file_path) + procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath) # 创建并启动获取基础信息的线程 thread1 = threading.Thread(target=get_base_info_thread) thread1.start() @@ -186,10 +191,12 @@ def combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path): if __name__ == "__main__": start_time=time.time() # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf" - baseinfo_file_path="D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_merged_baseinfo.pdf" + baseinfo_file_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf" # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf" - procurement_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_procurement.pdf" - res = combine_basic_info(baseinfo_file_path, procurement_file_path) + procurement_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx" + clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json' + res = combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path) + print("------------------------------------") print(json.dumps(res, ensure_ascii=False, indent=4)) end_time=time.time() print("elasped time:"+str(end_time-start_time)) diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 66fa264..3bba4c8 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -124,38 +124,15 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte if output_suffix == "tobidders_notice": # 确保返回的是元组,并将其中的 None 转换为 "" path1, path2 = result - return (path1 or "", path2 or "") + return [path1 or "", path2 or ""] elif output_suffix == "qualification1": merge_and_cleanup(result, "qualification3") - return result or "" - return result or "" - return "" # 返回空字符串 + return [result or ""] + return [result or ""] + return [""] # 返回空字符串 -def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): - if not os.path.exists(output_folder): - os.makedirs(output_folder) - generated_files = [] - if os.path.isdir(input_path): - for file_name in os.listdir(input_path): - file_path = os.path.join(input_path, file_name) - if is_pdf_or_doc(file_path): - result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - if isinstance(result, tuple): - generated_files.extend([f if f else "" for f in result]) # 保留空字符串 - else: - generated_files.append(result) # 直接添加result,可能是空字符串 - elif os.path.isfile(input_path) and is_pdf_or_doc(input_path): - result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - if isinstance(result, tuple): - generated_files.extend([f if f else "" for f in result]) # 保留空字符串 - else: - generated_files.append(result) # 直接添加result,可能是空字符串 - else: - print("提供的路径既不是文件夹也不是PDF文件。") - - return generated_files # 默认逻辑是start_page匹配上就不再设置了,一般不匹配上目录的原因是设置了begin_page=5,但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。 def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None, @@ -171,11 +148,8 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, else: if exclusion_pattern and re.search(exclusion_pattern, cleaned_text): continue - if output_suffix == "notice": - if re.search(begin_pattern, cleaned_text) and i > begin_page: - start_page = i - else: - if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: + if start_page is None and re.search(begin_pattern, cleaned_text): + if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page): start_page = i if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page: end_page = i @@ -273,8 +247,15 @@ def get_patterns_for_qualification2(): re.MULTILINE ) return begin_pattern, end_pattern - def get_patterns_for_notice(): + begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*') + end_pattern = re.compile( + # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', + re.MULTILINE + ) + return begin_pattern, end_pattern +def get_patterns_for_notice_twice(): begin_pattern = re.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE ) @@ -476,7 +457,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()] begin_page = 5 elif output_suffix == "notice": - patterns = [get_patterns_for_notice()] + patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()] begin_page = 0 if patterns: @@ -567,7 +548,6 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na - truncate_files (list): 包含 PDF 文件路径的列表。 - output_path (str): 合并后的 PDF 文件保存路径。 - base_file_name (str): 用于匹配文件名的基础名称。 - - logger (logging.Logger): 日志记录器对象。 返回: - str: 如果合并成功,返回 output_path;否则,返回空字符串 ""。 @@ -625,6 +605,17 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na # 检查合并后的文件是否存在且不为空 if os.path.exists(output_path) and os.path.getsize(output_path) > 0: + # 合并成功,删除 {base_file_name}_before.pdf 文件 + before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") + if os.path.exists(before_pdf_path): + try: + os.remove(before_pdf_path) + print(f"已删除文件: {before_pdf_path}") + except Exception as e: + print(f"删除文件 {before_pdf_path} 时出错: {e}") + else: + print(f"未找到要删除的文件: {before_pdf_path}") + return output_path else: print(f"合并失败,没有生成 '{output_path}'。") @@ -633,82 +624,92 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na def get_start_and_common_header(input_path): common_header = extract_common_header(input_path) last_begin_index = 0 - begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$',re.MULTILINE) + begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$',re.MULTILINE) pdf_document = PdfReader(input_path) for i, page in enumerate(pdf_document.pages): - if i > 25: + if i > 10: return common_header, 0 # 如果页码大于25,直接返回last_begin_index=0 text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) - if begin_pattern.search(cleaned_text): + if begin_pattern.search(cleaned_text) and not re.search(r'目\s*录', cleaned_text): last_begin_index = i # 更新第一个匹配的索引,页码从0开始 return common_header,last_begin_index return common_header,last_begin_index + def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"): try: - last_begin_index = get_start_and_common_header(input_path) + # 检查是否为文件夹 + if os.path.isdir(input_path): + generated_files = [] + for file_name in os.listdir(input_path): + file_path = os.path.join(input_path, file_name) + if is_pdf_or_doc(file_path): + result = process_input(file_path, output_folder, selection, output_suffix) + if isinstance(result, tuple): + generated_files.extend([f if f else "" for f in result]) + else: + generated_files.append(result) + return generated_files + + # 单文件情况 + elif os.path.isfile(input_path) and is_pdf_or_doc(input_path): + return process_input(input_path, output_folder, selection, output_suffix) + + else: + print("提供的路径既不是文件夹也不是PDF文件。") + return [''] + + except Exception as e: + print(f"Error in truncate_pdf_main: {e}") + return [''] # 返回空字符串 + + +def process_input(input_path, output_folder, selection, output_suffix): + try: + # 创建输出文件夹 + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + # 获取起始和通用页眉 + common_header, last_begin_index = get_start_and_common_header(input_path) begin_page = last_begin_index if last_begin_index != 0 else { - 4: 1, # 前附表 - 2: 5, # 评标 - 3: 5, # 资格 - 1: 0, # 公告 - 5: 3 # 采购需求 + 4: 1, + 2: 5, + 3: 5, + 1: 0, + 5: 3 }.get(selection, 0) - if selection == 1: #招标公告 - begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*' - ) - end_pattern = re.compile( - # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)' - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE - ) + + # 根据选择设置对应的模式和结束模式 + if selection == 1: + begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$', re.MULTILINE) + end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) local_output_suffix = "notice" elif selection == 2: begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(磋商|谈判|评标|评定|评审)(方法|办法).*' - ) - end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' - ) + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(磋商|谈判|评标|评定|评审)(方法|办法).*') + end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') local_output_suffix = "evaluation_method" elif selection == 3: - begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE - ) - end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE - ) + begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE) + end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) local_output_suffix = "qualification1" - elif selection == 4: # 投标人须知前附表和正文 + elif selection == 4: begin_pattern = re.compile( r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)', - re.MULTILINE - ) - end_pattern=None - # end_pattern = re.compile( - # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE - # ) + re.MULTILINE) + end_pattern = None local_output_suffix = "tobidders_notice" - elif selection == 5: #采购需求 - # 更新的正则表达式以匹配"第x章"和"第x部分",考虑到可能的空格和其他文字 + elif selection == 5: begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|' - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*|' - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求.*' - ) - end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' - ) + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*|^第[一二三四五六七八九十百千]+(?:章|部分).*?需求.*') + end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') local_output_suffix = "procurement" - elif selection==6: #投标文件格式 - begin_pattern=re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*' - ) - end_pattern=re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE - ) + elif selection == 6: + begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*') + end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) local_output_suffix = "format" else: print("无效的选择:请选择1-5") @@ -718,11 +719,12 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau if output_suffix == "default": output_suffix = local_output_suffix - # 调用相应的处理函数 - return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or "" + # 调用实际处理文件内容的函数 + return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) + except Exception as e: - print(f"Error in truncate_pdf_main: {e}") - return [''] # 返回空字符串 + print(f"Error in process_input: {e}") + return [''] def truncate_pdf_multiple(pdf_path, output_folder,unique_id="123"): @@ -819,17 +821,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1 # TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 if __name__ == "__main__": - # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" + input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf" - input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" + # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf" # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp" output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹" - # files = truncate_pdf_multiple(input_path, output_folder) + files = truncate_pdf_multiple(input_path, output_folder) # selections = [1,4] # files=truncate_pdf_specific_goods(input_path,output_folder,selections) - # print(files) - selection = 4# 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-公告 - generated_files = truncate_pdf_main(input_path, output_folder, selection) + print(files) + # selection = 1# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 + # generated_files = truncate_pdf_main(input_path, output_folder, selection) # print(generated_files) \ No newline at end of file diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index 6919c79..d9e906d 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -164,7 +164,7 @@ def test_all_files_in_folder(input_folder, output_folder): print(f"处理文件 {file_path} 时出错: {e}") if __name__ == "__main__": - truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_procurement.pdf" + truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx" output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp" file_id = upload_file(truncate_file) res=get_technical_requirements(file_id) diff --git a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py index b133b3d..6cd2ffb 100644 --- a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py +++ b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py @@ -114,10 +114,11 @@ def extract_from_notice(invalid_path,clause_path, type): return DEFAULT_RESULT if __name__ == "__main__": - file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp\\clause1.json' + file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json' + invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile.pdf" # file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json' try: - res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景 + res = extract_from_notice(invalid_path,file_path, 1) # 可以改变此处的 type 参数测试不同的场景 res2=json.dumps(res,ensure_ascii=False,indent=4) print(res2) except ValueError as e: diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index 31040e7..29215cd 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -1,15 +1,13 @@ import concurrent.futures import json import time - -from flask_app.general.format_change import pdf2docx from flask_app.货物标.技术参数要求提取 import get_technical_requirements from flask_app.general.通义千问long import upload_file from flask_app.货物标.商务服务其他要求提取 import get_business_requirements #获取采购清单 -def fetch_procurement_reqs(truncate_file): +def fetch_procurement_reqs(procurement_path,procurement_docpath): # 定义默认的 procurement_reqs 字典 DEFAULT_PROCUREMENT_REQS = { "技术要求": "", @@ -18,19 +16,18 @@ def fetch_procurement_reqs(truncate_file): "其他要求": "" } # 如果 truncate_file 是空字符串,直接返回包含空字符串的字典 - if not truncate_file: + if not procurement_docpath: return DEFAULT_PROCUREMENT_REQS.copy() try: # 上传文件并获取 file_id - truncate_file_docx=pdf2docx(truncate_file) - file_id = upload_file(truncate_file_docx) + file_id = upload_file(procurement_docpath) # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: # 提交任务给线程池 future_technical = executor.submit(get_technical_requirements, file_id) time.sleep(1) # 如果需要延迟,可以保留,否则建议移除以提高效率 - future_business = executor.submit(get_business_requirements, truncate_file, file_id) + future_business = executor.submit(get_business_requirements, procurement_path, file_id) # 获取并行任务的结果 technical_requirements = future_technical.result() @@ -42,7 +39,6 @@ def fetch_procurement_reqs(truncate_file): "商务要求": business_requirements.get("商务要求", {}), "服务要求": business_requirements.get("服务要求", {}), "其他要求": business_requirements.get("其他要求", {}), - "货物列表":business_requirements.get("货物列表",{}) } return procurement_reqs @@ -55,6 +51,6 @@ def fetch_procurement_reqs(truncate_file): if __name__ == "__main__": output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output" # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_procurement.pdf" - file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\磋商文件_procurement.pdf" + file_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx" res=fetch_procurement_reqs(file_path) print(json.dumps(res, ensure_ascii=False, indent=4)) diff --git a/flask_app/货物标/评分标准提取main.py b/flask_app/货物标/评分标准提取main.py index 4beba32..16ce5b5 100644 --- a/flask_app/货物标/评分标准提取main.py +++ b/flask_app/货物标/评分标准提取main.py @@ -255,7 +255,7 @@ def combine_evaluation_standards(truncate_file): ) # 执行第二个查询 evaluation_res = qianwen_long(file_id, user_query) - print(evaluation_res) + # print(evaluation_res) # 清理和处理响应 cleaned_evaluation_res = parse_json_with_duplicates(evaluation_res) #处理重复键名的情况 result_data = process_data_based_on_key(cleaned_evaluation_res) #处理不知名外键的情况 diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index 4f914a1..fded120 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -1,6 +1,6 @@ # 竞磋 竞谈 磋商 询价 邀请 单一来源 import json - +import time from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx from flask_app.general.json_utils import transform_json_values from flask_app.货物标.基础信息解析main import combine_basic_info @@ -56,7 +56,8 @@ def preprocess_files(output_folder, file_path, file_type): # 处理各个部分 invalid_path=pdf_path invalid_docpath = docx_path # docx截取无效标部分 - procurement_path = truncate_files[5] # 商务技术服务要求 + procurement_path = truncate_files[5] # 采购需求 + procurement_docpath=pdf2docx(procurement_path) # 采购需求docx evaluation_method_path = truncate_files[1] # 评标办法 qualification_path = truncate_files[2] # 资格审查 tobidders_notice_path = truncate_files[4] # 投标人须知正文 @@ -72,6 +73,7 @@ def preprocess_files(output_folder, file_path, file_type): 'invalid_path': invalid_path, 'output_folder': output_folder, 'procurement_path': procurement_path, + 'procurement_docpath':procurement_docpath, 'evaluation_method_path': evaluation_method_path, 'qualification_path': qualification_path, 'notice_path': notice_path, @@ -81,12 +83,14 @@ def preprocess_files(output_folder, file_path, file_type): 'merged_baseinfo_path': merged_baseinfo_path } -def fetch_project_basic_info(invalid_path, merged_baseinfo_path, procurement_file_path, clause_path): +def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path, procurement_path,procurement_docpath, clause_path): logger.info("starting 基础信息...") start_time = time.time() if not merged_baseinfo_path: merged_baseinfo_path = invalid_path - basic_res = combine_basic_info(merged_baseinfo_path, procurement_file_path, clause_path) + if not procurement_docpath: + procurement_docpath=invalid_docpath + basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path) base_info, good_list = post_process_baseinfo(basic_res) end_time = time.time() logger.info(f"基础信息 done,耗时:{end_time - start_time:.2f} 秒") @@ -125,20 +129,22 @@ def fetch_invalid_requirements(invalid_docpath, output_folder): logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒") return find_invalid_res -def fetch_bidding_documents_requirements(clause_path): +def fetch_bidding_documents_requirements(invalid_path,clause_path): logger.info("starting 投标文件要求...") start_time = time.time() - fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + selection=1 + fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path,clause_path, selection) end_time = time.time() logger.info(f"投标文件要求 done,耗时:{end_time - start_time:.2f} 秒") return {"投标文件要求": fetch_bidding_documents_requirements_json} # 开评定标流程 -def fetch_bid_opening(clause_path): +def fetch_bid_opening(invalid_path,clause_path): logger.info("starting 开评定标流程...") start_time = time.time() - fetch_bid_opening_json = extract_from_notice(clause_path, 2) + selection=2 + fetch_bid_opening_json = extract_from_notice(invalid_path,clause_path, selection) end_time = time.time() logger.info(f"开评定标流程 done,耗时:{end_time - start_time:.2f} 秒") return {"开评定标流程": fetch_bid_opening_json} @@ -195,11 +201,11 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): processed_data['evaluation_method_path']), 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], output_folder), - 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['clause_path']), - 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']), - 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['merged_baseinfo_path'], - processed_data['procurement_path'],processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['clause_path']), + 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'], + processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']), 'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder, processed_data['qualification_path'], processed_data['notice_path']), @@ -234,8 +240,6 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): #TODO:区分output目录 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf提取有问题 #good_list 金额 截取上下文 if __name__ == "__main__": - import time - # 配置日志器 unique_id = "uuidzyzy11" logger = get_global_logger(unique_id)