diff --git a/flask_app/general/投标人须知正文提取指定内容.py b/flask_app/general/投标人须知正文提取指定内容.py index f846927..bdd3943 100644 --- a/flask_app/general/投标人须知正文提取指定内容.py +++ b/flask_app/general/投标人须知正文提取指定内容.py @@ -1,3 +1,5 @@ +# -*- encoding:utf-8 -*- +import json import re from functools import cmp_to_key @@ -196,26 +198,60 @@ def get_requirements_with_gpt(invalid_path, selection): file_id = upload_file(invalid_path) # 定义 selection 对应的用户查询 user_queries = { - 1: """ - 该招标文件中对投标文件的要求是什么?你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答,其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明,'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明,请以json格式返回给我结果,外层键名分别为'编写要求','格式','承诺书要求','递交要求',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下: + # 1: """ + # 该招标文件中对投标文件的要求是什么?你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答,其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明,而不一定一致;'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明,而不一定一致;请以json格式返回给我结果,外层键名分别为'编写要求','格式','承诺书要求','递交要求',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下,内容与该文件无关: + # { + # "编写要求":"编写要求xxx", + # "格式要求":{ + # "投标文件格式要求":"投标文件格式要求", + # "标记要求":"投标文件标记要求", + # "装订要求":"投标文件装订要求", + # "文件数量":"投标文件文件数量要求" + # }, + # "承诺书要求":"承诺书要求xxx", + # "递交要求":{ + # "投标地点":"投标地点", + # "投标文件交标方式":"交标方式", + # "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"] + # } + # } + # """, + 1:""" + 该招标文件中对投标文件的要求有哪些?请以json格式给我返回结果,外层键名为文中提到的有关投标文件的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考: { - "编写要求":"投标函的编写要求xxx;法定代表人身份证明要求xx", - "格式要求":{ - "投标文件格式要求":"投标文件格式要求", - "标记要求":"投标文件标记要求", - "装订要求":"投标文件装订要求", - "文件数量":"投标文件文件数量要求" - }, - "承诺书要求":"未知", - "递交要求":{ - "投标地点":"使用加密其投标文件的CA数字证书(企业锁)登录“电子交易系统”,进入“开标大厅”选择所投标段进行签到,并实时在线关注招标人的操作情况。", - "投标文件交标方式":"线上开标", - "投标文件的修改与撤回":"在投标人须知前附表规定的投标有效期内,投标人不得要求撤销或修改其投标文件。出现特殊情况需要延长投标有效期的,招标人以书面形式通知所有投标人延长投标有效期。投标人同意延长的,应相应延长其投标保证金的有效期,但不得要求或被允许修改或撤销其投标文件;投标人拒绝延长的,其投标失效,但投标人有权收回其投标保证金。" + "编写要求":"编写要求xxx", + "格式要求":{ + "投标文件格式要求":"投标文件格式要求", + "标记要求":"投标文件标记要求", + "装订要求":"投标文件装订要求", + "文件数量":"投标文件文件数量要求" + }, + "承诺书要求":"承诺书要求xxx", + "递交要求":{ + "递交投标文件的截止日期及递交地点":["递交投标文件的截止日期","递交投标文件的递交地点"], + "投标文件交标方式":"交标方式", + "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"] + } } - } """, - 2: """ - 该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下: + # 2: """ + # 该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下: + # { + # "开标":"招标文件关于项目开标的要求", + # "开标异议":"招标文件中关于开标异议的项", + # "评标":{ + # "特殊情况的处置":"因“电子交易系统”系统故障导致无法投标的,交易中心及时通知招标人,招标人视情况决定是否顺延投标截止时间。因投标人自身原因导致无法完成投标的,由投标人自行承担后果。", + # "评标办法及流程":"评标流程", + # "评标委员会的组建":"评标由招标人依法组建的评标委员会负责。评标委员会由招标人或其委托的招标代理机构熟悉相关业务的代表,以及有关技术、经济等方面的专家组成。" + # }, + # "定标":{ + # "定标流程":"定标流程", + # "履约能力的审查":"履约能力的审查" + # } + # } + # """, + 2:""" + 该招标文件中开评定标要求(或磋商流程内容)是什么?请以json格式给我返回结果,外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考: { "开标":"招标文件关于项目开标的要求", "开标异议":"招标文件中关于开标异议的项", @@ -231,7 +267,7 @@ def get_requirements_with_gpt(invalid_path, selection): } """, 3: """ - 该招标文件中重新招标(或采购)、不再招标(或采购)、终止招标(或采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下: + 该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下: { "重新招标":"有下列情形之一的,招标人将重新招标:(1)投标截止时间止,投标人少于3个的;(2)经评标委员会评审后否决所有投标的;", "不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的,属于必须审批或核准的工程建设项目,经原审批或核准部门批准后不再进行招标。", @@ -242,6 +278,7 @@ def get_requirements_with_gpt(invalid_path, selection): # 根据 selection 选择相应的 user_query user_query = user_queries.get(selection) + if not user_query: return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"} # 调用大模型并处理响应 @@ -250,4 +287,10 @@ def get_requirements_with_gpt(invalid_path, selection): cleaned_res = clean_json_string(res) return cleaned_res except Exception as e: - return {"error": "调用大模型失败"} \ No newline at end of file + return {"error": "调用大模型失败"} + +if __name__ == "__main__": + merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf" + selection=2 + res=get_requirements_with_gpt(merged_baseinfo_path,selection) + print(json.dumps(res,ensure_ascii=False,indent=4)) \ No newline at end of file diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 9dcd722..1828641 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -95,7 +95,7 @@ def extract_text_by_page(file_path): if __name__ == '__main__': - input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" + file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' @@ -103,5 +103,5 @@ if __name__ == '__main__': # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" # ress = extract_common_header(file_path) # print(ress) - res=extract_text_by_page(input_path) + res=extract_text_by_page(file_path) # print(res)磋商文件_tobidders_notice_part2.pdf \ No newline at end of file diff --git a/flask_app/general/通用功能函数.py b/flask_app/general/通用功能函数.py index 38e72f3..64b6ddb 100644 --- a/flask_app/general/通用功能函数.py +++ b/flask_app/general/通用功能函数.py @@ -9,7 +9,7 @@ from flask_app.general.多线程提问 import multi_threading from flask_app.general.通义千问long import upload_file from flask_app.main.判断是否分包等 import read_questions_from_judge -def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, baseinfo_list1): +def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1): judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) judge_consortium = judge_consortium_bidding(baseinfo_list1) if judge_consortium: @@ -18,7 +18,7 @@ def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, base "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\"" ) judge_questions.append(judge_consortium_question) - file_id3 = upload_file(baseinfo_path) + file_id3 = upload_file(merged_baseinfo_path) res2 = multi_threading(judge_questions, "", file_id3, 2) if not res2: diff --git a/flask_app/main/基础信息整合快速版.py b/flask_app/main/基础信息整合快速版.py index 517c3d1..08c18a2 100644 --- a/flask_app/main/基础信息整合快速版.py +++ b/flask_app/main/基础信息整合快速版.py @@ -143,7 +143,7 @@ def process_baseinfo_list(baseinfo_list, tobidders_notice): return [] -def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path): +def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path): """ 综合和处理基础信息,生成最终的基础信息字典。 @@ -173,7 +173,7 @@ def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_tabl # 提交两个任务 future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1) future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice) - future3 = executor.submit(extract_from_notice, invalid_path, clause_path, 3) # 新增的多线程任务 + future3 = executor.submit(extract_from_notice, merged_baseinfo_path_more, clause_path, 3) # 新增的多线程任务 # 等待两个任务完成并获取结果 future1.result() # process_judge_questions 直接修改 baseinfo_list1,不需要返回值 diff --git a/flask_app/main/工程标解析main.py b/flask_app/main/工程标解析main.py index 15f649e..e4dc8df 100644 --- a/flask_app/main/工程标解析main.py +++ b/flask_app/main/工程标解析main.py @@ -1,11 +1,13 @@ # -*- encoding:utf-8 -*- import json import logging +import os import time from concurrent.futures import ThreadPoolExecutor from flask_app.general.投标人须知正文提取指定内容 import get_requirements_with_gpt from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.general.merge_pdfs import merge_pdfs from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values @@ -68,6 +70,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): merged_baseinfo_path=truncate_files[-1] + more_path=[merged_baseinfo_path,tobidders_notice] + merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf") + merge_pdfs(more_path,merged_baseinfo_path_more) clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json end_time=time.time() logger.info(f"文件预处理 done,耗时:{end_time - start_time:.2f} 秒") @@ -83,21 +88,24 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): 'qualification': qualification, 'truncate0_jsonpath': truncate_jsonpath, 'merged_baseinfo_path':merged_baseinfo_path, + 'merged_baseinfo_path_more':merged_baseinfo_path_more, 'clause_path': clause_path, 'invalid_docpath': invalid_docpath } # 基本信息 -def fetch_project_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path): +def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo_path_more,tobidders_notice_table, tobidders_notice, clause_path): logger.info("starting 基础信息...") start_time = time.time() if not merged_baseinfo_path: merged_baseinfo_path = invalid_path + if not merged_baseinfo_path_more: + merged_baseinfo_path_more=invalid_path if not tobidders_notice_table: tobidders_notice_table = invalid_path if not tobidders_notice: tobidders_notice = invalid_path - basic_res = combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path) + basic_res = combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path) end_time = time.time() logger.info(f"基础信息 done,耗时:{end_time - start_time:.2f} 秒") return basic_res @@ -145,21 +153,25 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat # 投标文件要求 -def fetch_bidding_documents_requirements(invalid_path, clause_path): +def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path_more,clause_path): logger.info("starting 投标文件要求...") start_time = time.time() + if not merged_baseinfo_path_more: + merged_baseinfo_path_more=invalid_path selection = 1 - fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path, clause_path, selection) + fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection) end_time = time.time() logger.info(f"投标文件要求 done,耗时:{end_time - start_time:.2f} 秒") return {"投标文件要求": fetch_bidding_documents_requirements_json} # 开评定标流程 -def fetch_bid_opening(invalid_path, clause_path): +def fetch_bid_opening(invalid_path, merged_baseinfo_path_more,clause_path): logger.info("starting 开评定标流程...") start_time = time.time() + if not merged_baseinfo_path_more: + merged_baseinfo_path_more=invalid_path selection = 2 - fetch_bid_opening_json = extract_from_notice(invalid_path, clause_path, selection) + fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection) end_time = time.time() logger.info(f"开评定标流程 done,耗时:{end_time - start_time:.2f} 秒") return {"开评定标流程": fetch_bid_opening_json} @@ -177,7 +189,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_ with concurrent.futures.ThreadPoolExecutor() as executor: # 立即启动不依赖 knowledge_name 和 index 的任务 futures = { - 'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'], + 'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],processed_data['merged_baseinfo_path_more'], processed_data['tobidders_notice_table'], processed_data['tobidders_notice'], processed_data['clause_path']), 'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'], @@ -189,8 +201,8 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_ 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], output_folder, processed_data['truncate0_jsonpath'], processed_data['clause_path'], processed_data['qualification']), - 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['clause_path']), - 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'], processed_data['clause_path']) + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path']) } # 提前处理这些不依赖的任务,按完成顺序返回 diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py index 973625c..293618b 100644 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -30,32 +30,32 @@ def extract_text_from_pdf(file_path, start_word, end_pattern): pdf_document = PdfReader(file_path) all_pages_text = [] start_index = None + # 处理所有页面 for i, page in enumerate(pdf_document.pages): page_text = page.extract_text() if page.extract_text() else "" cleaned_text = clean_page_content(page_text, common_header) - # print(cleaned_text) + # 在第一页查找开始位置 if i == 0 and start_index is None: - start_match = re.search(start_word, cleaned_text, re.MULTILINE) - if start_match: - start_index = start_match.start() - cleaned_text = cleaned_text[start_index:] + for pattern in (start_word if isinstance(start_word, list) else [start_word]): + start_match = re.search(pattern, cleaned_text) + if start_match: + start_index = start_match.start() + cleaned_text = cleaned_text[start_index:] + break # 找到一个匹配后跳出循环 # 在最后一页查找结束位置 if i == len(pdf_document.pages) - 1: - for pattern in end_pattern: - matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE)) - if matches: - end_index = matches[-1].start() - cleaned_text = cleaned_text[:end_index] - break + matches = list(re.finditer(end_pattern, cleaned_text)) + if matches: + end_index = matches[-1].start() + cleaned_text = cleaned_text[:end_index] all_pages_text.append(cleaned_text) # 合并所有页面的文本 full_text = "\n".join(all_pages_text) - # print(full_text) return full_text def extract_section(text, start_pattern, end_phrases): # 查找开始模式 @@ -113,8 +113,16 @@ def parse_text_by_heading(text): current_content = [] append_newline = False lines = text.split('\n') + + def check_year_pattern(line): + line_no_spaces = line.replace(' ', '') + return re.match(r'^\d{4}', line_no_spaces) and line_no_spaces[4:5] == '年' for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n' line_stripped = line.strip().replace('.', '.') + # 检查是否以年份开头,如果是,属于上一个标题内容 + if check_year_pattern(line_stripped) and current_key is not None: + current_content.append(line_stripped) + continue # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 match = re.match(r'^(? current_key_num and new_key_num - current_key_num <= 5)): + # 处理临时标题与新主标题的关联 + if temp_title: + main_number = new_key_candidate.split('.')[0] + data[f"{main_number}."] = temp_title + temp_title = None # 重置临时标题 + # 开始新的标题 if current_key is not None: content_string = ''.join(current_content).strip() @@ -216,19 +247,28 @@ def parse_text_by_heading(text): current_content = [line_content] append_newline = True last_main_number = new_key_candidate.rstrip('.') + # if current_key is None or (current_key != new_key and ( #不给序号排序 + # len(current_content) == 0 or current_content[-1][-1] != '第')): + # if current_key is not None: + # content_string = ''.join(current_content).strip() + # data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') + # current_key = new_key + # current_content = [line_content] + # append_newline = len(new_key.rstrip('.').split('.')) <= 2 + # last_main_number = new_key.split('.')[0] else: # 将当前行视为当前标题的内容 append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) else: - if not skip_subheadings: - # 合并了中文标题、字母标题、阿拉伯数字标题的匹配逻辑 - pattern_title = re.compile(r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)') + if not skip_subheadings: # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑 + pattern_title = re.compile( + r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)') chinese_match = pattern_title.match(line_stripped) letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped) arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped) - # 优化了处理逻辑,减少冗余 + # 优化处理逻辑,减少冗余 if chinese_match or letter_match or arabic_match: # 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题) if current_key is not None: @@ -277,6 +317,9 @@ def parse_text_by_heading(text): data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') if current_key_chinese is not None: data[current_key_chinese] = current_value_chinese + if temp_title: + # 处理任何未关联的临时标题 + data["未分类标题"] = temp_title # 您可以根据需要选择合适的键名 return data @@ -364,13 +407,13 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json print(f"The specified file does not exist: {file_path}") return "" if type == 1: - start_word = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)' - end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' + start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)' + end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$' text = extract_text_from_pdf(file_path, start_word, end_pattern) result = parse_text_by_heading(text) else: - start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*' - end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)' + start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$' + end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' text = extract_text_from_pdf(file_path, start_word, end_pattern) result=parse_text_to_dict(text) # result = convert_to_json(input_path, start_word, end_pattern) @@ -407,9 +450,9 @@ def process_folder(input_folder, output_folder): #TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理 if __name__ == "__main__": # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' - file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf' + file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' - output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp' + output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp' try: output_path = convert_clause_to_json(file_path,output_folder,1) print(f"Final JSON result saved to: {output_path}") diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index fded120..5f31508 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -129,22 +129,26 @@ def fetch_invalid_requirements(invalid_docpath, output_folder): logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒") return find_invalid_res -def fetch_bidding_documents_requirements(invalid_path,clause_path): +def fetch_bidding_documents_requirements(invalid_path,merged_baseinfo_path,clause_path): logger.info("starting 投标文件要求...") + if not merged_baseinfo_path: + merged_baseinfo_path=invalid_path start_time = time.time() selection=1 - fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path,clause_path, selection) + fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path,clause_path, selection) end_time = time.time() logger.info(f"投标文件要求 done,耗时:{end_time - start_time:.2f} 秒") return {"投标文件要求": fetch_bidding_documents_requirements_json} # 开评定标流程 -def fetch_bid_opening(invalid_path,clause_path): +def fetch_bid_opening(invalid_path,merged_baseinfo_path,clause_path): logger.info("starting 开评定标流程...") + if not merged_baseinfo_path: + merged_baseinfo_path=invalid_path start_time = time.time() selection=2 - fetch_bid_opening_json = extract_from_notice(invalid_path,clause_path, selection) + fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path,clause_path, selection) end_time = time.time() logger.info(f"开评定标流程 done,耗时:{end_time - start_time:.2f} 秒") return {"开评定标流程": fetch_bid_opening_json} @@ -201,9 +205,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): processed_data['evaluation_method_path']), 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], output_folder), - 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],processed_data['merged_baseinfo_path'], processed_data['clause_path']), - 'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['clause_path']), + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path']), 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'], processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']), 'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder,