This commit is contained in:
zy123 2024-11-04 17:13:06 +08:00
parent 21351ddbe3
commit 9f2fceb8c2
10 changed files with 242 additions and 105 deletions

View File

@ -1,3 +1,5 @@
# -*- encoding:utf-8 -*-
import json
import re import re
from functools import cmp_to_key from functools import cmp_to_key
@ -196,26 +198,60 @@ def get_requirements_with_gpt(invalid_path, selection):
file_id = upload_file(invalid_path) file_id = upload_file(invalid_path)
# 定义 selection 对应的用户查询 # 定义 selection 对应的用户查询
user_queries = { user_queries = {
1: """ # 1: """
该招标文件中对投标文件的要求是什么你需要从'编写要求''格式要求''承诺书要求''递交要求'四个角度来回答其中'格式'可以从投标文件格式要求标记要求装订要求文件数量要求角度说明'递交要求'可以从投标地点投标文件交标方式投标文件的修改与撤回角度说明请以json格式返回给我结果外层键名分别为'编写要求''格式''承诺书要求''递交要求'你可以用嵌套键值对组织回答嵌套键名为你对相关子要求的总结而嵌套键名应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'输出格式示例如下 # 该招标文件中对投标文件的要求是什么?你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答,其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明,而不一定一致;'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明而不一定一致请以json格式返回给我结果外层键名分别为'编写要求''格式''承诺书要求''递交要求',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下,内容与该文件无关:
# {
# "编写要求":"编写要求xxx",
# "格式要求":{
# "投标文件格式要求":"投标文件格式要求",
# "标记要求":"投标文件标记要求",
# "装订要求":"投标文件装订要求",
# "文件数量":"投标文件文件数量要求"
# },
# "承诺书要求":"承诺书要求xxx",
# "递交要求":{
# "投标地点":"投标地点",
# "投标文件交标方式":"交标方式",
# "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
# }
# }
# """,
1:"""
该招标文件中对投标文件的要求有哪些请以json格式给我返回结果外层键名为文中提到的有关投标文件的若干要求对于各子要求你可以用嵌套键值对组织回答其中嵌套键名为你对相关子要求的总结或是原文中的标题而最内层的键值应该完全与原文内容保持一致若存在多个并列内容那么键值为一个列表列表中包含若干描述该要求的字符串要求键值与原文内容一致不得总结删减示例如下仅供格式参考
{ {
"编写要求":"投标函的编写要求xxx法定代表人身份证明要求xx", "编写要求":"编写要求xxx",
"格式要求":{ "格式要求":{
"投标文件格式要求":"投标文件格式要求", "投标文件格式要求":"投标文件格式要求",
"标记要求":"投标文件标记要求", "标记要求":"投标文件标记要求",
"装订要求":"投标文件装订要求", "装订要求":"投标文件装订要求",
"文件数量":"投标文件文件数量要求" "文件数量":"投标文件文件数量要求"
}, },
"承诺书要求":"未知", "承诺书要求":"承诺书要求xxx",
"递交要求":{ "递交要求":{
"投标地点":"使用加密其投标文件的CA数字证书企业锁登录“电子交易系统”进入“开标大厅”选择所投标段进行签到并实时在线关注招标人的操作情况。", "递交投标文件的截止日期及递交地点":["递交投标文件的截止日期","递交投标文件的递交地点"],
"投标文件交标方式":"线上开标", "投标文件交标方式":"交标方式",
"投标文件的修改与撤回":"在投标人须知前附表规定的投标有效期内,投标人不得要求撤销或修改其投标文件。出现特殊情况需要延长投标有效期的,招标人以书面形式通知所有投标人延长投标有效期。投标人同意延长的,应相应延长其投标保证金的有效期,但不得要求或被允许修改或撤销其投标文件;投标人拒绝延长的,其投标失效,但投标人有权收回其投标保证金。" "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
}
} }
}
""", """,
2: """ # 2: """
该招标文件中开标评标定标要求(或磋商流程内容)是什么你需要从'开标''开标异议''评标''定标'四个角度回答其中'评标'可以从特殊情况的处置评标办法及流程评标委员会的组建角度来说明'定标'可以从定标流程履约能力的审查角度来说明请以json格式返回给我结果外层键名分别为'开标''开标异议''评标''定标'你可以用嵌套键值对组织回答嵌套键名为你对相关子要求的总结而嵌套键名应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'输出格式示例如下 # 该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明请以json格式返回给我结果外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下:
# {
# "开标":"招标文件关于项目开标的要求",
# "开标异议":"招标文件中关于开标异议的项",
# "评标":{
# "特殊情况的处置":"因“电子交易系统”系统故障导致无法投标的,交易中心及时通知招标人,招标人视情况决定是否顺延投标截止时间。因投标人自身原因导致无法完成投标的,由投标人自行承担后果。",
# "评标办法及流程":"评标流程",
# "评标委员会的组建":"评标由招标人依法组建的评标委员会负责。评标委员会由招标人或其委托的招标代理机构熟悉相关业务的代表,以及有关技术、经济等方面的专家组成。"
# },
# "定标":{
# "定标流程":"定标流程",
# "履约能力的审查":"履约能力的审查"
# }
# }
# """,
2:"""
该招标文件中开评定标要求(或磋商流程内容)是什么请以json格式给我返回结果外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求对于各子要求你可以用嵌套键值对组织回答其中嵌套键名为你对相关子要求的总结或是原文中的标题而最内层的键值应该完全与原文内容保持一致若存在多个并列内容那么键值为一个列表列表中包含若干描述该要求的字符串要求键值与原文内容一致不得总结删减示例如下仅供格式参考
{ {
"开标":"招标文件关于项目开标的要求", "开标":"招标文件关于项目开标的要求",
"开标异议":"招标文件中关于开标异议的项", "开标异议":"招标文件中关于开标异议的项",
@ -231,7 +267,7 @@ def get_requirements_with_gpt(invalid_path, selection):
} }
""", """,
3: """ 3: """
该招标文件中重新招标采购不再招标采购终止招标采购的情况分别是什么请以json格式返回给我结果键名分别为'重新招标''不再招标''终止招标'键值应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'示例输出如下 该招标文件中重新招标重新采购不再招标不再采购终止招标终止采购的情况分别是什么请以json格式返回给我结果键名分别为'重新招标''不再招标''终止招标'键值应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'示例输出如下
{ {
"重新招标":"有下列情形之一的招标人将重新招标1投标截止时间止投标人少于3个的2经评标委员会评审后否决所有投标的", "重新招标":"有下列情形之一的招标人将重新招标1投标截止时间止投标人少于3个的2经评标委员会评审后否决所有投标的",
"不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的属于必须审批或核准的工程建设项目经原审批或核准部门批准后不再进行招标。", "不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的属于必须审批或核准的工程建设项目经原审批或核准部门批准后不再进行招标。",
@ -242,6 +278,7 @@ def get_requirements_with_gpt(invalid_path, selection):
# 根据 selection 选择相应的 user_query # 根据 selection 选择相应的 user_query
user_query = user_queries.get(selection) user_query = user_queries.get(selection)
if not user_query: if not user_query:
return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"} return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
# 调用大模型并处理响应 # 调用大模型并处理响应
@ -250,4 +287,10 @@ def get_requirements_with_gpt(invalid_path, selection):
cleaned_res = clean_json_string(res) cleaned_res = clean_json_string(res)
return cleaned_res return cleaned_res
except Exception as e: except Exception as e:
return {"error": "调用大模型失败"} return {"error": "调用大模型失败"}
if __name__ == "__main__":
merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
selection=2
res=get_requirements_with_gpt(merged_baseinfo_path,selection)
print(json.dumps(res,ensure_ascii=False,indent=4))

View File

@ -95,7 +95,7 @@ def extract_text_by_page(file_path):
if __name__ == '__main__': if __name__ == '__main__':
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
@ -103,5 +103,5 @@ if __name__ == '__main__':
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
# ress = extract_common_header(file_path) # ress = extract_common_header(file_path)
# print(ress) # print(ress)
res=extract_text_by_page(input_path) res=extract_text_by_page(file_path)
# print(res)磋商文件_tobidders_notice_part2.pdf # print(res)磋商文件_tobidders_notice_part2.pdf

View File

@ -9,7 +9,7 @@ from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file from flask_app.general.通义千问long import upload_file
from flask_app.main.判断是否分包等 import read_questions_from_judge from flask_app.main.判断是否分包等 import read_questions_from_judge
def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, baseinfo_list1): def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1):
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
judge_consortium = judge_consortium_bidding(baseinfo_list1) judge_consortium = judge_consortium_bidding(baseinfo_list1)
if judge_consortium: if judge_consortium:
@ -18,7 +18,7 @@ def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, base
"外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"\"" "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"\""
) )
judge_questions.append(judge_consortium_question) judge_questions.append(judge_consortium_question)
file_id3 = upload_file(baseinfo_path) file_id3 = upload_file(merged_baseinfo_path)
res2 = multi_threading(judge_questions, "", file_id3, 2) res2 = multi_threading(judge_questions, "", file_id3, 2)
if not res2: if not res2:

View File

@ -143,7 +143,7 @@ def process_baseinfo_list(baseinfo_list, tobidders_notice):
return [] return []
def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path): def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path):
""" """
综合和处理基础信息生成最终的基础信息字典 综合和处理基础信息生成最终的基础信息字典
@ -173,7 +173,7 @@ def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_tabl
# 提交两个任务 # 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1) future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1)
future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice) future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice)
future3 = executor.submit(extract_from_notice, invalid_path, clause_path, 3) # 新增的多线程任务 future3 = executor.submit(extract_from_notice, merged_baseinfo_path_more, clause_path, 3) # 新增的多线程任务
# 等待两个任务完成并获取结果 # 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list1不需要返回值 future1.result() # process_judge_questions 直接修改 baseinfo_list1不需要返回值

View File

@ -1,11 +1,13 @@
# -*- encoding:utf-8 -*- # -*- encoding:utf-8 -*-
import json import json
import logging import logging
import os
import time import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from flask_app.general.投标人须知正文提取指定内容 import get_requirements_with_gpt from flask_app.general.投标人须知正文提取指定内容 import get_requirements_with_gpt
from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values from flask_app.general.json_utils import transform_json_values
@ -68,6 +70,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
merged_baseinfo_path=truncate_files[-1] merged_baseinfo_path=truncate_files[-1]
more_path=[merged_baseinfo_path,tobidders_notice]
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
merge_pdfs(more_path,merged_baseinfo_path_more)
clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json
end_time=time.time() end_time=time.time()
logger.info(f"文件预处理 done耗时{end_time - start_time:.2f}") logger.info(f"文件预处理 done耗时{end_time - start_time:.2f}")
@ -83,21 +88,24 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
'qualification': qualification, 'qualification': qualification,
'truncate0_jsonpath': truncate_jsonpath, 'truncate0_jsonpath': truncate_jsonpath,
'merged_baseinfo_path':merged_baseinfo_path, 'merged_baseinfo_path':merged_baseinfo_path,
'merged_baseinfo_path_more':merged_baseinfo_path_more,
'clause_path': clause_path, 'clause_path': clause_path,
'invalid_docpath': invalid_docpath 'invalid_docpath': invalid_docpath
} }
# 基本信息 # 基本信息
def fetch_project_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path): def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo_path_more,tobidders_notice_table, tobidders_notice, clause_path):
logger.info("starting 基础信息...") logger.info("starting 基础信息...")
start_time = time.time() start_time = time.time()
if not merged_baseinfo_path: if not merged_baseinfo_path:
merged_baseinfo_path = invalid_path merged_baseinfo_path = invalid_path
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
if not tobidders_notice_table: if not tobidders_notice_table:
tobidders_notice_table = invalid_path tobidders_notice_table = invalid_path
if not tobidders_notice: if not tobidders_notice:
tobidders_notice = invalid_path tobidders_notice = invalid_path
basic_res = combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path) basic_res = combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path)
end_time = time.time() end_time = time.time()
logger.info(f"基础信息 done耗时{end_time - start_time:.2f}") logger.info(f"基础信息 done耗时{end_time - start_time:.2f}")
return basic_res return basic_res
@ -145,21 +153,25 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat
# 投标文件要求 # 投标文件要求
def fetch_bidding_documents_requirements(invalid_path, clause_path): def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path_more,clause_path):
logger.info("starting 投标文件要求...") logger.info("starting 投标文件要求...")
start_time = time.time() start_time = time.time()
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
selection = 1 selection = 1
fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path, clause_path, selection) fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
end_time = time.time() end_time = time.time()
logger.info(f"投标文件要求 done耗时{end_time - start_time:.2f}") logger.info(f"投标文件要求 done耗时{end_time - start_time:.2f}")
return {"投标文件要求": fetch_bidding_documents_requirements_json} return {"投标文件要求": fetch_bidding_documents_requirements_json}
# 开评定标流程 # 开评定标流程
def fetch_bid_opening(invalid_path, clause_path): def fetch_bid_opening(invalid_path, merged_baseinfo_path_more,clause_path):
logger.info("starting 开评定标流程...") logger.info("starting 开评定标流程...")
start_time = time.time() start_time = time.time()
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
selection = 2 selection = 2
fetch_bid_opening_json = extract_from_notice(invalid_path, clause_path, selection) fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
end_time = time.time() end_time = time.time()
logger.info(f"开评定标流程 done耗时{end_time - start_time:.2f}") logger.info(f"开评定标流程 done耗时{end_time - start_time:.2f}")
return {"开评定标流程": fetch_bid_opening_json} return {"开评定标流程": fetch_bid_opening_json}
@ -177,7 +189,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
with concurrent.futures.ThreadPoolExecutor() as executor: with concurrent.futures.ThreadPoolExecutor() as executor:
# 立即启动不依赖 knowledge_name 和 index 的任务 # 立即启动不依赖 knowledge_name 和 index 的任务
futures = { futures = {
'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'], 'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],processed_data['merged_baseinfo_path_more'],
processed_data['tobidders_notice_table'], processed_data['tobidders_notice_table'],
processed_data['tobidders_notice'], processed_data['clause_path']), processed_data['tobidders_notice'], processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'], 'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'],
@ -189,8 +201,8 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder, processed_data['truncate0_jsonpath'], output_folder, processed_data['truncate0_jsonpath'],
processed_data['clause_path'], processed_data['qualification']), processed_data['clause_path'], processed_data['qualification']),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['clause_path']), 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'], processed_data['clause_path']) 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'])
} }
# 提前处理这些不依赖的任务,按完成顺序返回 # 提前处理这些不依赖的任务,按完成顺序返回

View File

@ -30,32 +30,32 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
pdf_document = PdfReader(file_path) pdf_document = PdfReader(file_path)
all_pages_text = [] all_pages_text = []
start_index = None start_index = None
# 处理所有页面 # 处理所有页面
for i, page in enumerate(pdf_document.pages): for i, page in enumerate(pdf_document.pages):
page_text = page.extract_text() if page.extract_text() else "" page_text = page.extract_text() if page.extract_text() else ""
cleaned_text = clean_page_content(page_text, common_header) cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# 在第一页查找开始位置 # 在第一页查找开始位置
if i == 0 and start_index is None: if i == 0 and start_index is None:
start_match = re.search(start_word, cleaned_text, re.MULTILINE) for pattern in (start_word if isinstance(start_word, list) else [start_word]):
if start_match: start_match = re.search(pattern, cleaned_text)
start_index = start_match.start() if start_match:
cleaned_text = cleaned_text[start_index:] start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
break # 找到一个匹配后跳出循环
# 在最后一页查找结束位置 # 在最后一页查找结束位置
if i == len(pdf_document.pages) - 1: if i == len(pdf_document.pages) - 1:
for pattern in end_pattern: matches = list(re.finditer(end_pattern, cleaned_text))
matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE)) if matches:
if matches: end_index = matches[-1].start()
end_index = matches[-1].start() cleaned_text = cleaned_text[:end_index]
cleaned_text = cleaned_text[:end_index]
break
all_pages_text.append(cleaned_text) all_pages_text.append(cleaned_text)
# 合并所有页面的文本 # 合并所有页面的文本
full_text = "\n".join(all_pages_text) full_text = "\n".join(all_pages_text)
# print(full_text)
return full_text return full_text
def extract_section(text, start_pattern, end_phrases): def extract_section(text, start_pattern, end_phrases):
# 查找开始模式 # 查找开始模式
@ -113,8 +113,16 @@ def parse_text_by_heading(text):
current_content = [] current_content = []
append_newline = False append_newline = False
lines = text.split('\n') lines = text.split('\n')
def check_year_pattern(line):
line_no_spaces = line.replace(' ', '')
return re.match(r'^\d{4}', line_no_spaces) and line_no_spaces[4:5] == ''
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n' for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
line_stripped = line.strip().replace('', '.') line_stripped = line.strip().replace('', '.')
# 检查是否以年份开头,如果是,属于上一个标题内容
if check_year_pattern(line_stripped) and current_key is not None:
current_content.append(line_stripped)
continue
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped) match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match: if not match:
@ -168,31 +176,58 @@ def convert_to_json(file_path, start_word, end_phrases):
parsed_data = parse_text_by_heading(text) parsed_data = parse_text_by_heading(text)
return parsed_data return parsed_data
def convert_clause_to_json(input_path,output_folder,type=1):
def convert_clause_to_json(input_path, output_folder, type=1):
if not os.path.exists(input_path): if not os.path.exists(input_path):
# print(f"The specified file does not exist: {input_path}")
return "" return ""
if type==1:
start_word = "投标人须知正文" if type == 1:
end_phrases = [ # start_word 和编译后的 end_pattern 用于 type=1
r'^第[一二三四五六七八九十]+章\s*评标办法', start_word = [
r'^评标办法前附表', re.compile(
r'^附(?:录|件|表)(?:一)?[:]' r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)',
re.MULTILINE
)
] ]
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|'
r'^附录(?:一)?[:]|'
r'^附件(?:一)?[:]|'
r'^附表(?:一)?[:]',
re.MULTILINE
)
else: else:
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:|投标邀请书|投标邀请函' start_word = [
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表'] re.compile(
result = convert_to_json(input_path, start_word, end_phrases) #过滤无关信息 r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
re.MULTILINE
),
re.compile(
r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$',
re.MULTILINE
)
]
end_pattern = re.compile(
r'第[一二三四五六七八九十]+章\s*投标人须知|'
r'投标人须知前附表',
re.MULTILINE
)
result = convert_to_json(input_path, start_word, end_pattern) # 过滤无关信息
# 检查输出文件夹是否存在,如果不存在则创建 # 检查输出文件夹是否存在,如果不存在则创建
if not os.path.exists(output_folder): if not os.path.exists(output_folder):
os.makedirs(output_folder) os.makedirs(output_folder)
print(f"Created output folder: {output_folder}") print(f"Created output folder: {output_folder}")
file_name = "clause1.json" if type == 1 else "clause2.json" file_name = "clause1.json" if type == 1 else "clause2.json"
output_path = os.path.join(output_folder, file_name) output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f: with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False) json.dump(result, f, indent=4, ensure_ascii=False)
# post_process_json(output_path)
# print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
return output_path return output_path
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20 # def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
@ -237,16 +272,16 @@ def convert_clause_to_json(input_path,output_folder,type=1):
# json.dump(processed_data, file, ensure_ascii=False, indent=4) # json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__": if __name__ == "__main__":
file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\东莞支队查验招标文件_tobidders_notice_part2_1-5.pdf' file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件一中多媒体报告厅教学设备_tobidders_notice_part1.pdf' # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件一中多媒体报告厅教学设备_tobidders_notice_part1.pdf'
# start_word = "投标人须知正文" # start_word = "投标人须知正文"
# end_phrases = [ # end_phrases = [
# r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', # r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
# r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:', # r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
# ] # ]
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp' output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
try: try:
output_path = convert_clause_to_json("",output_folder) output_path = convert_clause_to_json(file_path,output_folder)
print(f"Final JSON result saved to: {output_path}") print(f"Final JSON result saved to: {output_path}")
except ValueError as e: except ValueError as e:
print("Error:", e) print("Error:", e)

View File

@ -114,8 +114,8 @@ def dynamic_key_handling(key_groups, detected_keys):
if key not in group: if key not in group:
group.append(key) group.append(key)
def get_base_info(baseinfo_file_path,clause_path): def get_base_info(merged_baseinfo_path,clause_path):
file_id = upload_file(baseinfo_file_path) file_id = upload_file(merged_baseinfo_path)
baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt' baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
# baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt' # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
questions = read_questions_from_file(baseinfo_file_path) questions = read_questions_from_file(baseinfo_file_path)
@ -129,9 +129,9 @@ def get_base_info(baseinfo_file_path,clause_path):
# judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt' # judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交两个任务 # 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, baseinfo_file_path, future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, merged_baseinfo_path,
baseinfo_list) baseinfo_list)
future2 = executor.submit(extract_from_notice, baseinfo_file_path, clause_path, 3) # 新增的多线程任务 future2 = executor.submit(extract_from_notice, merged_baseinfo_path, clause_path, 3) # 新增的多线程任务
# 等待两个任务完成并获取结果 # 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list不需要返回值 future1.result() # process_judge_questions 直接修改 baseinfo_list不需要返回值
@ -158,14 +158,14 @@ def get_base_info(baseinfo_file_path,clause_path):
# baseinfo_list.append(clean_json_string(response)) # baseinfo_list.append(clean_json_string(response))
return baseinfo_list return baseinfo_list
def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,clause_path): def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path):
baseinfo_list = [] baseinfo_list = []
temp_list = [] temp_list = []
procurement_reqs = {} procurement_reqs = {}
# 定义一个线程函数来获取基础信息 # 定义一个线程函数来获取基础信息
def get_base_info_thread(): def get_base_info_thread():
nonlocal temp_list nonlocal temp_list
temp_list = get_base_info(baseinfo_file_path,clause_path) temp_list = get_base_info(merged_baseinfo_path,clause_path)
# 定义一个线程函数来获取采购需求 # 定义一个线程函数来获取采购需求
def fetch_procurement_reqs_thread(): def fetch_procurement_reqs_thread():
nonlocal procurement_reqs nonlocal procurement_reqs
@ -191,11 +191,11 @@ def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,
if __name__ == "__main__": if __name__ == "__main__":
start_time=time.time() start_time=time.time()
# baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf" # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
baseinfo_file_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf" merged_baseinfo_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
# procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf" # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
procurement_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx" procurement_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json' clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
res = combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path) res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
print("------------------------------------") print("------------------------------------")
print(json.dumps(res, ensure_ascii=False, indent=4)) print(json.dumps(res, ensure_ascii=False, indent=4))
end_time=time.time() end_time=time.time()

View File

@ -62,7 +62,7 @@ post_process 函数尝试将长字符串按特定模式分割成块,每块至
""" """
# 读取JSON数据提取内容转换结构并打印结果 # 读取JSON数据提取内容转换结构并打印结果
def extract_from_notice(invalid_path,clause_path, type): def extract_from_notice(merged_baseinfo_path,clause_path, type):
""" """
从公告中提取特定类型的内容 从公告中提取特定类型的内容
Args: Args:
@ -106,7 +106,7 @@ def extract_from_notice(invalid_path,clause_path, type):
transformed_data = process_with_outer_key(extracted_data) transformed_data = process_with_outer_key(extracted_data)
final_result = process_nested_data(transformed_data) final_result = process_nested_data(transformed_data)
if not final_result: if not final_result:
final_result = get_requirements_with_gpt(invalid_path, type) #万一没用正则匹配到,那就调用大模型 final_result = get_requirements_with_gpt(merged_baseinfo_path, type) #万一没用正则匹配到,那就调用大模型
return final_result return final_result
except Exception as e: except Exception as e:
@ -114,11 +114,11 @@ def extract_from_notice(invalid_path,clause_path, type):
return DEFAULT_RESULT return DEFAULT_RESULT
if __name__ == "__main__": if __name__ == "__main__":
file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json' clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\clause1.json'
invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile.pdf" merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json' # file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
try: try:
res = extract_from_notice(invalid_path,file_path, 1) # 可以改变此处的 type 参数测试不同的场景 res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景
res2=json.dumps(res,ensure_ascii=False,indent=4) res2=json.dumps(res,ensure_ascii=False,indent=4)
print(res2) print(res2)
except ValueError as e: except ValueError as e:

View File

@ -16,6 +16,7 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
# 从PDF文件中提取文本 # 从PDF文件中提取文本
common_header = extract_common_header(file_path) common_header = extract_common_header(file_path)
pdf_document = PdfReader(file_path) pdf_document = PdfReader(file_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
all_pages_text = [] all_pages_text = []
start_index = None start_index = None
# 处理所有页面 # 处理所有页面
@ -132,18 +133,35 @@ def parse_text_by_heading(text):
append_newline = False append_newline = False
skip_subheadings = False skip_subheadings = False
last_main_number = None last_main_number = None
temp_title = None # 临时存储以点号开头但不带数字的标题
lines = text.split('\n') lines = text.split('\n')
def check_year_pattern(line):
# 检查是否为年份模式,如 '2014年'
line_no_spaces = line.replace(' ', '')
return re.match(r'^\d{4}\s*年', line_no_spaces) is not None
for i, line in enumerate(lines): for i, line in enumerate(lines):
line_stripped = line.strip().replace('', '.') line_stripped = line.strip().replace('', '.')
# 如果在一个章节内,跳过年份模式的行
if check_year_pattern(line_stripped) and current_key is not None:
current_content.append(line_stripped)
continue
# 匹配带数字的标题,例如 '12.1 内容'
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped) match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match: if not match:
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped) match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
# 新增:处理以点号开头的情况 # 匹配以点号开头并带有数字的情况,例如 '.12.1 内容'
dot_match = re.match(r'^\.(\d+(?:\.\d+)*)\s*(.+)$', line_stripped) dot_match = re.match(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$', line_stripped)
# 新增:处理没有点号的纯数字开头的情况 # 匹配以点号开头但不带数字的情况,例如 '.投标文件的制作和签署'
dot_text_match = re.match(r'^[.]\s*(\D.+)$', line_stripped)
# 匹配不带点号的纯数字开头的情况,例如 '27xxxxx'
pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped) pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
if match: if match:
@ -160,6 +178,14 @@ def parse_text_by_heading(text):
data[current_key_chinese] = current_value_chinese data[current_key_chinese] = current_value_chinese
current_key_chinese = None current_key_chinese = None
# 处理临时标题与新主标题的关联
if temp_title:
# 提取主编号(第一个点前的数字)
main_number = new_key.split('.')[0]
# 关联临时标题到主编号
data[f"{main_number}."] = temp_title
temp_title = None # 重置临时标题
# 保存阿拉伯数字的标题内容 # 保存阿拉伯数字的标题内容
if current_key is None or (compare_headings(current_key, new_key) and ( if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')): len(current_content) == 0 or current_content[-1][-1] != '')):
@ -170,20 +196,13 @@ def parse_text_by_heading(text):
current_content = [line_content] current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2 append_newline = len(new_key.rstrip('.').split('.')) <= 2
last_main_number = new_key.split('.')[0] last_main_number = new_key.split('.')[0]
# if current_key is None or (current_key != new_key and ( #不给序号排序
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# last_main_number = new_key.split('.')[0]
else: else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
elif dot_match: elif dot_match:
# 处理以点号开头的情况 # 处理以点号开头并带有数字的情况
sub_number, line_content = dot_match.groups() sub_number, line_content = dot_match.groups()
sub_number = sub_number.replace('', '.')
if last_main_number: if last_main_number:
new_key = f"{last_main_number}.{sub_number}" new_key = f"{last_main_number}.{sub_number}"
if current_key is not None: if current_key is not None:
@ -194,7 +213,13 @@ def parse_text_by_heading(text):
append_newline = True append_newline = True
else: else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
elif pure_number_match: # 处理没有点号的纯数字开头的情况'27xxxxx'
elif dot_text_match:
# 处理以点号开头但不带数字的情况,存储到临时变量
temp_title = dot_text_match.group(1).strip()
continue # 跳过进一步处理该行
elif pure_number_match: # 处理不带点号的纯数字开头的情况,例如 '27xxxxx'
new_key_candidate, line_content = pure_number_match.groups() new_key_candidate, line_content = pure_number_match.groups()
new_key_candidate += '.' # 添加点号 new_key_candidate += '.' # 添加点号
line_content = line_content.lstrip('..、,') line_content = line_content.lstrip('..、,')
@ -208,6 +233,12 @@ def parse_text_by_heading(text):
# 判断新键名是否为新的标题条件是键名递增且差值在5以内 # 判断新键名是否为新的标题条件是键名递增且差值在5以内
if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)): if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)):
# 处理临时标题与新主标题的关联
if temp_title:
main_number = new_key_candidate.split('.')[0]
data[f"{main_number}."] = temp_title
temp_title = None # 重置临时标题
# 开始新的标题 # 开始新的标题
if current_key is not None: if current_key is not None:
content_string = ''.join(current_content).strip() content_string = ''.join(current_content).strip()
@ -216,19 +247,28 @@ def parse_text_by_heading(text):
current_content = [line_content] current_content = [line_content]
append_newline = True append_newline = True
last_main_number = new_key_candidate.rstrip('.') last_main_number = new_key_candidate.rstrip('.')
# if current_key is None or (current_key != new_key and ( #不给序号排序
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# last_main_number = new_key.split('.')[0]
else: else:
# 将当前行视为当前标题的内容 # 将当前行视为当前标题的内容
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
else: else:
if not skip_subheadings: if not skip_subheadings: # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑
# 合并了中文标题、字母标题、阿拉伯数字标题的匹配逻辑 pattern_title = re.compile(
pattern_title = re.compile(r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)') r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
chinese_match = pattern_title.match(line_stripped) chinese_match = pattern_title.match(line_stripped)
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped) letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped) arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
# 优化处理逻辑,减少冗余 # 优化处理逻辑,减少冗余
if chinese_match or letter_match or arabic_match: if chinese_match or letter_match or arabic_match:
# 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题) # 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题)
if current_key is not None: if current_key is not None:
@ -277,6 +317,9 @@ def parse_text_by_heading(text):
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None: if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese data[current_key_chinese] = current_value_chinese
if temp_title:
# 处理任何未关联的临时标题
data["未分类标题"] = temp_title # 您可以根据需要选择合适的键名
return data return data
@ -364,13 +407,13 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
print(f"The specified file does not exist: {file_path}") print(f"The specified file does not exist: {file_path}")
return "" return ""
if type == 1: if type == 1:
start_word = r'^\s*[(]?\s*[一1]\s*[)]?\s*[、.]*\s*(说\s*明|总\s*则)' start_word = r'^\s*(?:[(]?\s*[一12]?\s*[)]?\s*[、.]*\s*)?(说\s*明|总\s*则)'
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
text = extract_text_from_pdf(file_path, start_word, end_pattern) text = extract_text_from_pdf(file_path, start_word, end_pattern)
result = parse_text_by_heading(text) result = parse_text_by_heading(text)
else: else:
start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*' start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*)$'
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)' end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
text = extract_text_from_pdf(file_path, start_word, end_pattern) text = extract_text_from_pdf(file_path, start_word, end_pattern)
result=parse_text_to_dict(text) result=parse_text_to_dict(text)
# result = convert_to_json(input_path, start_word, end_pattern) # result = convert_to_json(input_path, start_word, end_pattern)
@ -407,9 +450,9 @@ def process_folder(input_folder, output_folder):
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理 #TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
if __name__ == "__main__": if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf' file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp' output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp'
try: try:
output_path = convert_clause_to_json(file_path,output_folder,1) output_path = convert_clause_to_json(file_path,output_folder,1)
print(f"Final JSON result saved to: {output_path}") print(f"Final JSON result saved to: {output_path}")

View File

@ -129,22 +129,26 @@ def fetch_invalid_requirements(invalid_docpath, output_folder):
logger.info(f"无效标与废标 done耗时{end_time - start_time:.2f}") logger.info(f"无效标与废标 done耗时{end_time - start_time:.2f}")
return find_invalid_res return find_invalid_res
def fetch_bidding_documents_requirements(invalid_path,merged_baseinfo_path,clause_path):
    """Extract the bid-document requirements and return them under "投标文件要求".

    Args:
        invalid_path: Fallback document path, used only when
            ``merged_baseinfo_path`` is falsy.
        merged_baseinfo_path: Preferred document path handed to
            ``extract_from_notice``.
        clause_path: Forwarded unchanged to ``extract_from_notice``.

    Returns:
        dict: ``{"投标文件要求": <value returned by extract_from_notice>}``.
    """
    logger.info("starting 投标文件要求...")
    # Fall back to invalid_path when no merged base-info file was produced.
    if not merged_baseinfo_path:
        merged_baseinfo_path = invalid_path
    start_time = time.time()
    # Selection code passed through to extract_from_notice
    # (NOTE(review): presumably selects the bid-document-requirements query — confirm).
    selection = 1
    fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path, clause_path, selection)
    end_time = time.time()
    logger.info(f"投标文件要求 done耗时{end_time - start_time:.2f}")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}
# Bid opening, evaluation and award process
def fetch_bid_opening(invalid_path,merged_baseinfo_path,clause_path):
    """Extract the bid opening/evaluation/award procedure and return it under "开评定标流程".

    Args:
        invalid_path: Fallback document path, used only when
            ``merged_baseinfo_path`` is falsy.
        merged_baseinfo_path: Preferred document path handed to
            ``extract_from_notice``.
        clause_path: Forwarded unchanged to ``extract_from_notice``.

    Returns:
        dict: ``{"开评定标流程": <value returned by extract_from_notice>}``.
    """
    logger.info("starting 开评定标流程...")
    # Fall back to invalid_path when no merged base-info file was produced.
    if not merged_baseinfo_path:
        merged_baseinfo_path = invalid_path
    start_time = time.time()
    # Selection code passed through to extract_from_notice
    # (NOTE(review): presumably selects the opening/evaluation query — confirm).
    selection = 2
    fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path, clause_path, selection)
    end_time = time.time()
    logger.info(f"开评定标流程 done耗时{end_time - start_time:.2f}")
    return {"开评定标流程": fetch_bid_opening_json}
@ -201,9 +205,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
processed_data['evaluation_method_path']), processed_data['evaluation_method_path']),
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder), output_folder),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],processed_data['merged_baseinfo_path'],
processed_data['clause_path']), processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['clause_path']), 'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path']),
'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'], 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'],
processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']), processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder, 'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder,