11.4
commit 9f2fceb8c2
parent 21351ddbe3
@@ -1,3 +1,5 @@
# -*- encoding:utf-8 -*-
import json
import re
from functools import cmp_to_key
@@ -196,26 +198,60 @@ def get_requirements_with_gpt(invalid_path, selection):
file_id = upload_file(invalid_path)
# 定义 selection 对应的用户查询
user_queries = {
1: """
该招标文件中对投标文件的要求是什么?你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答,其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明,'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明,请以json格式返回给我结果,外层键名分别为'编写要求','格式','承诺书要求','递交要求',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下:
# 1: """
# 该招标文件中对投标文件的要求是什么?你需要从'编写要求'、'格式要求'、'承诺书要求'、'递交要求'四个角度来回答,其中'格式'可以从投标文件格式要求、标记要求、装订要求、文件数量要求角度说明,而不一定一致;'递交要求'可以从投标地点、投标文件交标方式、投标文件的修改与撤回角度说明,而不一定一致;请以json格式返回给我结果,外层键名分别为'编写要求','格式','承诺书要求','递交要求',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下,内容与该文件无关:
# {
# "编写要求":"编写要求xxx",
# "格式要求":{
# "投标文件格式要求":"投标文件格式要求",
# "标记要求":"投标文件标记要求",
# "装订要求":"投标文件装订要求",
# "文件数量":"投标文件文件数量要求"
# },
# "承诺书要求":"承诺书要求xxx",
# "递交要求":{
# "投标地点":"投标地点",
# "投标文件交标方式":"交标方式",
# "投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
# }
# }
# """,
1:"""
该招标文件中对投标文件的要求有哪些?请以json格式给我返回结果,外层键名为文中提到的有关投标文件的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考:
{
"编写要求":"投标函的编写要求xxx;法定代表人身份证明要求xx",
"格式要求":{
"投标文件格式要求":"投标文件格式要求",
"标记要求":"投标文件标记要求",
"装订要求":"投标文件装订要求",
"文件数量":"投标文件文件数量要求"
},
"承诺书要求":"未知",
"递交要求":{
"投标地点":"使用加密其投标文件的CA数字证书(企业锁)登录“电子交易系统”,进入“开标大厅”选择所投标段进行签到,并实时在线关注招标人的操作情况。",
"投标文件交标方式":"线上开标",
"投标文件的修改与撤回":"在投标人须知前附表规定的投标有效期内,投标人不得要求撤销或修改其投标文件。出现特殊情况需要延长投标有效期的,招标人以书面形式通知所有投标人延长投标有效期。投标人同意延长的,应相应延长其投标保证金的有效期,但不得要求或被允许修改或撤销其投标文件;投标人拒绝延长的,其投标失效,但投标人有权收回其投标保证金。"
"编写要求":"编写要求xxx",
"格式要求":{
"投标文件格式要求":"投标文件格式要求",
"标记要求":"投标文件标记要求",
"装订要求":"投标文件装订要求",
"文件数量":"投标文件文件数量要求"
},
"承诺书要求":"承诺书要求xxx",
"递交要求":{
"递交投标文件的截止日期及递交地点":["递交投标文件的截止日期","递交投标文件的递交地点"],
"投标文件交标方式":"交标方式",
"投标文件的修改与撤回":["投标文件的修改相关要求","投标文件的撤回相关要求"]
}
}
}
""",
2: """
该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键名应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下:
# 2: """
# 该招标文件中开标、评标、定标要求(或磋商流程内容)是什么?你需要从'开标'、'开标异议'、'评标'、'定标'四个角度回答,其中'评标'可以从特殊情况的处置、评标办法及流程、评标委员会的组建角度来说明,'定标'可以从定标流程、履约能力的审查角度来说明,请以json格式返回给我结果,外层键名分别为'开标'、'开标异议'、'评标'、'定标',你可以用嵌套键值对组织回答,嵌套键名为你对相关子要求的总结,而嵌套键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。输出格式示例如下:
# {
# "开标":"招标文件关于项目开标的要求",
# "开标异议":"招标文件中关于开标异议的项",
# "评标":{
# "特殊情况的处置":"因“电子交易系统”系统故障导致无法投标的,交易中心及时通知招标人,招标人视情况决定是否顺延投标截止时间。因投标人自身原因导致无法完成投标的,由投标人自行承担后果。",
# "评标办法及流程":"评标流程",
# "评标委员会的组建":"评标由招标人依法组建的评标委员会负责。评标委员会由招标人或其委托的招标代理机构熟悉相关业务的代表,以及有关技术、经济等方面的专家组成。"
# },
# "定标":{
# "定标流程":"定标流程",
# "履约能力的审查":"履约能力的审查"
# }
# }
# """,
2:"""
该招标文件中开评定标要求(或磋商流程内容)是什么?请以json格式给我返回结果,外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考:
{
"开标":"招标文件关于项目开标的要求",
"开标异议":"招标文件中关于开标异议的项",
@@ -231,7 +267,7 @@ def get_requirements_with_gpt(invalid_path, selection):
}
""",
3: """
该招标文件中重新招标(或采购)、不再招标(或采购)、终止招标(或采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下:
该招标文件中重新招标(或重新采购)、不再招标(或不再采购)、终止招标(或终止采购)的情况分别是什么?请以json格式返回给我结果,键名分别为'重新招标','不再招标','终止招标',键值应该完全与原文内容保持一致,不得擅自总结删减,如果原文中未提及相关内容,在键值中填'未知'。示例输出如下:
{
"重新招标":"有下列情形之一的,招标人将重新招标:(1)投标截止时间止,投标人少于3个的;(2)经评标委员会评审后否决所有投标的;",
"不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的,属于必须审批或核准的工程建设项目,经原审批或核准部门批准后不再进行招标。",
@@ -242,6 +278,7 @@ def get_requirements_with_gpt(invalid_path, selection):
# 根据 selection 选择相应的 user_query
user_query = user_queries.get(selection)
if not user_query:
return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
# 调用大模型并处理响应
@@ -251,3 +288,9 @@ def get_requirements_with_gpt(invalid_path, selection):
return cleaned_res
except Exception as e:
return {"error": "调用大模型失败"}
if __name__ == "__main__":
merged_baseinfo_path = r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
selection=2
res=get_requirements_with_gpt(merged_baseinfo_path,selection)
print(json.dumps(res,ensure_ascii=False,indent=4))
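For readers skimming the diff, the control flow that get_requirements_with_gpt now follows reduces to the sketch below. It only illustrates the dispatch-and-fallback pattern visible in the hunks above; upload_file and the 通义千问 call are replaced by a stubbed ask_model parameter, so none of the helper names here are the project's real API.

import json

def get_requirements_sketch(pdf_path, selection, ask_model=None):
    # selection -> 提问模板(此处仅示意,完整提示词见上方 diff)
    user_queries = {1: "投标文件要求……", 2: "开评定标要求……", 3: "重新招标/不再招标/终止招标……"}
    user_query = user_queries.get(selection)
    if not user_query:
        return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
    try:
        raw = ask_model(pdf_path, user_query) if ask_model else "{}"   # 真实代码会上传文件并调用大模型
        return json.loads(raw)                                          # 对应清洗后的 cleaned_res
    except Exception:
        return {"error": "调用大模型失败"}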
@@ -95,7 +95,7 @@ def extract_text_by_page(file_path):
if __name__ == '__main__':
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
@@ -103,5 +103,5 @@ if __name__ == '__main__':
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
# ress = extract_common_header(file_path)
# print(ress)
res=extract_text_by_page(input_path)
res=extract_text_by_page(file_path)
# print(res)磋商文件_tobidders_notice_part2.pdf
@@ -9,7 +9,7 @@ from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.main.判断是否分包等 import read_questions_from_judge
def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, baseinfo_list1):
def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1):
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
judge_consortium = judge_consortium_bidding(baseinfo_list1)
if judge_consortium:
@@ -18,7 +18,7 @@ def process_judge_questions(judge_file_path, chosen_numbers, baseinfo_path, base
"外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
)
judge_questions.append(judge_consortium_question)
file_id3 = upload_file(baseinfo_path)
file_id3 = upload_file(merged_baseinfo_path)
res2 = multi_threading(judge_questions, "", file_id3, 2)
if not res2:
@@ -143,7 +143,7 @@ def process_baseinfo_list(baseinfo_list, tobidders_notice):
return []
def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path):
"""
综合和处理基础信息,生成最终的基础信息字典。
@@ -173,7 +173,7 @@ def combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_tabl
# 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1)
future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice)
future3 = executor.submit(extract_from_notice, invalid_path, clause_path, 3) # 新增的多线程任务
future3 = executor.submit(extract_from_notice, merged_baseinfo_path_more, clause_path, 3) # 新增的多线程任务
# 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list1,不需要返回值
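The executor block above now juggles three futures, one of which mutates baseinfo_list1 in place. A minimal stand-alone sketch of that wiring, with placeholder workers rather than the real process_judge_questions / process_baseinfo_list / extract_from_notice, looks like this:

from concurrent.futures import ThreadPoolExecutor

def combine_sketch():
    baseinfo_list1 = []
    with ThreadPoolExecutor(max_workers=3) as executor:
        f1 = executor.submit(lambda: baseinfo_list1.append({"联合体投标要求": "未知"}))  # 原地修改列表,无返回值
        f2 = executor.submit(lambda: [{"投标人须知": "..."}])                            # 返回补充的基础信息
        f3 = executor.submit(lambda: {"重新招标": "未知"})                                # 对应 extract_from_notice(..., 3)
        f1.result()                        # 只等待副作用完成
        baseinfo_list1.extend(f2.result())
        baseinfo_list1.append(f3.result())
    return baseinfo_list1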
@@ -1,11 +1,13 @@
# -*- encoding:utf-8 -*-
import json
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.general.投标人须知正文提取指定内容 import get_requirements_with_gpt
from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values
@@ -68,6 +70,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
merged_baseinfo_path=truncate_files[-1]
more_path=[merged_baseinfo_path,tobidders_notice]
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
merge_pdfs(more_path,merged_baseinfo_path_more)
clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json
end_time=time.time()
logger.info(f"文件预处理 done,耗时:{end_time - start_time:.2f} 秒")
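merge_pdfs itself is not part of this commit, so the following is only a guess at an equivalent, written with pypdf, to show what merged_baseinfo_path_more is: the merged base-info PDF with the 投标人须知 section appended behind it.

import os
from pypdf import PdfWriter

def merge_pdfs_sketch(paths, output_path):
    writer = PdfWriter()
    for p in paths:
        if p and os.path.exists(p):
            writer.append(p)              # 依次整本追加
    with open(output_path, "wb") as f:
        writer.write(f)

# 用法与上方新增代码对应(假设这两个路径已存在):
# merge_pdfs_sketch([merged_baseinfo_path, tobidders_notice], merged_baseinfo_path_more)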
@@ -83,21 +88,24 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
'qualification': qualification,
'truncate0_jsonpath': truncate_jsonpath,
'merged_baseinfo_path':merged_baseinfo_path,
'merged_baseinfo_path_more':merged_baseinfo_path_more,
'clause_path': clause_path,
'invalid_docpath': invalid_docpath
}
# 基本信息
def fetch_project_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo_path_more,tobidders_notice_table, tobidders_notice, clause_path):
logger.info("starting 基础信息...")
start_time = time.time()
if not merged_baseinfo_path:
merged_baseinfo_path = invalid_path
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
if not tobidders_notice_table:
tobidders_notice_table = invalid_path
if not tobidders_notice:
tobidders_notice = invalid_path
basic_res = combine_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path)
basic_res = combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders_notice_table, tobidders_notice, clause_path)
end_time = time.time()
logger.info(f"基础信息 done,耗时:{end_time - start_time:.2f} 秒")
return basic_res
@@ -145,21 +153,25 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat
# 投标文件要求
def fetch_bidding_documents_requirements(invalid_path, clause_path):
def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path_more,clause_path):
logger.info("starting 投标文件要求...")
start_time = time.time()
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
selection = 1
fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path, clause_path, selection)
fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
end_time = time.time()
logger.info(f"投标文件要求 done,耗时:{end_time - start_time:.2f} 秒")
return {"投标文件要求": fetch_bidding_documents_requirements_json}
# 开评定标流程
def fetch_bid_opening(invalid_path, clause_path):
def fetch_bid_opening(invalid_path, merged_baseinfo_path_more,clause_path):
logger.info("starting 开评定标流程...")
start_time = time.time()
if not merged_baseinfo_path_more:
merged_baseinfo_path_more=invalid_path
selection = 2
fetch_bid_opening_json = extract_from_notice(invalid_path, clause_path, selection)
fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection)
end_time = time.time()
logger.info(f"开评定标流程 done,耗时:{end_time - start_time:.2f} 秒")
return {"开评定标流程": fetch_bid_opening_json}
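fetch_bidding_documents_requirements and fetch_bid_opening now share the same shape: fall back to invalid_path when the merged PDF is missing, time the extract_from_notice call, and wrap the result under a fixed key. A generic sketch of that repeated pattern (extract is a stubbed parameter, logging reduced to print):

import time

def fetch_section_sketch(invalid_path, merged_baseinfo_path_more, clause_path, selection, key,
                         extract=lambda path, clause, sel: {}):
    path = merged_baseinfo_path_more or invalid_path    # 缺省回退到 invalid_path
    start = time.time()
    result = extract(path, clause_path, selection)
    print(f"{key} done,耗时:{time.time() - start:.2f} 秒")
    return {key: result}

# 例如:fetch_section_sketch(invalid_path, merged_baseinfo_path_more, clause_path, 1, "投标文件要求", extract_from_notice)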
@@ -177,7 +189,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
with concurrent.futures.ThreadPoolExecutor() as executor:
# 立即启动不依赖 knowledge_name 和 index 的任务
futures = {
'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],
'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],processed_data['merged_baseinfo_path_more'],
processed_data['tobidders_notice_table'],
processed_data['tobidders_notice'], processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'],
@@ -189,8 +201,8 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder, processed_data['truncate0_jsonpath'],
processed_data['clause_path'], processed_data['qualification']),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'], processed_data['clause_path'])
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'])
}
# 提前处理这些不依赖的任务,按完成顺序返回
@@ -30,32 +30,32 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
pdf_document = PdfReader(file_path)
all_pages_text = []
start_index = None
# 处理所有页面
for i, page in enumerate(pdf_document.pages):
page_text = page.extract_text() if page.extract_text() else ""
cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# 在第一页查找开始位置
if i == 0 and start_index is None:
start_match = re.search(start_word, cleaned_text, re.MULTILINE)
if start_match:
start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
for pattern in (start_word if isinstance(start_word, list) else [start_word]):
start_match = re.search(pattern, cleaned_text)
if start_match:
start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
break # 找到一个匹配后跳出循环
# 在最后一页查找结束位置
if i == len(pdf_document.pages) - 1:
for pattern in end_pattern:
matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
break
matches = list(re.finditer(end_pattern, cleaned_text))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
all_pages_text.append(cleaned_text)
# 合并所有页面的文本
full_text = "\n".join(all_pages_text)
# print(full_text)
return full_text
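The trimming logic that changed in extract_text_from_pdf (start_word may now be a list of compiled patterns, end_pattern a single compiled pattern) can be exercised without a PDF. The sketch below reproduces just the first-page / last-page slicing shown in the hunk above:

import re

def trim_pages_sketch(pages, start_word, end_pattern):
    out = []
    start_found = False
    for i, text in enumerate(pages):
        if i == 0 and not start_found:
            for pattern in (start_word if isinstance(start_word, list) else [start_word]):
                m = re.search(pattern, text)
                if m:                              # 第一页:从首个命中的起始模式截起
                    text = text[m.start():]
                    start_found = True
                    break
        if i == len(pages) - 1:
            matches = list(re.finditer(end_pattern, text))
            if matches:                            # 最后一页:截到最后一次命中的结束模式
                text = text[:matches[-1].start()]
        out.append(text)
    return "\n".join(out)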
def extract_section(text, start_pattern, end_phrases):
# 查找开始模式
@@ -113,8 +113,16 @@ def parse_text_by_heading(text):
current_content = []
append_newline = False
lines = text.split('\n')
def check_year_pattern(line):
line_no_spaces = line.replace(' ', '')
return re.match(r'^\d{4}', line_no_spaces) and line_no_spaces[4:5] == '年'
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
line_stripped = line.strip().replace('.', '.')
# 检查是否以年份开头,如果是,属于上一个标题内容
if check_year_pattern(line_stripped) and current_key is not None:
current_content.append(line_stripped)
continue
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match:
@@ -168,31 +176,58 @@ def convert_to_json(file_path, start_word, end_phrases):
parsed_data = parse_text_by_heading(text)
return parsed_data
def convert_clause_to_json(input_path,output_folder,type=1):
def convert_clause_to_json(input_path, output_folder, type=1):
if not os.path.exists(input_path):
# print(f"The specified file does not exist: {input_path}")
return ""
if type==1:
start_word = "投标人须知正文"
end_phrases = [
r'^第[一二三四五六七八九十]+章\s*评标办法',
r'^评标办法前附表',
r'^附(?:录|件|表)(?:一)?[::]'
if type == 1:
# start_word 和编译后的 end_pattern 用于 type=1
start_word = [
re.compile(
r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)',
re.MULTILINE
)
]
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|'
r'^附录(?:一)?[::]|'
r'^附件(?:一)?[::]|'
r'^附表(?:一)?[::]',
re.MULTILINE
)
else:
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:|投标邀请书|投标邀请函'
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
result = convert_to_json(input_path, start_word, end_phrases) #过滤无关信息
start_word = [
re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
re.MULTILINE
),
re.compile(
r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$',
re.MULTILINE
)
]
end_pattern = re.compile(
r'第[一二三四五六七八九十]+章\s*投标人须知|'
r'投标人须知前附表',
re.MULTILINE
)
result = convert_to_json(input_path, start_word, end_pattern) # 过滤无关信息
# 检查输出文件夹是否存在,如果不存在则创建
if not os.path.exists(output_folder):
os.makedirs(output_folder)
print(f"Created output folder: {output_folder}")
file_name = "clause1.json" if type == 1 else "clause2.json"
output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False)
# post_process_json(output_path)
# print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
return output_path
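A quick, self-contained way to see what the new compiled patterns for type=1 accept; the sample headings below are made up for the demo, while the regexes are copied from the hunk above:

import re

start_word = re.compile(
    r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
    r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)',
    re.MULTILINE
)
end_pattern = re.compile(
    r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
    r'^评标办法前附表|'
    r'^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
    re.MULTILINE
)

for line in ["(一)说明", "一、总则", "供应商须知正文", "第三章 评标办法", "附表一:报价表"]:
    print(line, bool(start_word.match(line)), bool(end_pattern.match(line)))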
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
@@ -237,16 +272,16 @@ def convert_clause_to_json(input_path,output_folder,type=1):
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\东莞支队查验招标文件_tobidders_notice_part2_1-5.pdf'
file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part1.pdf'
# start_word = "投标人须知正文"
# end_phrases = [
# r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
# r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
# ]
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp'
output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
try:
output_path = convert_clause_to_json("",output_folder)
output_path = convert_clause_to_json(file_path,output_folder)
print(f"Final JSON result saved to: {output_path}")
except ValueError as e:
print("Error:", e)
@@ -114,8 +114,8 @@ def dynamic_key_handling(key_groups, detected_keys):
if key not in group:
group.append(key)
def get_base_info(baseinfo_file_path,clause_path):
file_id = upload_file(baseinfo_file_path)
def get_base_info(merged_baseinfo_path,clause_path):
file_id = upload_file(merged_baseinfo_path)
baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
# baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
questions = read_questions_from_file(baseinfo_file_path)
@@ -129,9 +129,9 @@ def get_base_info(baseinfo_file_path,clause_path):
# judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, baseinfo_file_path,
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, merged_baseinfo_path,
baseinfo_list)
future2 = executor.submit(extract_from_notice, baseinfo_file_path, clause_path, 3) # 新增的多线程任务
future2 = executor.submit(extract_from_notice, merged_baseinfo_path, clause_path, 3) # 新增的多线程任务
# 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list,不需要返回值
@@ -158,14 +158,14 @@ def get_base_info(baseinfo_file_path,clause_path):
# baseinfo_list.append(clean_json_string(response))
return baseinfo_list
def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,clause_path):
def combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath,clause_path):
baseinfo_list = []
temp_list = []
procurement_reqs = {}
# 定义一个线程函数来获取基础信息
def get_base_info_thread():
nonlocal temp_list
temp_list = get_base_info(baseinfo_file_path,clause_path)
temp_list = get_base_info(merged_baseinfo_path,clause_path)
# 定义一个线程函数来获取采购需求
def fetch_procurement_reqs_thread():
nonlocal procurement_reqs
@@ -191,11 +191,11 @@ def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,
if __name__ == "__main__":
start_time=time.time()
# baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
baseinfo_file_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
merged_baseinfo_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
# procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
procurement_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
clause_path='D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
res = combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path)
res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
print("------------------------------------")
print(json.dumps(res, ensure_ascii=False, indent=4))
end_time=time.time()
@@ -62,7 +62,7 @@ post_process 函数尝试将长字符串按特定模式分割成块,每块至
"""
# 读取JSON数据,提取内容,转换结构,并打印结果
def extract_from_notice(invalid_path,clause_path, type):
def extract_from_notice(merged_baseinfo_path,clause_path, type):
"""
从公告中提取特定类型的内容。
Args:
@@ -106,7 +106,7 @@ def extract_from_notice(invalid_path,clause_path, type):
transformed_data = process_with_outer_key(extracted_data)
final_result = process_nested_data(transformed_data)
if not final_result:
final_result = get_requirements_with_gpt(invalid_path, type) #万一没用正则匹配到,那就调用大模型
final_result = get_requirements_with_gpt(merged_baseinfo_path, type) #万一没用正则匹配到,那就调用大模型
return final_result
except Exception as e:
@@ -114,11 +114,11 @@ def extract_from_notice(invalid_path,clause_path, type):
return DEFAULT_RESULT
if __name__ == "__main__":
file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile.pdf"
clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\clause1.json'
merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
try:
res = extract_from_notice(invalid_path,file_path, 1) # 可以改变此处的 type 参数测试不同的场景
res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景
res2=json.dumps(res,ensure_ascii=False,indent=4)
print(res2)
except ValueError as e:
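The behavioural core of this file's change is the fallback inside extract_from_notice: parse the clause JSON first, and only call get_requirements_with_gpt on merged_baseinfo_path when nothing was matched. Stripped of the project helpers (both are stubbed parameters here), the skeleton is:

def extract_from_notice_sketch(merged_baseinfo_path, clause_path, type,
                               parse=lambda clause: {}, ask_gpt=lambda path, t: {}):
    try:
        final_result = parse(clause_path)                       # 先走正则/条款 JSON 解析
        if not final_result:
            final_result = ask_gpt(merged_baseinfo_path, type)  # 万一没匹配到,调用大模型兜底
        return final_result
    except Exception:
        return {}                                               # 对应 DEFAULT_RESULT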
@@ -16,6 +16,7 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
# 从PDF文件中提取文本
common_header = extract_common_header(file_path)
pdf_document = PdfReader(file_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
all_pages_text = []
start_index = None
# 处理所有页面
@@ -132,18 +133,35 @@ def parse_text_by_heading(text):
append_newline = False
skip_subheadings = False
last_main_number = None
temp_title = None # 临时存储以点号开头但不带数字的标题
lines = text.split('\n')
def check_year_pattern(line):
# 检查是否为年份模式,如 '2014年'
line_no_spaces = line.replace(' ', '')
return re.match(r'^\d{4}\s*年', line_no_spaces) is not None
for i, line in enumerate(lines):
line_stripped = line.strip().replace('.', '.')
# 如果在一个章节内,跳过年份模式的行
if check_year_pattern(line_stripped) and current_key is not None:
current_content.append(line_stripped)
continue
# 匹配带数字的标题,例如 '12.1 内容'
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match:
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
# 新增:处理以点号开头的情况
dot_match = re.match(r'^\.(\d+(?:\.\d+)*)\s*(.+)$', line_stripped)
# 匹配以点号开头并带有数字的情况,例如 '.12.1 内容'
dot_match = re.match(r'^[..](\d+(?:[..]\d+)*)\s*(.+)$', line_stripped)
# 新增:处理没有点号的纯数字开头的情况
# 匹配以点号开头但不带数字的情况,例如 '.投标文件的制作和签署'
dot_text_match = re.match(r'^[..]\s*(\D.+)$', line_stripped)
# 匹配不带点号的纯数字开头的情况,例如 '27xxxxx'
pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
if match:
@@ -160,6 +178,14 @@ def parse_text_by_heading(text):
data[current_key_chinese] = current_value_chinese
current_key_chinese = None
# 处理临时标题与新主标题的关联
if temp_title:
# 提取主编号(第一个点前的数字)
main_number = new_key.split('.')[0]
# 关联临时标题到主编号
data[f"{main_number}."] = temp_title
temp_title = None # 重置临时标题
# 保存阿拉伯数字的标题内容
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '第')):
@@ -170,20 +196,13 @@ def parse_text_by_heading(text):
current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2
last_main_number = new_key.split('.')[0]
# if current_key is None or (current_key != new_key and ( #不给序号排序
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# last_main_number = new_key.split('.')[0]
else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
elif dot_match:
# 处理以点号开头的情况
# 处理以点号开头并带有数字的情况
sub_number, line_content = dot_match.groups()
sub_number = sub_number.replace('.', '.')
if last_main_number:
new_key = f"{last_main_number}.{sub_number}"
if current_key is not None:
@@ -194,7 +213,13 @@ def parse_text_by_heading(text):
append_newline = True
else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
elif pure_number_match: # 处理没有点号的纯数字开头的情况'27xxxxx'
elif dot_text_match:
# 处理以点号开头但不带数字的情况,存储到临时变量
temp_title = dot_text_match.group(1).strip()
continue # 跳过进一步处理该行
elif pure_number_match: # 处理不带点号的纯数字开头的情况,例如 '27xxxxx'
new_key_candidate, line_content = pure_number_match.groups()
new_key_candidate += '.' # 添加点号
line_content = line_content.lstrip('..、,')
@@ -208,6 +233,12 @@ def parse_text_by_heading(text):
# 判断新键名是否为新的标题,条件是键名递增且差值在5以内
if (current_key is None or (new_key_num > current_key_num and new_key_num - current_key_num <= 5)):
# 处理临时标题与新主标题的关联
if temp_title:
main_number = new_key_candidate.split('.')[0]
data[f"{main_number}."] = temp_title
temp_title = None # 重置临时标题
# 开始新的标题
if current_key is not None:
content_string = ''.join(current_content).strip()
@@ -216,19 +247,28 @@ def parse_text_by_heading(text):
current_content = [line_content]
append_newline = True
last_main_number = new_key_candidate.rstrip('.')
# if current_key is None or (current_key != new_key and ( #不给序号排序
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# last_main_number = new_key.split('.')[0]
else:
# 将当前行视为当前标题的内容
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
else:
if not skip_subheadings:
# 合并了中文标题、字母标题、阿拉伯数字标题的匹配逻辑
pattern_title = re.compile(r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
if not skip_subheadings: # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑
pattern_title = re.compile(
r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
chinese_match = pattern_title.match(line_stripped)
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
# 优化了处理逻辑,减少冗余
# 优化处理逻辑,减少冗余
if chinese_match or letter_match or arabic_match:
# 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题)
if current_key is not None:
@@ -277,6 +317,9 @@ def parse_text_by_heading(text):
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
if temp_title:
# 处理任何未关联的临时标题
data["未分类标题"] = temp_title # 您可以根据需要选择合适的键名
return data
@@ -364,13 +407,13 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
print(f"The specified file does not exist: {file_path}")
return ""
if type == 1:
start_word = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)'
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
text = extract_text_from_pdf(file_path, start_word, end_pattern)
result = parse_text_by_heading(text)
else:
start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*'
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$'
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
text = extract_text_from_pdf(file_path, start_word, end_pattern)
result=parse_text_to_dict(text)
# result = convert_to_json(input_path, start_word, end_pattern)
@@ -407,9 +450,9 @@ def process_folder(input_folder, output_folder):
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf'
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp'
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp'
try:
output_path = convert_clause_to_json(file_path,output_folder,1)
print(f"Final JSON result saved to: {output_path}")
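Two of the new rules in parse_text_by_heading are easy to miss in the hunks above: lines that start with a year are appended to the previous heading's content, and a dot-prefixed line without a number (e.g. '.投标文件的制作和签署') is parked in temp_title until the next main number appears, then stored under 'N.'. The toy parser below reproduces only these two rules, not the full function:

import re

def mini_parse(lines):
    data, current_key, temp_title = {}, None, None
    for line in (l.strip() for l in lines):
        if re.match(r'^\d{4}\s*年', line.replace(' ', '')) and current_key:
            data[current_key] += line                        # 年份行并入上一标题内容
            continue
        dot_text = re.match(r'^[..]\s*(\D.+)$', line)
        if dot_text:
            temp_title = dot_text.group(1).strip()           # 暂存不带数字的点号标题
            continue
        m = re.match(r'^(\d+(?:\.\d+)*)\s*(.*)', line)
        if m:
            current_key = m.group(1)
            if temp_title:                                   # 挂到主编号 "N." 之下
                data[f"{current_key.split('.')[0]}."] = temp_title
                temp_title = None
            data[current_key] = m.group(2)
        elif current_key:
            data[current_key] += line
    return data

print(mini_parse([".投标文件的制作和签署", "27.1 密封要求", "2024年 开标"]))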
@@ -129,22 +129,26 @@ def fetch_invalid_requirements(invalid_docpath, output_folder):
logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒")
return find_invalid_res
def fetch_bidding_documents_requirements(invalid_path,clause_path):
def fetch_bidding_documents_requirements(invalid_path,merged_baseinfo_path,clause_path):
logger.info("starting 投标文件要求...")
if not merged_baseinfo_path:
merged_baseinfo_path=invalid_path
start_time = time.time()
selection=1
fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path,clause_path, selection)
fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path,clause_path, selection)
end_time = time.time()
logger.info(f"投标文件要求 done,耗时:{end_time - start_time:.2f} 秒")
return {"投标文件要求": fetch_bidding_documents_requirements_json}
# 开评定标流程
def fetch_bid_opening(invalid_path,clause_path):
def fetch_bid_opening(invalid_path,merged_baseinfo_path,clause_path):
logger.info("starting 开评定标流程...")
if not merged_baseinfo_path:
merged_baseinfo_path=invalid_path
start_time = time.time()
selection=2
fetch_bid_opening_json = extract_from_notice(invalid_path,clause_path, selection)
fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path,clause_path, selection)
end_time = time.time()
logger.info(f"开评定标流程 done,耗时:{end_time - start_time:.2f} 秒")
return {"开评定标流程": fetch_bid_opening_json}
@@ -201,9 +205,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
processed_data['evaluation_method_path']),
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],processed_data['merged_baseinfo_path'],
processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path']),
'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'],
processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder,