This commit is contained in:
zy123 2024-10-31 15:03:32 +08:00
parent 0288a27d1c
commit 7d3bca17b3
13 changed files with 197 additions and 188 deletions

View File

@ -1,5 +1,7 @@
import json
import os
import time
import requests
from flask_app.main.download import download_file

View File

@ -215,7 +215,7 @@ def get_requirements_with_gpt(invalid_path, selection):
}
""",
2: """
该招标文件中开评定标要求是什么你需要从'开标''开标异议''评标''定标'四个角度回答其中'评标'可以从特殊情况的处置评标办法及流程评标委员会的组建角度来说明'定标'可以从定标流程履约能力的审查角度来说明请以json格式返回给我结果外层键名分别为'开标''开标异议''评标''定标'你可以用嵌套键值对组织回答嵌套键名为你对相关子要求的总结而嵌套键名应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'输出格式示例如下
该招标文件中开定标要求(或磋商流程内容)是什么你需要从'开标''开标异议''评标''定标'四个角度回答其中'评标'可以从特殊情况的处置评标办法及流程评标委员会的组建角度来说明'定标'可以从定标流程履约能力的审查角度来说明请以json格式返回给我结果外层键名分别为'开标''开标异议''评标''定标'你可以用嵌套键值对组织回答嵌套键名为你对相关子要求的总结而嵌套键名应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'输出格式示例如下
{
"开标":"招标文件关于项目开标的要求",
"开标异议":"招标文件中关于开标异议的项",
@ -231,7 +231,7 @@ def get_requirements_with_gpt(invalid_path, selection):
}
""",
3: """
该招标文件中重新招标不再招标终止招标的情况分别是什么请以json格式返回给我结果键名分别为'重新招标''不再招标''终止招标'键值应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'示例输出如下
该招标文件中重新招标或采购不再招标或采购终止招标或采购的情况分别是什么请以json格式返回给我结果键名分别为'重新招标''不再招标''终止招标'键值应该完全与原文内容保持一致不得擅自总结删减如果原文中未提及相关内容在键值中填'未知'示例输出如下
{
"重新招标":"有下列情形之一的招标人将重新招标1投标截止时间止投标人少于3个的2经评标委员会评审后否决所有投标的",
"不再招标":"重新招标后投标人仍少于3个或者所有投标被否决的属于必须审批或核准的工程建设项目经原审批或核准部门批准后不再进行招标。",

View File

@ -149,12 +149,12 @@ def extract_text_by_page(file_path):
if __name__ == '__main__':
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf'
file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest18.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
# ress = extract_common_header(file_path)
# print(ress)
res=extract_text_by_page(file_path)

View File

@ -47,13 +47,13 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
else:
logger.error("Unsupported file type provided. Preprocessing halted.")
return None
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id)
# 处理各个部分
tobidders_notice_table=truncate_files[0]
truncate0_docpath = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx
truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json
tobidders_notice = truncate_files[1] #投标人须知正文
@ -86,26 +86,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
'clause_path': clause_path,
'invalid_docpath': invalid_docpath
}
def post_processing(data, includes):
    """Split ``data`` into whitelisted top-level keys and an "其他" bucket.

    Keys found in ``includes`` stay at the top level of the returned dict;
    every other key/value pair is nested under the "其他" key. The "其他"
    entry is dropped from the result when nothing was placed in it.

    Args:
        data: Mapping to regroup.
        includes: Container of keys that stay at the top level.

    Returns:
        A new dict holding the regrouped entries.
    """
    grouped = {"其他": {}}
    for name in data:
        # Whitelisted keys go to the top level, the rest into the bucket.
        target = grouped if name in includes else grouped["其他"]
        target[name] = data[name]
    if not grouped["其他"]:
        # Nothing was left over, so do not expose an empty bucket.
        grouped.pop("其他")
    return grouped
# 基本信息
def fetch_project_basic_info(invalid_path, merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path):
@ -242,8 +222,9 @@ if __name__ == "__main__":
file_type = 2 #1:docx 2:pdf 3:其他
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
print("yes")
for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
print(output)
# for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
# print(output)
preprocess_files(output_folder,input_file,2,"121")
end_time = time.time()
elapsed_time = end_time - start_time # 计算耗时
print(f"Function execution took {elapsed_time} seconds.")

View File

@ -1,5 +1,7 @@
import re
import os
import time
from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.merge_pdfs import merge_pdfs
import concurrent.futures
@ -86,7 +88,6 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
before_doc = PdfWriter()
toc_page = -1
# 查找目录页
for page_num in range(min(start_page, total_pages)):
page_text = pdf_document.pages[page_num].extract_text()
@ -97,12 +98,13 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
# 确定截取的页数
pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
# 提取页面
for page_num in range(pages_to_extract):
before_doc.add_page(pdf_document.pages[page_num])
print(before_pdf_path)
with open(before_pdf_path, 'wb') as f_before:
before_doc.write(f_before)
# print(f"已保存页面从 0 到 {pages_to_extract} 为 {before_pdf_path}")
# 提取指定范围的页面
@ -113,7 +115,6 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
# 保存新的PDF文件
with open(output_pdf_path, 'wb') as f_output:
output_doc.write(f_output)
print(f"{output_suffix} 已截取并保存页面从 {start_page + 1}{end_page + 1}{output_pdf_path}")
return output_pdf_path
@ -246,17 +247,11 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
text = page.extract_text()
if text:
cleaned_text = clean_page_content(text, common_header)
if not is_secondary_match:
if re.search(begin_pattern, cleaned_text) and i >= begin_page:
if output_suffix == "invalid" and start_page:
continue
else:
start_page = i
else:
if re.search(begin_pattern, cleaned_text) and i >=begin_page:
if output_suffix == "notice" and start_page: #考虑投标人须知前附表中有' 同招标公告 '的情况
pass
else:
if re.search(begin_pattern, cleaned_text) and i >= begin_page:
if not start_page:
if (output_suffix == "notice" or output_suffix == "invalid") and is_secondary_match:
pass # 针对 '同招标公告' 的情况
elif output_suffix not in ["notice", "invalid"] or not is_secondary_match:
start_page = i
if start_page is not None and re.search(end_pattern, cleaned_text):
condition = i > start_page
@ -300,7 +295,10 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
def get_start_and_common_header(input_path):
common_header = extract_common_header(input_path)
last_begin_index = 0
begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$',re.MULTILINE)
begin_pattern = re.compile(
r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$',
re.MULTILINE
)
pdf_document = PdfReader(input_path)
for i, page in enumerate(pdf_document.pages):
if i > 25:
@ -366,11 +364,11 @@ def truncate_pdf_main(input_path, output_folder, selection):
pattern_pairs = [
(
re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*|^第一卷|^投标邀请书'),
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
),
(
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE),
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$',re.MULTILINE),
re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE)
)
]
@ -381,11 +379,11 @@ def truncate_pdf_main(input_path, output_folder, selection):
pattern_pairs = [
(
re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*|^第一卷|^投标邀请书'),
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
),
(
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE),
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷|投标邀请)\s*$', re.MULTILINE),
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
)
]
@ -496,6 +494,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix,common_header,las
print(f"Error in extract_pages_twice: {e}")
return []
def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name):
"""
合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件
@ -550,28 +549,39 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
else:
print(f"没有找到以 '{suffix}' 结尾的文件。")
print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
if not all_pdfs_to_merge:
print("没有找到要合并的 PDF 文件。")
return ""
# 过滤掉不存在或为空的文件路径
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)]
if not all_pdfs_to_merge:
print("没有有效的 PDF 文件需要合并")
print("没有找到要合并的 PDF 文件。")
return ""
# 调用 merge_pdfs 函数进行合并
try:
merge_pdfs(all_pdfs_to_merge, output_path)
print(f"已成功合并 PDF 文件到 '{output_path}'")
return output_path
except Exception as e:
print(f"合并 PDF 文件时出错: {e}")
return ""
# 检查合并后的文件是否存在且不为空
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
# 合并成功,删除 {base_file_name}_before.pdf 文件
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
if os.path.exists(before_pdf_path):
try:
os.remove(before_pdf_path)
print(f"已删除文件: {before_pdf_path}")
except Exception as e:
print(f"删除文件 {before_pdf_path} 时出错: {e}")
else:
print(f"未找到要删除的文件: {before_pdf_path}")
return output_path
else:
print(f"合并失败,没有生成 '{output_path}'")
return ""
def truncate_pdf_multiple(input_path, output_folder, unique_id="123"):
global logger
logger = get_global_logger(unique_id)
@ -675,10 +685,12 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
#TODO:目前merged_baseinfo没有包含投标人须知正文。
#投标人须知前附表改为货物标一样的
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest7.pdf" # 可以是单个PDF文件路径或文件夹路径 #zbtest20.pdf zbtest4_evaluation_method.pdf zbtest8.pdf
start_time=time.time()
input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\HBDL-2024-0181-001-招标文件.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\新建文件夹"
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
output_folder="C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\tmp"
# files=truncate_pdf_multiple(input_path,output_folder)
# selections = [5, 1] # 仅处理 selection 5、1
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
@ -686,4 +698,6 @@ if __name__ == "__main__":
selection = 5 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print("生成的文件:", generated_files)
print("生成的文件:", generated_files)
end_time=time.time()
print("耗时:"+str(end_time-start_time))

View File

@ -9,6 +9,8 @@ from flask_app.货物标.截取pdf货物标版 import extract_common_header, cle
#正则表达式判断原文中是否有商务、服务、其他要求
def find_exists(truncate_file, required_keys):
if not truncate_file:
return ["商务要求", "服务要求", "其他要求"]
common_header = extract_common_header(truncate_file)
pdf_document = PdfReader(truncate_file)
text = ""

View File

@ -3,7 +3,7 @@ import json
import threading
import time
import concurrent.futures
from flask_app.general.json_utils import clean_json_string
from flask_app.general.json_utils import clean_json_string, rename_outer_key
from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file
@ -13,6 +13,8 @@ from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
def aggregate_basic_info_goods(baseinfo_list):
for i in baseinfo_list:
print(json.dumps(i,ensure_ascii=False,indent=4))
"""
将基础信息列表中的数据进行合并和分类
@ -124,6 +126,7 @@ def get_base_info(baseinfo_file_path,clause_path):
chosen_numbers, merged = merge_json_to_list(baseinfo_list.pop())
baseinfo_list.append(merged)
judge_file_path = 'flask_app/static/提示词/是否相关问题货物标.txt'
# judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
# 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, baseinfo_file_path,
@ -132,8 +135,10 @@ def get_base_info(baseinfo_file_path,clause_path):
# 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list不需要返回值
rebidding_situation = future2.result() # 获取提取失败的情况
baseinfo_list.append(rebidding_situation)
rebidding_situation = future2.result()
update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
baseinfo_list.append(update_json)
# # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt'
# judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
# # print(judge_questions)
@ -153,7 +158,7 @@ def get_base_info(baseinfo_file_path,clause_path):
# baseinfo_list.append(clean_json_string(response))
return baseinfo_list
def combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path):
def combine_basic_info(baseinfo_file_path, procurement_path,procurement_docpath,clause_path):
baseinfo_list = []
temp_list = []
procurement_reqs = {}
@ -164,7 +169,7 @@ def combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path):
# 定义一个线程函数来获取采购需求
def fetch_procurement_reqs_thread():
nonlocal procurement_reqs
procurement_reqs = fetch_procurement_reqs(procurement_file_path)
procurement_reqs = fetch_procurement_reqs(procurement_path,procurement_docpath)
# 创建并启动获取基础信息的线程
thread1 = threading.Thread(target=get_base_info_thread)
thread1.start()
@ -186,10 +191,12 @@ def combine_basic_info(baseinfo_file_path, procurement_file_path,clause_path):
if __name__ == "__main__":
    # Manual smoke run: build the combined basic-info result for one
    # pre-processed tender and report how long it took.
    start_time = time.time()
    # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
    baseinfo_file_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_merged_baseinfo.pdf"
    # combine_basic_info takes the truncated procurement PDF and its .docx
    # conversion as two separate arguments; passing only three positional
    # arguments raises TypeError.
    # NOTE(review): assumes the PDF sits next to the .docx under the same base
    # name - confirm against the output folder.
    procurement_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.pdf"
    procurement_docpath = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
    clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
    res = combine_basic_info(baseinfo_file_path, procurement_path, procurement_docpath, clause_path)
    print("------------------------------------")
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time = time.time()
    print("elasped time:"+str(end_time-start_time))

View File

@ -124,38 +124,15 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
if output_suffix == "tobidders_notice":
# 确保返回的是元组,并将其中的 None 转换为 ""
path1, path2 = result
return (path1 or "", path2 or "")
return [path1 or "", path2 or ""]
elif output_suffix == "qualification1":
merge_and_cleanup(result, "qualification3")
return result or ""
return result or ""
return "" # 返回空字符串
return [result or ""]
return [result or ""]
return [""] # 返回空字符串
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
if not os.path.exists(output_folder):
os.makedirs(output_folder)
generated_files = []
if os.path.isdir(input_path):
for file_name in os.listdir(input_path):
file_path = os.path.join(input_path, file_name)
if is_pdf_or_doc(file_path):
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if isinstance(result, tuple):
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
else:
generated_files.append(result) # 直接添加result可能是空字符串
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if isinstance(result, tuple):
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
else:
generated_files.append(result) # 直接添加result可能是空字符串
else:
print("提供的路径既不是文件夹也不是PDF文件。")
return generated_files
# 默认逻辑是start_page匹配上就不再设置了一般不匹配上目录的原因是设置了begin_page=5但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
@ -171,11 +148,8 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
else:
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
continue
if output_suffix == "notice":
if re.search(begin_pattern, cleaned_text) and i > begin_page:
start_page = i
else:
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
if start_page is None and re.search(begin_pattern, cleaned_text):
if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
start_page = i
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
end_page = i
@ -273,8 +247,15 @@ def get_patterns_for_qualification2():
re.MULTILINE
)
return begin_pattern, end_pattern
def get_patterns_for_notice():
    """Return the ``(begin_pattern, end_pattern)`` pair for the notice section.

    The begin pattern is compiled without ``re.MULTILINE``, so its ``^`` only
    anchors at the very start of the searched text. The end pattern is
    compiled with ``re.MULTILINE`` and matches a chapter/part heading at the
    start of any line. A narrower end pattern that only stopped at the
    bidder-instructions chapter was previously kept here as a commented-out
    alternative.

    Returns:
        Tuple of two compiled regular expressions: (begin, end).
    """
    notice_heading = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*'
    next_heading = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    return re.compile(notice_heading), re.compile(next_heading, re.MULTILINE)
def get_patterns_for_notice_twice():
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE
)
@ -476,7 +457,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()]
begin_page = 5
elif output_suffix == "notice":
patterns = [get_patterns_for_notice()]
patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
begin_page = 0
if patterns:
@ -567,7 +548,6 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
- truncate_files (list): 包含 PDF 文件路径的列表
- output_path (str): 合并后的 PDF 文件保存路径
- base_file_name (str): 用于匹配文件名的基础名称
- logger (logging.Logger): 日志记录器对象
返回:
- str: 如果合并成功返回 output_path否则返回空字符串 ""
@ -625,6 +605,17 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
# 检查合并后的文件是否存在且不为空
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
# 合并成功,删除 {base_file_name}_before.pdf 文件
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
if os.path.exists(before_pdf_path):
try:
os.remove(before_pdf_path)
print(f"已删除文件: {before_pdf_path}")
except Exception as e:
print(f"删除文件 {before_pdf_path} 时出错: {e}")
else:
print(f"未找到要删除的文件: {before_pdf_path}")
return output_path
else:
print(f"合并失败,没有生成 '{output_path}'")
@ -633,82 +624,92 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
def get_start_and_common_header(input_path):
    """Find the common header and the page index where the notice starts.

    Only pages with index 0..10 are inspected. Each page's text is passed
    through ``clean_page_content`` together with the header obtained from
    ``extract_common_header``. The first page that has a line ending in one of
    the notice titles, and that is not a table-of-contents page, is reported.

    Args:
        input_path: Path of the PDF to scan.

    Returns:
        Tuple ``(common_header, page_index)``. ``page_index`` is 0-based and
        is 0 when no notice page was found within the scanned range.
    """
    max_scan_index = 10  # last page index that is still inspected
    common_header = extract_common_header(input_path)
    begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE)
    # Table-of-contents guard. The previous r'\s*录' matched any page that
    # merely contained the character "录" (e.g. 登录 / 记录), so genuine
    # notice pages were skipped; require the full "目录" heading instead.
    toc_pattern = re.compile(r'目\s*录')
    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > max_scan_index:
            break  # notice not found near the front of the document
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)
        if begin_pattern.search(cleaned_text) and not toc_pattern.search(cleaned_text):
            return common_header, i  # first matching page wins
    return common_header, 0
def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
    """Truncate one file, or every supported file in a folder, for ``selection``.

    Args:
        input_path: Path of a single document or of a folder of documents.
        output_folder: Folder handed on to ``process_input`` for the output.
        selection: Section selector forwarded to ``process_input``.
        output_suffix: Suffix forwarded to ``process_input``; "default" lets
            ``process_input`` pick the suffix for the selection.

    Returns:
        For a single file, whatever ``process_input`` returns. For a folder, a
        flat list collecting the results of every processed file. ``['']`` is
        returned when the path is neither a folder nor a supported file, or
        when an exception is raised.
    """
    try:
        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                file_path = os.path.join(input_path, file_name)
                if not is_pdf_or_doc(file_path):
                    continue
                result = process_input(file_path, output_folder, selection, output_suffix)
                # process_input hands back the list built by process_files;
                # checking only for tuple appended each per-file list as a
                # nested element, so flatten both sequence kinds.
                if isinstance(result, (list, tuple)):
                    generated_files.extend(f if f else "" for f in result)
                else:
                    generated_files.append(result)
            return generated_files
        if os.path.isfile(input_path) and is_pdf_or_doc(input_path):
            return process_input(input_path, output_folder, selection, output_suffix)
        print("提供的路径既不是文件夹也不是PDF文件。")
        return ['']
    except Exception as e:
        # Best-effort entry point: report the failure and return the empty marker.
        print(f"Error in truncate_pdf_main: {e}")
        return ['']
def process_input(input_path, output_folder, selection, output_suffix):
try:
# 创建输出文件夹
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# 获取起始和通用页眉
common_header, last_begin_index = get_start_and_common_header(input_path)
begin_page = last_begin_index if last_begin_index != 0 else {
4: 1, # 前附表
2: 5, # 评标
3: 5, # 资格
1: 0, # 公告
5: 3 # 采购需求
4: 1,
2: 5,
3: 5,
1: 0,
5: 3
}.get(selection, 0)
if selection == 1: #招标公告
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*'
)
end_pattern = re.compile(
# r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
)
# 根据选择设置对应的模式和结束模式
if selection == 1:
begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE)
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
local_output_suffix = "notice"
elif selection == 2:
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(磋商|谈判|评标|评定|评审)(方法|办法).*'
)
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
)
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(磋商|谈判|评标|评定|评审)(方法|办法).*')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "evaluation_method"
elif selection == 3:
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE
)
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
)
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
local_output_suffix = "qualification1"
elif selection == 4: # 投标人须知前附表和正文
elif selection == 4:
begin_pattern = re.compile(
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)',
re.MULTILINE
)
end_pattern=None
# end_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
# )
re.MULTILINE)
end_pattern = None
local_output_suffix = "tobidders_notice"
elif selection == 5: #采购需求
# 更新的正则表达式以匹配"第x章"和"第x部分",考虑到可能的空格和其他文字
elif selection == 5:
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*|'
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求.*'
)
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
)
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*|^第[一二三四五六七八九十百千]+(?:章|部分).*?需求.*')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "procurement"
elif selection==6: #投标文件格式
begin_pattern=re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*'
)
end_pattern=re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
)
elif selection == 6:
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
local_output_suffix = "format"
else:
print("无效的选择:请选择1-5")
@ -718,11 +719,12 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
if output_suffix == "default":
output_suffix = local_output_suffix
# 调用相应的处理函数
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or ""
# 调用实际处理文件内容的函数
return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
except Exception as e:
print(f"Error in truncate_pdf_main: {e}")
return [''] # 返回空字符串
print(f"Error in process_input: {e}")
return ['']
def truncate_pdf_multiple(pdf_path, output_folder,unique_id="123"):
@ -819,17 +821,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标
if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
# files = truncate_pdf_multiple(input_path, output_folder)
files = truncate_pdf_multiple(input_path, output_folder)
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files)
selection = 4# 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-公告
generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(files)
# selection = 1# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)

View File

@ -164,7 +164,7 @@ def test_all_files_in_folder(input_folder, output_folder):
print(f"处理文件 {file_path} 时出错: {e}")
if __name__ == "__main__":
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_procurement.pdf"
truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
file_id = upload_file(truncate_file)
res=get_technical_requirements(file_id)

View File

@ -114,10 +114,11 @@ def extract_from_notice(invalid_path,clause_path, type):
return DEFAULT_RESULT
if __name__ == "__main__":
file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\tmp\\clause1.json'
file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\clause1.json'
invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile.pdf"
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
try:
res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景
res = extract_from_notice(invalid_path,file_path, 1) # 可以改变此处的 type 参数测试不同的场景
res2=json.dumps(res,ensure_ascii=False,indent=4)
print(res2)
except ValueError as e:

View File

@ -1,15 +1,13 @@
import concurrent.futures
import json
import time
from flask_app.general.format_change import pdf2docx
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements
#获取采购清单
def fetch_procurement_reqs(truncate_file):
def fetch_procurement_reqs(procurement_path,procurement_docpath):
# 定义默认的 procurement_reqs 字典
DEFAULT_PROCUREMENT_REQS = {
"技术要求": "",
@ -18,19 +16,18 @@ def fetch_procurement_reqs(truncate_file):
"其他要求": ""
}
# 如果 truncate_file 是空字符串,直接返回包含空字符串的字典
if not truncate_file:
if not procurement_docpath:
return DEFAULT_PROCUREMENT_REQS.copy()
try:
# 上传文件并获取 file_id
truncate_file_docx=pdf2docx(truncate_file)
file_id = upload_file(truncate_file_docx)
file_id = upload_file(procurement_docpath)
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
with concurrent.futures.ThreadPoolExecutor() as executor:
# 提交任务给线程池
future_technical = executor.submit(get_technical_requirements, file_id)
time.sleep(1) # 如果需要延迟,可以保留,否则建议移除以提高效率
future_business = executor.submit(get_business_requirements, truncate_file, file_id)
future_business = executor.submit(get_business_requirements, procurement_path, file_id)
# 获取并行任务的结果
technical_requirements = future_technical.result()
@ -42,7 +39,6 @@ def fetch_procurement_reqs(truncate_file):
"商务要求": business_requirements.get("商务要求", {}),
"服务要求": business_requirements.get("服务要求", {}),
"其他要求": business_requirements.get("其他要求", {}),
"货物列表":business_requirements.get("货物列表",{})
}
return procurement_reqs
@ -55,6 +51,6 @@ def fetch_procurement_reqs(truncate_file):
if __name__ == "__main__":
    # Manual smoke run: extract procurement requirements for one tender.
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf"
    # fetch_procurement_reqs takes the truncated PDF and its .docx conversion
    # as two arguments; calling it with a single path raises TypeError.
    # NOTE(review): assumes the PDF sits next to the .docx under the same base
    # name - confirm against the output folder.
    procurement_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.pdf"
    procurement_docpath = "D:\\flask_project\\flask_app\\static\\output\\output1\\bf225a5e-16d0-45c8-8c19-54a1a94cf3e2\\ztbfile_procurement.docx"
    res = fetch_procurement_reqs(procurement_path, procurement_docpath)
    print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@ -255,7 +255,7 @@ def combine_evaluation_standards(truncate_file):
)
# 执行第二个查询
evaluation_res = qianwen_long(file_id, user_query)
print(evaluation_res)
# print(evaluation_res)
# 清理和处理响应
cleaned_evaluation_res = parse_json_with_duplicates(evaluation_res) #处理重复键名的情况
result_data = process_data_based_on_key(cleaned_evaluation_res) #处理不知名外键的情况

View File

@ -1,6 +1,6 @@
# 竞磋 竞谈 磋商 询价 邀请 单一来源
import json
import time
from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx
from flask_app.general.json_utils import transform_json_values
from flask_app.货物标.基础信息解析main import combine_basic_info
@ -56,7 +56,8 @@ def preprocess_files(output_folder, file_path, file_type):
# 处理各个部分
invalid_path=pdf_path
invalid_docpath = docx_path # docx截取无效标部分
procurement_path = truncate_files[5] # 商务技术服务要求
procurement_path = truncate_files[5] # 采购需求
procurement_docpath=pdf2docx(procurement_path) # 采购需求docx
evaluation_method_path = truncate_files[1] # 评标办法
qualification_path = truncate_files[2] # 资格审查
tobidders_notice_path = truncate_files[4] # 投标人须知正文
@ -72,6 +73,7 @@ def preprocess_files(output_folder, file_path, file_type):
'invalid_path': invalid_path,
'output_folder': output_folder,
'procurement_path': procurement_path,
'procurement_docpath':procurement_docpath,
'evaluation_method_path': evaluation_method_path,
'qualification_path': qualification_path,
'notice_path': notice_path,
@ -81,12 +83,14 @@ def preprocess_files(output_folder, file_path, file_type):
'merged_baseinfo_path': merged_baseinfo_path
}
def fetch_project_basic_info(invalid_path, merged_baseinfo_path, procurement_file_path, clause_path):
def fetch_project_basic_info(invalid_path,invalid_docpath, merged_baseinfo_path, procurement_path,procurement_docpath, clause_path):
logger.info("starting 基础信息...")
start_time = time.time()
if not merged_baseinfo_path:
merged_baseinfo_path = invalid_path
basic_res = combine_basic_info(merged_baseinfo_path, procurement_file_path, clause_path)
if not procurement_docpath:
procurement_docpath=invalid_docpath
basic_res = combine_basic_info(merged_baseinfo_path, procurement_path,procurement_docpath, clause_path)
base_info, good_list = post_process_baseinfo(basic_res)
end_time = time.time()
logger.info(f"基础信息 done耗时{end_time - start_time:.2f}")
@ -125,20 +129,22 @@ def fetch_invalid_requirements(invalid_docpath, output_folder):
logger.info(f"无效标与废标 done耗时{end_time - start_time:.2f}")
return find_invalid_res
def fetch_bidding_documents_requirements(clause_path):
def fetch_bidding_documents_requirements(invalid_path,clause_path):
logger.info("starting 投标文件要求...")
start_time = time.time()
fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
selection=1
fetch_bidding_documents_requirements_json = extract_from_notice(invalid_path,clause_path, selection)
end_time = time.time()
logger.info(f"投标文件要求 done耗时{end_time - start_time:.2f}")
return {"投标文件要求": fetch_bidding_documents_requirements_json}
# 开评定标流程
def fetch_bid_opening(clause_path):
def fetch_bid_opening(invalid_path,clause_path):
logger.info("starting 开评定标流程...")
start_time = time.time()
fetch_bid_opening_json = extract_from_notice(clause_path, 2)
selection=2
fetch_bid_opening_json = extract_from_notice(invalid_path,clause_path, selection)
end_time = time.time()
logger.info(f"开评定标流程 done耗时{end_time - start_time:.2f}")
return {"开评定标流程": fetch_bid_opening_json}
@ -195,11 +201,11 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
processed_data['evaluation_method_path']),
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],
processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']),
'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['merged_baseinfo_path'],
processed_data['procurement_path'],processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_path'],processed_data['clause_path']),
'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['invalid_docpath'],processed_data['merged_baseinfo_path'],
processed_data['procurement_path'],processed_data['procurement_docpath'],processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'],output_folder,
processed_data['qualification_path'],
processed_data['notice_path']),
@ -234,8 +240,6 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
#TODO:区分output目录 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf提取有问题
#good_list 金额 截取上下文
if __name__ == "__main__":
import time
# 配置日志器
unique_id = "uuidzyzy11"
logger = get_global_logger(unique_id)