10.17 小解析货物标

This commit is contained in:
zy123 2024-10-17 19:07:57 +08:00
parent 8255070f60
commit abd972df3a
6 changed files with 233 additions and 90 deletions

View File

@ -11,7 +11,8 @@ from flask_app.main.基础信息整合 import judge_consortium_bidding
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
from flask_app.main.通义千问long import upload_file
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
from flask_app.货物标.货物标截取pdf import truncate_pdf_specific
from flask_app.货物标.货物标截取pdf import truncate_pdf_specific_goods
from flask_app.main.截取pdf import truncate_pdf_specific_engineering
from flask_app.main.post_processing import inner_post_processing
def get_global_logger(unique_id):
@ -36,8 +37,8 @@ def merge(merged):
def get_goods_baseinfo(baseinfo_file_path):
file_id = upload_file(baseinfo_file_path)
# baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
# baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
questions = read_questions_from_file(baseinfo_file_path)
more_query = "请你根据招标文件信息回答以下问题是否需要递交投标保证金或磋商保证金请按json格式给我提供信息键名分为'是否递交投标保证金'(或'是否递交磋商保证金',键值仅限于'','','未知',若存在矛盾信息,请回答'未知'"
questions.append(more_query)
@ -45,7 +46,7 @@ def get_goods_baseinfo(baseinfo_file_path):
baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
type,merged=merge(baseinfo_list.pop())
if type:
judge_questions="根据招标文件第二章投标人须知该项目投标保证金或磋商保证金的内容或要求是什么请按json格式给我提供信息外层键名为'投标保证金'(或'磋商保证金'),若需要以嵌套键值对返回结果,那么嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致。"
judge_questions=["根据招标文件第二章投标人须知该项目投标保证金或磋商保证金的内容或要求是什么请按json格式给我提供信息外层键名为'投标保证金'(或'磋商保证金'),若需要以嵌套键值对返回结果,那么嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致。"]
res2 = multi_threading(judge_questions, "", file_id, 2) # 调用千问-long
if not res2:
print("基础信息整合: multi_threading error!")
@ -59,11 +60,13 @@ def get_goods_baseinfo(baseinfo_file_path):
#货物标
def little_parse_goods(output_folder,file_path):
files=truncate_pdf_specific(file_path,output_folder)
files=truncate_pdf_specific_goods(file_path,output_folder)
baseinfo_list=get_goods_baseinfo(files[-1])
aggregated_baseinfo = aggregate_basic_info_goods(baseinfo_list)
return {"基础信息": aggregated_baseinfo}
def little_parse_engineering(output_folder,downloaded_filepath):
def little_parse_engineering(output_folder,file_path):
files=truncate_pdf_specific_engineering(file_path,output_folder)
return True
def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id):
@ -81,6 +84,7 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id):
"""
global logger
logger = get_global_logger(unique_id)
logger.info("zb_type:"+str(zb_type))
# 根据文件类型处理文件路径
if file_type == 1: # docx
docx_path = file_path
@ -119,8 +123,12 @@ if __name__ == "__main__":
file_type = 2 # 1:docx 2:pdf 3:其他
zb_type=2 #1:工程标 2货物标
input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
res=little_parse_main(output_folder, input_file, file_type, zb_type,"122334")
print(json.dumps(res, ensure_ascii=False, indent=4))
final_json_path=little_parse_main(output_folder, input_file, file_type, zb_type,"122334")
with open(final_json_path, 'r', encoding='utf-8') as f:
logger.info('final_json_path:' + final_json_path)
zbparse_data = json.load(f)
json_str = json.dumps(zbparse_data, ensure_ascii=False)
print(json_str)
end_time = time.time()
elapsed_time = end_time - start_time # 计算耗时
print(f"Function execution took {elapsed_time} seconds.")

View File

@ -0,0 +1,58 @@
import os
from PyPDF2 import PdfReader, PdfWriter
#合并PDF
def merge_pdfs(paths, output_path):
    """Concatenate several PDF files into one, de-duplicating seam pages.

    The files in ``paths`` are appended in order. When the first page of a
    file has exactly the same non-empty extracted text as the last page of
    the previous non-empty file, that first page is skipped so the shared
    page appears only once in the result.

    Args:
        paths: Iterable of PDF file paths, in the order they should appear.
        output_path: Destination path of the merged PDF; it is overwritten.
    """
    pdf_writer = PdfWriter()
    last_page_text = None  # text of the last page of the previous non-empty PDF

    for path in paths:
        pdf_reader = PdfReader(path)
        pages = pdf_reader.pages
        page_count = len(pages)
        if page_count == 0:
            # Nothing to add; keep the previous seam text for the next file.
            continue

        start_index = 0
        # Only a non-empty text can prove duplication: two pages without any
        # extractable text both yield "" and must not be treated as the same
        # page, otherwise a real page would be dropped.
        if last_page_text:
            first_page_text = pages[0].extract_text() or ""
            if first_page_text == last_page_text:
                start_index = 1  # skip the duplicated first page

        for index in range(start_index, page_count):
            pdf_writer.add_page(pages[index])

        # Extract once per file instead of twice (test + value).
        last_page_text = pages[-1].extract_text() or ""

    with open(output_path, 'wb') as out:
        pdf_writer.write(out)
def judge_file_exist(original_path, new_suffix):
    """Return the path of the sibling file carrying ``new_suffix``, if present.

    The sibling name is derived from the base name of ``original_path`` by
    replacing every ``_qualification1`` or ``_qualification2`` marker with
    ``_<new_suffix>``; the directory part is kept unchanged.

    Args:
        original_path: Path such as ``dir/2-招标文件_qualification1.pdf``.
        new_suffix: Suffix (without the leading underscore) to look for.

    Returns:
        The derived path when it names an existing regular file, else None.
    """
    import re  # local import keeps this block self-contained

    directory, original_filename = os.path.split(original_path)
    replacement = f"_{new_suffix}"
    # Substitute in a single pass. Chaining two str.replace calls would
    # rewrite a marker that the first call had just inserted whenever
    # new_suffix itself begins with "qualification2". A callable replacement
    # is used so backslashes in new_suffix are never treated as escapes.
    new_filename = re.sub(
        r"_qualification[12]", lambda _match: replacement, original_filename
    )
    new_file_path = os.path.join(directory, new_filename)
    return new_file_path if os.path.isfile(new_file_path) else None
def merge_and_cleanup(output_pdf_path, suffix_to_merge):
    """Append the sibling PDF with ``suffix_to_merge`` and delete that sibling.

    The sibling is located with ``judge_file_exist``. When it exists, it is
    merged after ``output_pdf_path`` (the result overwrites
    ``output_pdf_path``) and the sibling file is then removed. Nothing
    happens when no sibling is found.

    Args:
        output_pdf_path: PDF that receives the merged content.
        suffix_to_merge: Suffix identifying the sibling file to merge in.
    """
    another_file_path = judge_file_exist(output_pdf_path, suffix_to_merge)
    if not another_file_path:
        return

    # judge_file_exist returns output_pdf_path itself when the file name holds
    # no marker to replace; merging the file with itself and then deleting it
    # would destroy the output, so bail out in that case.
    same_file = (
        os.path.normcase(os.path.abspath(another_file_path))
        == os.path.normcase(os.path.abspath(output_pdf_path))
    )
    if same_file:
        return

    merge_pdfs([output_pdf_path, another_file_path], output_pdf_path)
    os.remove(another_file_path)
    print(f"文件 {another_file_path} 已删除。")

View File

@ -177,7 +177,7 @@ def generate_response(final_json_path):
zbparse_data = json.load(f)
json_str = json.dumps(zbparse_data, ensure_ascii=False)
return jsonify({
'message': 'File uploaded and processed successfully',
'message': 'Little Parse processed successfully',
'filename': os.path.basename(final_json_path),
'data': json_str
})

View File

@ -1,6 +1,7 @@
from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.main.merge_pdfs import merge_pdfs
def clean_page_content(text, common_header):
# 首先删除抬头公共部分
if common_header: # 确保有公共抬头才进行替换
@ -52,31 +53,63 @@ def extract_common_header(pdf_path):
return '\n'.join(common_headers)
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
"""从原始PDF截取指定范围的页面并保存到新的PDF文件中"""
# 获取文件基本名称
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
# 构建输出文件路径
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
"""
从原始PDF截取指定范围的页面并保存到新的PDF文件中
如果 output_suffix 'notice'则额外保存 start_page 之前的页面
# 读取PDF文件
pdf_document = PdfReader(pdf_path)
output_doc = PdfWriter()
参数:
pdf_path (str): 原始PDF文件路径
output_folder (str): 输出文件夹路径
output_suffix (str): 输出文件的后缀用于区分不同的提取
start_page (int): 起始页码0
end_page (int): 结束页码0
# 检查起始和结束页码是否有效
if start_page is not None and end_page is not None and start_page <= end_page:
# 添加指定范围的页面到新的PDF文档中
返回:
str: 保存的PDF文件路径如果提取失败返回空字符串
"""
try:
# 获取文件基本名称
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
# 构建主要输出文件路径
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
# 读取PDF文件
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
# 检查起始和结束页码是否有效
if start_page < 0 or end_page >= total_pages or start_page > end_page:
print(f"无效的页面范围: {start_page}{end_page}")
return ""
# 如果 output_suffix 是 'notice',保存 start_page 之前的页面
if output_suffix == 'notice' and start_page > 0:
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
before_doc = PdfWriter()
for page_num in range(0, start_page):
before_doc.add_page(pdf_document.pages[page_num])
with open(before_pdf_path, 'wb') as f_before:
before_doc.write(f_before)
print(f"已保存页面从 0 到 {start_page - 1}{before_pdf_path}")
# 提取指定范围的页面
output_doc = PdfWriter()
for page_num in range(start_page, end_page + 1):
output_doc.add_page(pdf_document.pages[page_num])
# 保存新的PDF文件
with open(output_pdf_path, 'wb') as f:
output_doc.write(f)
with open(output_pdf_path, 'wb') as f_output:
output_doc.write(f_output)
print(f"已截取并保存页面从 {start_page + 1}{end_page + 1}{output_pdf_path}")
else:
print("提供的页码范围无效。")
return output_pdf_path
print(f"{output_suffix} 已截取并保存页面从 {start_page + 1}{end_page + 1}{output_pdf_path}")
return output_pdf_path
except Exception as e:
print(f"Error in save_pages_to_new_pdf: {e}")
return "" # 返回空字符串
def extract_pages_twice(pdf_path, output_folder, output_suffix):
@ -260,14 +293,116 @@ def truncate_pdf_multiple(input_path, output_folder):
truncate_files.extend(files)
return truncate_files
def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name):
    """Merge the "before" PDF and selected truncated PDFs in a fixed order.

    Files are picked by name ending, in this order: ``<base>_before.pdf``
    (searched among the entries of ``output_folder``), then
    ``<base>_notice.pdf``, ``<base>_tobidders_notice_table.pdf`` and
    ``<base>_tobidders_notice.pdf`` (searched in ``truncate_files``).
    Several matches for one ending are merged in sorted order.

    Args:
        output_folder (str): Folder scanned for the ``_before`` file.
        truncate_files (list): Candidate PDF paths.
        output_path (str): Where the merged PDF is written.
        base_file_name (str): Base name used to build the expected endings.

    Returns:
        None. Returns early (without merging) when the folder cannot be
        listed or when nothing matches.
    """
    try:
        folder_entries = os.listdir(output_folder)
    except FileNotFoundError:
        print(f"输出文件夹 '{output_folder}' 未找到。")
        return
    except PermissionError:
        print(f"没有权限访问输出文件夹 '{output_folder}'")
        return

    before_name = f'{base_file_name}_before.pdf'
    # Merge order: pages before the notice, the notice, then bidder instructions.
    ordered_names = (
        before_name,
        f'{base_file_name}_notice.pdf',
        f'{base_file_name}_tobidders_notice_table.pdf',
        f'{base_file_name}_tobidders_notice.pdf',
    )

    selected = []
    for name in ordered_names:
        if name == before_name:
            # The "before" file is looked up on disk, not in truncate_files.
            candidates = [
                os.path.join(output_folder, entry)
                for entry in folder_entries
                if entry.endswith(name)
            ]
        else:
            candidates = [path for path in truncate_files if path.endswith(name)]

        if not candidates:
            print(f"没有找到以 '{name}' 结尾的文件。")
            continue

        candidates.sort()
        selected += candidates
        for path in candidates:
            print(f"选中文件: {path}")

    print(f"总共将要合并的 PDF 文件数量: {len(selected)}")
    if not selected:
        print("没有找到要合并的 PDF 文件。")
        return

    merge_pdfs(selected, output_path)
    print(f"已成功合并 PDF 文件到 '{output_path}'")
def truncate_pdf_specific_engineering(pdf_path, output_folder):
    """Truncate the PDF for selections 5, 1 and 3 and merge the results.

    Each selection is passed to ``truncate_pdf_main``; the returned path(s)
    are collected. When anything was produced, the pieces are merged via
    ``merge_selected_pdfs`` into ``<base>_merged_specific.pdf`` inside
    ``output_folder`` and that path is appended to the result.

    Args:
        pdf_path (str): Path of the PDF to process.
        output_folder (str): Folder that receives the truncated files.

    Returns:
        list: Paths of the truncated files, followed by the merged file path
        when at least one piece was produced; an empty list on any error.
    """
    try:
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        truncate_files = []

        # Only selections 5, 1 and 3 are needed for the "little parse".
        for selection in (5, 1, 3):
            files = truncate_pdf_main(pdf_path, output_folder, selection)
            if not files:
                continue
            # truncate_pdf_main may hand back one path or a list of paths.
            if isinstance(files, list):
                truncate_files.extend(files)
            elif isinstance(files, str):
                truncate_files.append(files)

        if truncate_files:
            merged_output_path = os.path.join(
                output_folder, f"{base_file_name}_merged_specific.pdf"
            )
            merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
            # NOTE(review): the merged path is appended even if
            # merge_selected_pdfs returned early without writing a file;
            # callers reading files[-1] should confirm it exists.
            truncate_files.append(merged_output_path)
            print(f"已生成合并文件: {merged_output_path}")
        else:
            print(f"没有文件需要合并 for {pdf_path}")

        return truncate_files
    except Exception as e:
        # Report under this function's real name (was "truncate_pdf_specific_two").
        print(f"Error in truncate_pdf_specific_engineering: {e}")
        return []  # an empty list signals failure
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下比如说就不进行跳转见xx表 开评定标这里也要考虑 如果评分表为空,也要处理。
if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件"
truncate_pdf_multiple(input_path,output_folder)
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output"
files=truncate_pdf_multiple(input_path,output_folder)
# files=truncate_pdf_specific(input_path,output_folder)
print(files)
# selection = 3 # 例如1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
# # print("生成的文件:", generated_files)

View File

@ -2,7 +2,7 @@ from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.main.format_change import docx2pdf
from flask_app.main.merge_pdfs import merge_and_cleanup,merge_pdfs
def clean_page_content(text, common_header):
# 首先删除抬头公共部分
@ -102,65 +102,6 @@ def convert_to_pdf(file_path):
return docx2pdf(file_path)
return file_path
def judge_file_exist(original_path, new_suffix):
    """Return the path of the sibling file carrying ``new_suffix``, if present.

    The sibling name is derived from the base name of ``original_path`` by
    replacing every ``_qualification1`` or ``_qualification2`` marker with
    ``_<new_suffix>``; the directory part is kept unchanged.

    Args:
        original_path: Path such as ``dir/2-招标文件_qualification1.pdf``.
        new_suffix: Suffix (without the leading underscore) to look for.

    Returns:
        The derived path when it names an existing regular file, else None.
    """
    directory, original_filename = os.path.split(original_path)
    replacement = f"_{new_suffix}"
    # Substitute in a single pass. Chaining two str.replace calls would
    # rewrite a marker that the first call had just inserted whenever
    # new_suffix itself begins with "qualification2". A callable replacement
    # is used so backslashes in new_suffix are never treated as escapes.
    new_filename = re.sub(
        r"_qualification[12]", lambda _match: replacement, original_filename
    )
    new_file_path = os.path.join(directory, new_filename)
    return new_file_path if os.path.isfile(new_file_path) else None
#合并PDF
def merge_pdfs(paths, output_path):
    """Concatenate several PDF files into one, de-duplicating seam pages.

    The files in ``paths`` are appended in order. When the first page of a
    file has exactly the same non-empty extracted text as the last page of
    the previous non-empty file, that first page is skipped so the shared
    page appears only once in the result.

    Args:
        paths: Iterable of PDF file paths, in the order they should appear.
        output_path: Destination path of the merged PDF; it is overwritten.
    """
    pdf_writer = PdfWriter()
    last_page_text = None  # text of the last page of the previous non-empty PDF

    for path in paths:
        pdf_reader = PdfReader(path)
        pages = pdf_reader.pages
        page_count = len(pages)
        if page_count == 0:
            # Nothing to add; keep the previous seam text for the next file.
            continue

        start_index = 0
        # Only a non-empty text can prove duplication: two pages without any
        # extractable text both yield "" and must not be treated as the same
        # page, otherwise a real page would be dropped.
        if last_page_text:
            first_page_text = pages[0].extract_text() or ""
            if first_page_text == last_page_text:
                start_index = 1  # skip the duplicated first page

        for index in range(start_index, page_count):
            pdf_writer.add_page(pages[index])

        # Extract once per file instead of twice (test + value).
        last_page_text = pages[-1].extract_text() or ""

    with open(output_path, 'wb') as out:
        pdf_writer.write(out)
def merge_and_cleanup(output_pdf_path, suffix_to_merge):
    """Append the sibling PDF with ``suffix_to_merge`` and delete that sibling.

    The sibling is located with ``judge_file_exist``. When it exists, it is
    merged after ``output_pdf_path`` (the result overwrites
    ``output_pdf_path``) and the sibling file is then removed. Nothing
    happens when no sibling is found.

    Args:
        output_pdf_path: PDF that receives the merged content.
        suffix_to_merge: Suffix identifying the sibling file to merge in.
    """
    another_file_path = judge_file_exist(output_pdf_path, suffix_to_merge)
    if not another_file_path:
        return

    # judge_file_exist returns output_pdf_path itself when the file name holds
    # no marker to replace; merging the file with itself and then deleting it
    # would destroy the output, so bail out in that case.
    same_file = (
        os.path.normcase(os.path.abspath(another_file_path))
        == os.path.normcase(os.path.abspath(output_pdf_path))
    )
    if same_file:
        return

    merge_pdfs([output_pdf_path, another_file_path], output_pdf_path)
    os.remove(another_file_path)
    print(f"文件 {another_file_path} 已删除。")
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
pdf_path = convert_to_pdf(file_path)
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
@ -597,7 +538,7 @@ def truncate_pdf_multiple(pdf_path, output_folder):
return truncate_files
#小解析,只需要前三章内容
def truncate_pdf_specific(pdf_path, output_folder):
def truncate_pdf_specific_goods(pdf_path, output_folder):
"""
处理 PDF 文件选择 selection 4 5 的部分并合并结果
@ -636,7 +577,7 @@ if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1).pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all"
# files = truncate_pdf_multiple(input_path, output_folder)
files=truncate_pdf_specific(input_path,output_folder)
files=truncate_pdf_specific_goods(input_path,output_folder)
print(files[-1])
# selection = 1 # 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-公告
# generated_files = truncate_pdf_main(input_path, output_folder, selection)

View File

@ -211,6 +211,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
# deleteKnowledge(index)
#TODO:目前的无效标这块的键值都删去空格了,所有的键名都删去空格
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
if __name__ == "__main__":
output_folder = "flask_app/static/output/zytest1"