# -*- encoding:utf-8 -*-
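"""
Lightweight ("little parse") extraction of basic tender information.

little_parse_goods / little_parse_engineering truncate a tender PDF to its key
sections, upload the truncated file, ask the questions from a prompt file via
qianwen-long, and aggregate the answers into a 基础信息 (basic information) block.
little_parse_main dispatches by file type and tender type, post-processes the
aggregated block, and writes the result to final_result.json in the output folder.
"""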
import json
import os
import time

from flask_app.general.format_change import docx2pdf
from flask_app.general.json_utils import clean_json_string
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.general.通用功能函数 import get_global_logger, aggregate_basic_info
from flask_app.general.截取pdf_main import truncate_pdf_multiple
from flask_app.general.post_processing import inner_post_processing


# Goods tender (货物标)
def little_parse_goods(output_folder, pdf_path, logger):
    """
    Parse the basic information of a goods tender.

    Parameters:
        output_folder (str): Path to the output folder.
        pdf_path (str): Path to the input PDF file.
        logger: Logger used for progress and error messages.

    Returns:
        dict: A dictionary containing the '基础信息' (basic information) entry.
    """
    # Truncate the goods PDF to the required sections
    selections = [1, 4]  # Only process selections 1 and 4: tender notice + instructions to bidders
    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
    if not files:
        raise ValueError("未找到截取后的文件。")
    # Assume the last file is the basic-information file to process
    baseinfo_file_path = files[-1]
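    # Assumption: truncate_pdf_multiple may leave an empty/invalid entry for a section
    # it failed to produce, so fall back to the second-to-last file in that case.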
    if not baseinfo_file_path:
        baseinfo_file_path = files[-2]
    # Upload the file and obtain a file ID
    file_id = upload_file(baseinfo_file_path)
    # Note: the following path is hard-coded; make sure it exists and is correct
    baseinfo_prompt_file_path = 'flask_app/static/提示词/小解析基本信息货物标.txt'
    # baseinfo_prompt_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\小解析基本信息货物标.txt'
    # Read the questions from the prompt file
    questions = read_questions_from_file(baseinfo_prompt_file_path)
    # Ask the questions concurrently, using processing mode 2 (qianwen-long)
    baseinfo_results = multi_threading(questions, "", file_id, 2)
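    # baseinfo_results is consumed below as an iterable of 2-tuples; only the second
    # element of each pair (the raw answer text) is passed to clean_json_string.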
    # Clean the returned JSON strings
    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
    # Aggregate the basic information
    aggregated_baseinfo = aggregate_basic_info(baseinfo_list, 'goods')
    return {"基础信息": aggregated_baseinfo}


def little_parse_engineering(output_folder, pdf_path, logger):
    """
    Parse the basic information of an engineering tender.

    Parameters:
        output_folder (str): Path to the output folder.
        pdf_path (str): Path to the input PDF file.
        logger: Logger used for progress and error messages.

    Returns:
        dict: A dictionary containing the '基础信息' (basic information) entry.
    """
    # Truncate the engineering PDF to the required sections
    selections = [1, 4]  # Tender notice + instructions-to-bidders front table
    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering', selections)
    if not files:
        raise ValueError("未找到截取后的文件。")
    # Assume the last file is the basic-information file to process
    baseinfo_file_path = files[-1]
    if not baseinfo_file_path:
        baseinfo_file_path = files[-2]  # fall back when the last entry is an invalid/empty path
    # Upload the file and obtain a file ID
    file_id = upload_file(baseinfo_file_path)
    # Note: the following path is hard-coded; make sure it exists and is correct
    baseinfo_prompt_file_path = 'flask_app/static/提示词/小解析基本信息工程标.txt'
    # baseinfo_prompt_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\小解析基本信息工程标.txt'
    # Read the questions from the prompt file
    questions = read_questions_from_file(baseinfo_prompt_file_path)
    # Ask the questions concurrently, using processing mode 2 (qianwen-long)
    baseinfo_results = multi_threading(questions, "", file_id, 2)
    # Clean the returned JSON strings
    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
    # Aggregate the basic information
    aggregated_baseinfo = aggregate_basic_info(baseinfo_list)
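    # Note: unlike little_parse_goods, no mode argument is passed here; this assumes
    # aggregate_basic_info defaults to engineering-style aggregation.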
    return {"基础信息": aggregated_baseinfo}


def little_parse_main(output_folder, file_path, file_type, zb_type, unique_id):
    """
    Main parsing entry point: process the file according to its file type and
    tender type, then save the extraction result as a JSON file.

    Parameters:
        output_folder (str): Path to the output folder.
        file_path (str): Path to the file to process.
        file_type (int): File type (1: docx, 2: pdf, 3: doc).
        zb_type (int): Tender type (1: engineering tender, 2: goods tender).
        unique_id: Unique identifier used to obtain the logger.

    Returns:
        str: Path to the saved JSON file.
    """
    logger = get_global_logger(unique_id)
    logger.info("zb_type:" + str(zb_type))
    # Normalize the input to a PDF path according to the file type
    if file_type == 1:  # docx
        docx_path = file_path
        pdf_path = docx2pdf(docx_path)  # Convert docx to pdf for subsequent processing
    elif file_type == 2:  # pdf
        pdf_path = file_path
        # docx_path = pdf2docx(pdf_path)  # Convert pdf to docx for uploading to the knowledge base
    elif file_type == 3:  # doc
        pdf_path = docx2pdf(file_path)
        # docx_path = pdf2docx(pdf_path)
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None
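    # At this point pdf_path refers to a PDF regardless of the original input format.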
    # Dispatch to the appropriate parser according to the tender type
    if zb_type == 2:  # Goods tender
        combined_data = little_parse_goods(output_folder, pdf_path, logger)
        logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
        res = inner_post_processing(combined_data["基础信息"])
    elif zb_type == 1:  # Engineering tender
        combined_data = little_parse_engineering(output_folder, pdf_path, logger)
        logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
        res = inner_post_processing(combined_data["基础信息"])
    else:
        logger.error("Unsupported zb_type provided.")
        return ""
    # Path of the JSON file to save
    json_path = os.path.join(output_folder, "final_result.json")
    # Save the result as a JSON file
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(res, json_file, ensure_ascii=False, indent=4)
    logger.info(f"Extraction results saved to {json_path}")

    # Return the path of the JSON file
    return json_path


if __name__ == "__main__":
    start_time = time.time()
    file_type = 2  # 1: docx  2: pdf  3: doc
    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\little_parse_output"
    # zb_type = 2  # 1: engineering tender  2: goods tender
    # input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\20c9e7fa-0245-4de0-b004-d5231d0be940\tmp"
    zb_type = 2  # 1: engineering tender  2: goods tender
    # input_file = r"C:\Users\Administrator\Desktop\fsdownload\865a5d46-a5f8-467a-8374-c71c415d0af9\ztbfile.pdf"
    input_file = r"C:\Users\Administrator\Desktop\fsdownload\20c9e7fa-0245-4de0-b004-d5231d0be940\ztbfile.pdf"
    final_json_path = little_parse_main(output_folder, input_file, file_type, zb_type, "122334")
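    # Note: little_parse_main returns None or "" for an unsupported file type or
    # zb_type; this demo assumes a valid JSON path is returned before opening it.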
    with open(final_json_path, 'r', encoding='utf-8') as f:
        # logger.info('final_json_path:' + final_json_path)
        zbparse_data = json.load(f)
    json_str = json.dumps(zbparse_data, ensure_ascii=False, indent=4)
    print(json_str)
    end_time = time.time()
    elapsed_time = end_time - start_time  # elapsed time in seconds
    print(f"Function execution took {elapsed_time} seconds.")