优化招标文件判断&新增支持采购附件参数&优化招标文件验证切分正则
This commit is contained in:
parent
51d252daf2
commit
adc580a5f1
@ -1,8 +1,50 @@
|
|||||||
import os
|
import os
|
||||||
import fitz
|
import fitz
|
||||||
from PyPDF2 import PdfReader, PdfWriter
|
import tempfile
|
||||||
|
import logging
|
||||||
|
from PyPDF2 import PdfReader, PdfWriter, PdfMerger
|
||||||
from flask_app.general.读取文件.clean_pdf import create_get_text_function
|
from flask_app.general.读取文件.clean_pdf import create_get_text_function
|
||||||
|
|
||||||
|
|
||||||
|
def merge_pdf_to(original_pdf, to_pdf, logger) -> str:
    """Append the pages of ``original_pdf`` to the end of ``to_pdf``, in place.

    The merged document is first written to a temporary ``.pdf`` file created
    in the same directory as ``to_pdf`` and only then moved over ``to_pdf``
    with ``os.replace``, so ``to_pdf`` is not overwritten until the merged
    output has been written out completely.

    Args:
        original_pdf: Path of the PDF whose pages are appended.
        to_pdf: Path of the existing PDF that receives the pages; it is
            replaced by the merged result on success.
        logger: Object exposing ``error(msg)``, used to report a failure.

    Returns:
        ``to_pdf`` in every case. When any step fails, the error is logged,
        the temporary file (if one was created) is removed, and ``to_pdf``
        is returned without having been replaced.
    """
    merger = None
    temp_filename = None
    try:
        merger = PdfMerger()

        # Order matters: the existing target comes first, the extra document
        # is appended after it.
        for source_path in (to_pdf, original_pdf):
            with open(source_path, 'rb') as source_file:
                merger.append(source_file)

        # delete=False: the file must survive the ``with`` block so that it
        # can be moved over the target afterwards.
        with tempfile.NamedTemporaryFile(dir=os.path.dirname(to_pdf), suffix='.pdf', delete=False) as temp_file:
            temp_filename = temp_file.name
            merger.write(temp_file)

        # Replace the target only after the temporary file has been closed.
        os.replace(temp_filename, to_pdf)
        return to_pdf
    except Exception as e:
        logger.error(f"merge_pdf_to 遇到错误: {str(e)}")
        # Best-effort cleanup of the temporary file. Only filesystem errors
        # are ignored here (a file that is already gone included), so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        if temp_filename:
            try:
                os.remove(temp_filename)
            except OSError:
                pass
        return to_pdf
    finally:
        # Always release the merger, whether merging succeeded or not.
        if merger is not None:
            merger.close()
|
||||||
|
|
||||||
|
|
||||||
#合并PDF
|
#合并PDF
|
||||||
# 主合并函数,尝试使用 PyPDF2,若失败则使用 fitz
|
# 主合并函数,尝试使用 PyPDF2,若失败则使用 fitz
|
||||||
def merge_pdfs(paths, output_path):
|
def merge_pdfs(paths, output_path):
|
||||||
|
@ -319,8 +319,11 @@ def get_invalid_file(pdf_path, output_folder, common_header, begin_page):
|
|||||||
if regex.search(exclusion_pat, text):
|
if regex.search(exclusion_pat, text):
|
||||||
continue
|
continue
|
||||||
if regex.search(pattern, text):
|
if regex.search(pattern, text):
|
||||||
|
end_exclusion_pat = regex.compile(r'第[一二三四五六七八九十百千]+(?:章|部分)\s*“[^“”]*合同[^“”]*”', regex.DOTALL)
|
||||||
|
if regex.search(end_exclusion_pat, text):
|
||||||
|
continue
|
||||||
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
||||||
return i
|
return i
|
||||||
else:
|
else:
|
||||||
# 当页数较少时,从后向前查找(至少保留前30页)
|
# 当页数较少时,从后向前查找(至少保留前30页)
|
||||||
for pattern in end_patterns:
|
for pattern in end_patterns:
|
||||||
|
@ -15,20 +15,20 @@ from flask_app.ConnectionLimiter import require_connection_limit, require_execut
|
|||||||
|
|
||||||
upload_bp = Blueprint('upload', __name__)
|
upload_bp = Blueprint('upload', __name__)
|
||||||
|
|
||||||
def _child_target(main_func, queue, output_folder, file_path, file_type, require_downloaded_filepath, require_file_type, unique_id):
    """
    Entry point executed inside the child process.

    Calls ``main_func`` (a generator function) with the six processing
    arguments and forwards every value it yields to ``queue``. If the call or
    the iteration raises, a JSON string of the form ``{"error": "<message>"}``
    is put on the queue instead. A final ``None`` is always put on the queue
    to mark the end of the stream.
    """
    call_args = (
        output_folder,
        file_path,
        file_type,
        require_downloaded_filepath,
        require_file_type,
        unique_id,
    )
    try:
        for item in main_func(*call_args):
            queue.put(item)
    except Exception as exc:
        # Forward the failure so the parent process can observe it.
        error_payload = json.dumps({'error': str(exc)}, ensure_ascii=False)
        queue.put(error_payload)
    finally:
        # End-of-stream sentinel for the consumer.
        queue.put(None)
|
||||||
def run_in_subprocess(main_func, output_folder, file_path, file_type, unique_id):
|
def run_in_subprocess(main_func, output_folder, file_path, file_type, require_downloaded_filepath, require_file_type, unique_id):
|
||||||
"""
|
"""
|
||||||
启动子进程调用 `main_func(...)`,并在父进程流式获取其输出(通过 Queue)。
|
启动子进程调用 `main_func(...)`,并在父进程流式获取其输出(通过 Queue)。
|
||||||
子进程结束时,操作系统回收其内存;父进程则保持实时输出。
|
子进程结束时,操作系统回收其内存;父进程则保持实时输出。
|
||||||
@ -36,7 +36,7 @@ def run_in_subprocess(main_func, output_folder, file_path, file_type, unique_id)
|
|||||||
queue = multiprocessing.Queue()
|
queue = multiprocessing.Queue()
|
||||||
p = multiprocessing.Process(
|
p = multiprocessing.Process(
|
||||||
target=_child_target,
|
target=_child_target,
|
||||||
args=(main_func, queue, output_folder, file_path, file_type, unique_id)
|
args=(main_func, queue, output_folder, file_path, file_type, require_downloaded_filepath, require_file_type, unique_id)
|
||||||
)
|
)
|
||||||
p.start()
|
p.start()
|
||||||
|
|
||||||
@ -59,11 +59,12 @@ def zbparse(): #大解析
|
|||||||
logger.info("Received JSON data: " + str(received_data))
|
logger.info("Received JSON data: " + str(received_data))
|
||||||
file_url = g.file_url
|
file_url = g.file_url
|
||||||
zb_type = g.zb_type
|
zb_type = g.zb_type
|
||||||
|
require_file_url = g.require_file_url
|
||||||
file_name = urllib.parse.unquote(os.path.basename(file_url.split('?')[0]))
|
file_name = urllib.parse.unquote(os.path.basename(file_url.split('?')[0]))
|
||||||
logger.info(f"Starting parsing file: {file_name}")
|
logger.info(f"Starting parsing file: {file_name}")
|
||||||
try:
|
try:
|
||||||
logger.info("starting parsing url:" + file_url)
|
logger.info("starting parsing url:" + file_url)
|
||||||
return process_and_stream(file_url, zb_type) #主要执行函数
|
return process_and_stream(file_url, zb_type, require_file_url) #主要执行函数
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error('Exception occurred: ' + str(e))
|
logger.error('Exception occurred: ' + str(e))
|
||||||
if hasattr(g, 'unique_id'):
|
if hasattr(g, 'unique_id'):
|
||||||
@ -84,7 +85,7 @@ def zbparse(): #大解析
|
|||||||
data='Internal server error'
|
data='Internal server error'
|
||||||
)
|
)
|
||||||
return jsonify(error_response)
|
return jsonify(error_response)
|
||||||
def process_and_stream(file_url, zb_type):
|
def process_and_stream(file_url, zb_type, require_file_url):
|
||||||
"""
|
"""
|
||||||
下载文件并进行处理,支持工程标和货物标的处理。
|
下载文件并进行处理,支持工程标和货物标的处理。
|
||||||
"""
|
"""
|
||||||
@ -92,11 +93,14 @@ def process_and_stream(file_url, zb_type):
|
|||||||
unique_id = g.unique_id
|
unique_id = g.unique_id
|
||||||
output_folder = g.output_folder
|
output_folder = g.output_folder
|
||||||
filename = "ztbfile"
|
filename = "ztbfile"
|
||||||
|
require_filename = "ztbfile_require"
|
||||||
downloaded_filename = os.path.join(output_folder, filename)
|
downloaded_filename = os.path.join(output_folder, filename)
|
||||||
|
require_downloaded_filename = os.path.join(output_folder, require_filename)
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
file_name = urllib.parse.unquote(os.path.basename(file_url.split('?')[0]))
|
file_name = urllib.parse.unquote(os.path.basename(file_url.split('?')[0]))
|
||||||
try:
|
try:
|
||||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True)
|
downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True)
|
||||||
|
require_downloaded_filepath, require_file_type = ("", -1) if not require_file_url else download_file(require_file_url, require_downloaded_filename, True)
|
||||||
if not downloaded_filepath or file_type == 4:
|
if not downloaded_filepath or file_type == 4:
|
||||||
logger.error("下载文件失败或不支持的文件类型")
|
logger.error("下载文件失败或不支持的文件类型")
|
||||||
log_error_unique_id(unique_id, 1, file_name=file_name)
|
log_error_unique_id(unique_id, 1, file_name=file_name)
|
||||||
@ -119,7 +123,7 @@ def process_and_stream(file_url, zb_type):
|
|||||||
}
|
}
|
||||||
processing_func = processing_functions.get(zb_type, goods_bid_main)
|
processing_func = processing_functions.get(zb_type, goods_bid_main)
|
||||||
|
|
||||||
for data in run_in_subprocess(processing_func, output_folder, downloaded_filepath, file_type, unique_id): #逐一接收货物标 工程标解析内容,为前端网页展示服务
|
for data in run_in_subprocess(processing_func, output_folder, downloaded_filepath, file_type, require_downloaded_filepath, require_file_type, unique_id): #逐一接收货物标 工程标解析内容,为前端网页展示服务
|
||||||
if not data.strip():
|
if not data.strip():
|
||||||
logger.error("Received empty data, skipping JSON parsing.")
|
logger.error("Received empty data, skipping JSON parsing.")
|
||||||
continue
|
continue
|
||||||
|
@ -16,13 +16,14 @@ def validate_request():
|
|||||||
return jsonify({'error': 'Missing JSON in request'}), 400
|
return jsonify({'error': 'Missing JSON in request'}), 400
|
||||||
file_url = request.json.get('file_url')
|
file_url = request.json.get('file_url')
|
||||||
zb_type = request.json.get('zb_type', 2) #zb_type:默认按货物标解析
|
zb_type = request.json.get('zb_type', 2) #zb_type:默认按货物标解析
|
||||||
|
require_file_url = request.json.get('require_file_url', "")
|
||||||
if not file_url:
|
if not file_url:
|
||||||
return jsonify({'error': 'No file URL provided'}), 400
|
return jsonify({'error': 'No file URL provided'}), 400
|
||||||
try:
|
try:
|
||||||
zb_type = int(zb_type)
|
zb_type = int(zb_type)
|
||||||
except (ValueError, TypeError):
|
except (ValueError, TypeError):
|
||||||
return jsonify({'error': 'Invalid zb_type provided'}), 400
|
return jsonify({'error': 'Invalid zb_type provided'}), 400
|
||||||
return file_url, zb_type
|
return file_url, zb_type, require_file_url
|
||||||
def generate_deviation_response(tech_deviation, tech_star_deviation, business_deviation, business_star_deviation,
|
def generate_deviation_response(tech_deviation, tech_star_deviation, business_deviation, business_star_deviation,
|
||||||
zigefuhe_deviation,proof_materials, logger):
|
zigefuhe_deviation,proof_materials, logger):
|
||||||
logger.info(f"技术偏离表: {json.dumps(tech_deviation, ensure_ascii=False, indent=4)}")
|
logger.info(f"技术偏离表: {json.dumps(tech_deviation, ensure_ascii=False, indent=4)}")
|
||||||
@ -105,7 +106,7 @@ def validate_and_setup_logger(f):
|
|||||||
# 进行请求验证
|
# 进行请求验证
|
||||||
validation_result = validate_request()
|
validation_result = validate_request()
|
||||||
if isinstance(validation_result, tuple):
|
if isinstance(validation_result, tuple):
|
||||||
file_url, zb_type = validation_result
|
file_url, zb_type, require_file_url = validation_result
|
||||||
|
|
||||||
# 根据蓝图确定子文件夹
|
# 根据蓝图确定子文件夹
|
||||||
blueprint = request.blueprint
|
blueprint = request.blueprint
|
||||||
@ -125,6 +126,7 @@ def validate_and_setup_logger(f):
|
|||||||
# 将验证后的数据存储在 g 对象中
|
# 将验证后的数据存储在 g 对象中
|
||||||
g.file_url = file_url
|
g.file_url = file_url
|
||||||
g.zb_type = zb_type
|
g.zb_type = zb_type
|
||||||
|
g.require_file_url = require_file_url
|
||||||
return f(*args, **kwargs)
|
return f(*args, **kwargs)
|
||||||
else:
|
else:
|
||||||
# 验证失败,返回错误响应
|
# 验证失败,返回错误响应
|
||||||
|
@ -2,30 +2,76 @@ import time
|
|||||||
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
|
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
|
||||||
from docx import Document
|
from docx import Document
|
||||||
import fitz
|
import fitz
|
||||||
|
from flask_app.general.llm.qianwen_plus import qianwen_plus
|
||||||
from flask_app.general.llm.通义千问long import upload_file, qianwen_long
|
from flask_app.general.llm.通义千问long import upload_file, qianwen_long
|
||||||
|
|
||||||
def judge_zbfile_exec(file_path):
|
|
||||||
|
def _parse_judgement(model_res) -> str:
    """Map the raw model answer onto one of "是", "否" or "未知"."""
    if not isinstance(model_res, str):
        # No usable answer: treat it as undecided rather than raising.
        return "未知"

    # Drop surrounding whitespace, quotes and sentence punctuation so that an
    # answer such as '"是"。' still matches exactly.
    answer = model_res.strip().strip('"\'“”‘’。.!!,, \t\r\n')
    if answer in ("是", "否", "未知"):
        return answer

    # Fallback for answers that carry extra text. "未知" and the negatives are
    # checked before "是" because "不是" itself contains "是".
    if "未知" in answer:
        return "未知"
    if "否" in answer or "不是" in answer:
        return "否"
    if "是" in answer:
        return "是"
    return "未知"


def pre_judge(few_pages_content: str):
    """Quickly judge, from a text excerpt, whether a file is a tender document.

    The excerpt is embedded in a prompt that is sent to ``qianwen_plus``; the
    answer is reduced to one of three labels.

    Args:
        few_pages_content: Text extracted from the beginning of the file.

    Returns:
        "是" if the model judges the text to be a tender document, "否" if
        not, and "未知" when the excerpt is blank or the answer cannot be
        interpreted. A blank excerpt returns "未知" without calling the model.
    """
    if few_pages_content.strip() == "":
        return "未知"

    # Ask the LLM for the judgement.
    user_query = f"""
一, 任务描述:
我会给定一段从文件中摘取出来的文本, 你需要根据该文本的内容来判断文件是否属于招标文件。

二, 判断准则:
以下是常见的招标文件类型:
公开招标文件、邀请招标文件、竞争性谈判文件、竞争性磋商文件、询价文件、问询文件、货物类招标文件、工程类招标文件、施工类招标文件、服务类招标文件、比选文件。
若有未涵盖的类型,但其内容明确表达了项目需求、采购或招标信息,且包含指导投标人参与的关键要素,则可视为招标文件。
排除情况:
1. 请注意区分招标文件和投标文件,若文件仅有投标文件格式要求部分,或是投标、响应性文件,则不视为招标文件。
2. 若文件内容为乱码,无有效信息,请直接返回"否"。

三, 注意事项: 你并不是完全依赖于判断准则来给出判断, 而是根据自己对招标文件的理解来判断, 给定判断准则只是帮助你进行理解。

四, 回答格式: 你的回答只能为"是","否","未知"三种情况之一, 不要在回答中进行任何的解释或输出其他不属于"是","否","未知"的字符。

五, 给定从文件中摘取出来的文本:
{few_pages_content}

提问: 该内容是否属于招标文件? 如果属于返回"是"; 如果不属于则返回"否"; 如果你无法判断则返回"未知", 不要进行假设性判断, 不理解就返回"未知"。
"""
    start_time = time.time()
    model_res = qianwen_plus(user_query)
    end_time = time.time()
    print(f"pre_judge 判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f} 秒")

    return _parse_judgement(model_res)
|
||||||
|
|
||||||
|
|
||||||
|
def judge_zbfile_exec(file_path, end_page=5, max_len=3000):
|
||||||
"""
|
"""
|
||||||
判断文件是否属于招标文件,并返回结果。
|
判断文件是否属于招标文件,并返回结果。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
few_pages_content = ""
|
||||||
# 检查文件是否为PDF格式
|
# 检查文件是否为PDF格式
|
||||||
if file_path.lower().endswith('.pdf'):
|
if file_path.lower().endswith('.pdf'):
|
||||||
try:
|
try:
|
||||||
with open(file_path, 'rb') as f:
|
with open(file_path, 'rb') as f:
|
||||||
reader = PdfReader(f)
|
reader = PdfReader(f)
|
||||||
num_pages = len(reader.pages)
|
num_pages = len(reader.pages)
|
||||||
|
if num_pages > end_page:
|
||||||
|
few_pages_content = "".join([reader.pages[i].extract_text() for i in range(end_page)])
|
||||||
except Exception:
|
except Exception:
|
||||||
try:
|
try:
|
||||||
doc = fitz.open(file_path)
|
doc = fitz.open(file_path)
|
||||||
num_pages = len(doc)
|
num_pages = len(doc)
|
||||||
|
if num_pages > end_page:
|
||||||
|
few_pages_content = "".join([doc.load_page(i).get_text() for i in range(end_page)])
|
||||||
except Exception:
|
except Exception:
|
||||||
print("PDF 文件读取失败")
|
print("PDF 文件读取失败")
|
||||||
return False # 两种解析方式都失败,直接返回 False
|
return False # 两种解析方式都失败,直接返回 False
|
||||||
|
|
||||||
if num_pages <= 5:
|
if num_pages <= 5:
|
||||||
return False # 小于等于 5 页的 PDF 直接判定为非招标文件
|
return False # 小于等于 5 页的 PDF 直接判定为非招标文件
|
||||||
|
few_pages_content = few_pages_content[:max_len]
|
||||||
elif file_path.lower().endswith('.docx'):
|
elif file_path.lower().endswith('.docx'):
|
||||||
try:
|
try:
|
||||||
doc = Document(file_path)
|
doc = Document(file_path)
|
||||||
@ -37,13 +83,20 @@ def judge_zbfile_exec(file_path):
|
|||||||
chunk = paragraphs[i:i + chunk_size]
|
chunk = paragraphs[i:i + chunk_size]
|
||||||
for para in chunk:
|
for para in chunk:
|
||||||
accumulated_text += para.text
|
accumulated_text += para.text
|
||||||
if len(accumulated_text) >= 1000:
|
if len(accumulated_text) >= max_len:
|
||||||
break # 读取超过1000字后即可停止
|
break # 读取超过1000字后即可停止
|
||||||
if len(accumulated_text) < 1000:
|
if len(accumulated_text) < 1000:
|
||||||
return False # 若累计内容不足1000字,则直接返回 False
|
return False # 若累计内容不足1000字,则直接返回 False
|
||||||
except Exception:
|
except Exception:
|
||||||
print("DOCX 文件读取失败,可能为乱码文件")
|
print("DOCX 文件读取失败,可能为乱码文件")
|
||||||
return False # 解析失败直接返回 False
|
return False # 解析失败直接返回 False
|
||||||
|
few_pages_content = accumulated_text[:max_len]
|
||||||
|
|
||||||
|
# pre_judge -> 是, 否, 未知
|
||||||
|
pre_res = pre_judge(few_pages_content)
|
||||||
|
if pre_res != "未知":
|
||||||
|
return '否' not in pre_res
|
||||||
|
|
||||||
pre_endtime=time.time()
|
pre_endtime=time.time()
|
||||||
print(f"judge_zbfile_exec预处理耗时:{pre_endtime - start_time:.2f} 秒")
|
print(f"judge_zbfile_exec预处理耗时:{pre_endtime - start_time:.2f} 秒")
|
||||||
# 使用大模型进行判断
|
# 使用大模型进行判断
|
||||||
@ -60,7 +113,7 @@ def judge_zbfile_exec(file_path):
|
|||||||
model_res = qianwen_long(file_id, user_query)
|
model_res = qianwen_long(file_id, user_query)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒")
|
print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒")
|
||||||
print(f"判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f} 秒")
|
print(f"qianwen_long 判断是否属于招标文件:{model_res} 实际耗时:{end_time - start_time:.2f} 秒")
|
||||||
|
|
||||||
return '否' not in model_res
|
return '否' not in model_res
|
||||||
|
|
||||||
|
@ -33,7 +33,7 @@ def engineering_bid_main_process(output_folder, file_path, file_type, unique_id)
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def preprocess_files(output_folder, file_path, file_type, logger):
|
def preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger):
|
||||||
logger.info("starting 文件预处理...")
|
logger.info("starting 文件预处理...")
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
is_pure_image_flag = False
|
is_pure_image_flag = False
|
||||||
@ -239,10 +239,10 @@ def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path_more, clause_pa
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def engineering_bid_main(output_folder, file_path, file_type, unique_id):
|
def engineering_bid_main(output_folder, file_path, file_type, require_file_path, require_file_type, unique_id):
|
||||||
logger = get_global_logger(unique_id)
|
logger = get_global_logger(unique_id)
|
||||||
# 预处理文件,获取处理后的数据
|
# 预处理文件,获取处理后的数据
|
||||||
processed_data = preprocess_files(output_folder, file_path, file_type, logger)
|
processed_data = preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger)
|
||||||
if not processed_data:
|
if not processed_data:
|
||||||
error_response = {
|
error_response = {
|
||||||
'error': '文件预处理失败。请检查文件类型并重试。'
|
'error': '文件预处理失败。请检查文件类型并重试。'
|
||||||
|
@ -15,17 +15,18 @@ from flask_app.general.无效标和废标公共代码 import combine_find_invali
|
|||||||
from flask_app.货物标.资格审查main import combine_qualification_review
|
from flask_app.货物标.资格审查main import combine_qualification_review
|
||||||
from flask_app.general.商务技术评分提取 import combine_evaluation_standards
|
from flask_app.general.商务技术评分提取 import combine_evaluation_standards
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from flask_app.general.merge_pdfs import merge_pdf_to, merge_pdfs
|
||||||
|
|
||||||
|
|
||||||
|
def verify_file(file_path, file_type, logger):
|
||||||
|
pdf_path = ""
|
||||||
|
is_pure_image_flag = False # 判断是否为纯图片类型的docx
|
||||||
|
|
||||||
def preprocess_files(output_folder, file_path, file_type,logger):
|
|
||||||
logger.info("starting 文件预处理...")
|
|
||||||
start_time = time.time()
|
|
||||||
is_pure_image_flag=False #判断是否为纯图片类型的docx
|
|
||||||
pdf_path=""
|
|
||||||
# 根据文件类型处理文件路径
|
# 根据文件类型处理文件路径
|
||||||
if file_type == 1: # docx
|
if file_type == 1: # docx
|
||||||
# docx_path = file_path
|
# docx_path = file_path
|
||||||
if is_pure_image(file_path):
|
if is_pure_image(file_path):
|
||||||
is_pure_image_flag=True
|
is_pure_image_flag = True
|
||||||
else:
|
else:
|
||||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||||
elif file_type == 2: # pdf
|
elif file_type == 2: # pdf
|
||||||
@ -36,7 +37,37 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
|||||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||||
else:
|
else:
|
||||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
return pdf_path, is_pure_image_flag
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger):
|
||||||
|
logger.info("starting 文件预处理...")
|
||||||
|
start_time = time.time()
|
||||||
|
# is_pure_image_flag=False #判断是否为纯图片类型的docx
|
||||||
|
# pdf_path=""
|
||||||
|
# # 根据文件类型处理文件路径
|
||||||
|
# if file_type == 1: # docx
|
||||||
|
# # docx_path = file_path
|
||||||
|
# if is_pure_image(file_path):
|
||||||
|
# is_pure_image_flag=True
|
||||||
|
# else:
|
||||||
|
# pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||||
|
# elif file_type == 2: # pdf
|
||||||
|
# pdf_path = file_path
|
||||||
|
# # docx_path = pdf2docx(pdf_path)
|
||||||
|
# elif file_type == 3: # doc
|
||||||
|
# # docx_path = doc2docx(file_path)
|
||||||
|
# pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||||
|
# else:
|
||||||
|
# logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||||
|
# return None
|
||||||
|
pdf_path, is_pure_image_flag = verify_file(file_path, file_type, logger)
|
||||||
|
require_pdf_path, _ = verify_file(require_file_path, require_file_type, logger)
|
||||||
|
if pdf_path is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not is_pure_image_flag: #大多数情况 不是纯图片doc/docx
|
if not is_pure_image_flag: #大多数情况 不是纯图片doc/docx
|
||||||
# 调用截取PDF多次
|
# 调用截取PDF多次
|
||||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')
|
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')
|
||||||
@ -44,7 +75,9 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
|||||||
truncate_files=['','','','','','',file_path,''] #纯图片,无需切片
|
truncate_files=['','','','','','',file_path,''] #纯图片,无需切片
|
||||||
# print("切割出的文件:"+str(truncate_files))
|
# print("切割出的文件:"+str(truncate_files))
|
||||||
# 处理各个部分
|
# 处理各个部分
|
||||||
procurement_path = truncate_files[5] # 采购需求
|
# procurement_path = truncate_files[5] # 采购需求
|
||||||
|
procurement_path = truncate_files[5] if not require_pdf_path else merge_pdf_to(require_pdf_path, truncate_files[5], logger) # 采购需求
|
||||||
|
# procurement_path = truncate_files[5] if not require_pdf_path else merge_pdfs([truncate_files[5], require_pdf_path], truncate_files[5])
|
||||||
evaluation_method_path = truncate_files[1] # 评标办法
|
evaluation_method_path = truncate_files[1] # 评标办法
|
||||||
qualification_path = truncate_files[2] # 资格审查
|
qualification_path = truncate_files[2] # 资格审查
|
||||||
tobidders_notice_path = truncate_files[4] # 投标人须知正文
|
tobidders_notice_path = truncate_files[4] # 投标人须知正文
|
||||||
@ -223,10 +256,10 @@ def post_process_baseinfo(base_info,logger):
|
|||||||
logger.error(f"Error in post_process_baseinfo: {e}")
|
logger.error(f"Error in post_process_baseinfo: {e}")
|
||||||
return base_info, [] # 返回空列表
|
return base_info, [] # 返回空列表
|
||||||
|
|
||||||
def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
def goods_bid_main(output_folder, file_path, file_type, require_file_path, require_file_type, unique_id):
|
||||||
logger = get_global_logger(unique_id)
|
logger = get_global_logger(unique_id)
|
||||||
# 预处理文件,获取处理后的数据
|
# 预处理文件,获取处理后的数据
|
||||||
processed_data = preprocess_files(output_folder, file_path, file_type,logger)
|
processed_data = preprocess_files(output_folder, file_path, file_type, require_file_path, require_file_type, logger)
|
||||||
if not processed_data:
|
if not processed_data:
|
||||||
error_response = {
|
error_response = {
|
||||||
'error': '文件预处理失败。请检查文件类型并重试。'
|
'error': '文件预处理失败。请检查文件类型并重试。'
|
||||||
|
Loading…
x
Reference in New Issue
Block a user