2.13 处理一些pdf文件无法被pypdf2处理导致死循环的问题,以及纯图片的docx文档解析出现的问题

This commit is contained in:
zy123 2025-02-13 15:42:52 +08:00
parent 2a25c28e20
commit 140ef263a9
13 changed files with 270 additions and 147 deletions

View File

@ -3,14 +3,17 @@ import os
import mimetypes
import requests
from PyPDF2 import PdfReader
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf
def download_file(url, local_filename):
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf, is_pure_image
def download_file(url, local_filename,enable=False):
"""
下载文件并保存到本地基于Content-Type设置文件扩展名
参数:
- url (str): 文件的URL地址
- local_filename (str): 本地保存的文件名不含扩展名
- enable: 是否需要判断为扫描型/纯图片文件
返回:
- tuple: (完整文件名, 文件类型代码)
@ -50,7 +53,7 @@ def download_file(url, local_filename):
}
file_code = extension_mapping.get(extension.lower(), 4)
# 如果是 PDF判断是否为扫描型
if extension.lower() == '.pdf':
if enable and extension.lower() == '.pdf':
print(f"Checking if the PDF is scanned: {full_filename}")
if is_scanned_pdf(full_filename):
print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
@ -154,6 +157,7 @@ def doc2docx(local_path_in):
downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath
def docx2pdf(local_path_in,force=False):
"""
DOCX 文件转换为 PDF
@ -174,7 +178,7 @@ def docx2pdf(local_path_in,force=False):
pdf_file_path = os.path.join(folder, f"{filename}.pdf")
if os.path.exists(pdf_file_path):
if force:
print(f"强制转换,覆盖已存在的文件: {pdf_file_path}")
print(f"强制转换,覆盖已存在的文件: {pdf_file_path}") #覆盖掉原来的扫描型pdf
else:
print(f"跳过转换,文件已存在: {pdf_file_path}")
return pdf_file_path # 跳过转换
@ -207,11 +211,13 @@ if __name__ == '__main__':
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\有问题的.doc"
# downloaded_file=pdf2docx(local_path_in)
local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
intermediate_docx = pdf2docx(local_path_in)
if intermediate_docx:
normal_pdf = docx2pdf(intermediate_docx, force=True)
# # downloaded_file=pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)
# downloaded_file=docx2pdf(local_path_in)
# print(downloaded_file)
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"
# local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'
@ -224,8 +230,8 @@ if __name__ == '__main__':
# # 检查文件类型
# if file_type == 4:
# print("error")
res=pdf2docx(local_path_in)
print(res)
# res=pdf2docx(local_path_in)
# print(res)

View File

@ -95,7 +95,7 @@ def delete_mark(docx_path):
if __name__ == '__main__':
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
input=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(107国道).pdf'
input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\ztbfile_invalid.pdf'
output=insert_mark(input)
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
# res=delete_mark(doc_path)

View File

@ -239,46 +239,15 @@ def qianwen_long_stream(file_id, user_query, max_retries=2, backoff_factor=1.0,
if __name__ == "__main__":
# Example file path - replace with your actual file path
file_path = r"C:\Users\Administrator\Desktop\fsdownload\29457826-1e99-4e98-9c90-1cfb5d175579\invalid_del.docx"
file_path = r"C:\Users\Administrator\Downloads\2022-广东-鹏华基金管理有限公司深圳深业上城办公室装修项目.pdf"
file_id = upload_file(file_path)
# print(file_id)
user_query1 ="""该招标文件对响应文件投标文件偏离项的要求或内容是怎样的请不要回答具体的技术参数也不要回答具体的评分要求。请以json格式给我提供信息外层键名为'偏离',若存在嵌套信息,嵌套内容键名为文件中对应字段或是你的总结,键值为原文对应内容。若文中没有关于偏离项的相关内容,在键值中填'未知'
禁止内容
确保键值内容均基于提供的实际招标文件内容禁止使用任何预设的示例作为回答
禁止返回markdown格式请提取具体的偏离相关内容
示例1嵌套键值对情况
{
"偏离":{
"技术要求":"以★标示的内容不允许负偏离",
"商务要求":"以★标示的内容不允许负偏离"
}
}
示例2无嵌套键值对情况
{
"偏离":"所有参数需在技术响应偏离表内响应,如应答有缺项,且无有效证明材料的,评标委员会有权不予认可,视同负偏离处理"
}
"""
user_query1 ="该招标文件的项目编号是什么?"
# # res1,res2=qianwen_long_stream(file_id,user_query1,2,1,True)
# res1,res2= qianwen_long_stream(file_id, user_query1, 2, 1,True)
res=qianwen_long(file_id,user_query1)
# print(res1)
# print(res2)
# res=qianwen_plus(user_query1)
print(res)
#
#
# user_query2 = ("请提供文件中关于资格审查的具体内容和标准。")
# start_time=time.time()
# # First query
# print("starting qianwen-long...")
# result1 ,result2= qianwen_long(file_id, user_query1)
# print("First Query Result:", result1)
# print(type(result1))
# print(result2)
# # Second query
# print("starting qianwen-long...")
# result2 = qianwen_long(file_id, user_query2)
# print("Second Query Result:", result2)
# end_time=time.time()
# print("elapsed time:"+str(end_time-start_time))

View File

@ -313,14 +313,14 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
-请根据你对招投标业务的熟悉对表格中的评分因素进行准确分类关键是确保每个评分因素都能被归类到'技术评分''商务评分''投标报价评分'不要遗漏任何一个评分因素
**特殊情况**
1. 缺失评分项若大项的'xx评分'要求未在文中说明则键名'xx评分'的键值设为字符串'本项目无xx评分项'例如"技术评分":"本项目无技术评分项"而非默认的字典格式
1. 缺失评分项若大项的'xx评分'要求未在文中说明则键名'xx评分'的键值设为字符串'本项目无xx评分项'例如"技术评分":"本项目无技术评分项"而非默认的字典格式请基于提供的实际招标文件内容禁止捏造回答
2. 其他评分默认情况大项评分仅有'技术评分''商务评分''投标报价评分'若在充分归类之后表格中仍有评分因素未被归类才添加大项评分'其他评分'保存该内容
3. 多包评分默认只有一包最外层键名为各大评分项而不是'一包'但是如果该招标采购活动有多个分包且每个分包有独自的评分表则最外层键名为对应的包名'一包''二包'内部才是各大评分项
4. 多张技术评分表若同一包下有多张技术评分表请不要遗漏任何一个评分表的信息,此时最外层键名'技术评分'替换为'技术评分-d'd为自然数从1开始分别保存每张技术评分表的信息
-例如有'技术评分标准1其他项目''技术评分标准2施工类'算作两个技术评分表最外层的键名分别为'技术评分-1''技术评分-2'替换默认的'技术评分'
**禁止内容**
1. 确保所有输出内容均基于提供的实际招标文件内容除了最外层的三个评分大项名称不使用任何预设的示例作为回答
1. 确保所有输出内容均基于提供的实际招标文件内容除了最外层的三个评分大项名称不使用任何预设的示例作为回答也禁止捏造评分标准
2. 不得擅自添加不属于评审因素的键名以及 `'备注'` 之外的其他键名
3. 不得遗漏评分表中的任何评分因素确保每个评分因素都被正确归类到评分大项下
"""
@ -447,12 +447,12 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
# 定义用户查询
query = (
"""根据该文档,你判断它是否有关于技术评分或商务评分或投标报价的具体的评分及要求,如果有,返回'',否则返回''
要求与指南
1. 评分要求主要以表格形式呈现且有评分因素及评分要求标准其中评分因素可以是笼统的评分大项如'技术评分''商务评分'
2. 竞争性磋商文件通常无评分要求但若满足'1.'的内容也请返回''
3. 仅返回''''不需要其他解释或内容
"""
"""请根据以下指南判断该文档是否包含关于技术评分、商务评分或投标报价的具体评分要求和标准:
1. 若文档中以表格形式展示了评分要求且包含评分因素技术评分商务评分或更细的评分因素及相应的评分标准即使评分方式为定性无具体分值也应视为满足要求
2. 如果文档中仅描述了评标流程但未提供具体的评分标准则应视为不满足条件
3. 虽然竞争性磋商文件通常不包含评分要求但若文档满足第1条的内容也应视为符合要求
请仅返回不附加其他解释或内容
"""
) # 应对竞争性谈判这种无评分要求的情况
# 执行查询
@ -505,6 +505,7 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
try:
judge_res, file_id = run_first_query(evaluation_method_path)
print(judge_res)
eval_path = os.path.abspath(evaluation_method_path)
invalid_eval_path = os.path.abspath(invalid_path)
# 获取 evaluation_method_path 所在的目录
@ -556,8 +557,8 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
if __name__ == "__main__":
start_time=time.time()
# truncate_file=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2\2024-新疆-塔城地区公安局食药环分局快检实验室项目_evaluation_method.pdf"
evaluation_method_path = r'C:\Users\Administrator\Desktop\货物标\output2\招标文件-通产丽星高端化妆品研发生产总部基地高低压配电工程_evaluation_method11.pdf'
invalid_path=r'C:\Users\Administrator\Desktop\货物标\output2\招标文件-通产丽星高端化妆品研发生产总部基地高低压配电工程_evaluation_method11.pdf'
evaluation_method_path = r'C:\Users\Administrator\Downloads\2022-广东-鹏华基金管理有限公司深圳深业上城办公室装修项目.pdf'
invalid_path=r'C:\Users\Administrator\Downloads\2022-广东-鹏华基金管理有限公司深圳深业上城办公室装修项目.pdf'
# truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件统计局智能终端二次招标_evaluation_method.pdf"
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新W改_evaluation_method.pdf"
# truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\2d481945-1f82-45a5-8e56-7fafea4a7793\\ztbfile_evaluation_method.pdf"

View File

@ -118,14 +118,14 @@ if __name__ == "__main__":
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
# pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\d4f30cc2-1643-4576-bfb1-97a2f1e5ba51\ztbfile.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0aade992-dc8e-4690-9b09-78630a34c6e7\ztbfile.pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\d4f30cc2-1643-4576-bfb1-97a2f1e5ba51\tmp"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0aade992-dc8e-4690-9b09-78630a34c6e7"
# selections = [1, 4] # 仅处理 selection 4、1
# selections = [1, 2, 3, 5]
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
print(files)
# print(files[-1])
# print(files[-2])

View File

@ -1,7 +1,8 @@
import re
import threading
import fitz
from PyPDF2 import PdfReader
from docx import Document
def extract_common_header(pdf_path):
"""
提取 PDF 文件的公共页眉
@ -180,30 +181,119 @@ def clean_page_content(text, common_header):
text = re.sub(r'^\s*[—-]\s*(第\s*)?\d{1,3}\s*(页)?\s*[—-]\s*', '', text) # 删除形如 '—2—', '-2-', 或 '-第2页-' 的页码
return text
def is_scanned_pdf(file_path, max_pages=15):
def read_page_text_with_timeout(page, timeout=3):
"""
检查 PDF 是否为扫描件即前 15 页无文本
参数:
- file_path: PDF 文件路径
- max_pages: 最大检查页数默认为 15
返回:
- True: 如果前 15 页都没有文本认为是扫描件
- False: 如果有任何页有文本认为不是扫描件
尝试在子线程里执行 page.extract_text()
如果超过 `timeout` 秒没有完成就返回 None表示超时
如果正常完成就返回提取到的文本
"""
with open(file_path, 'rb') as file:
reader = PdfReader(file)
for i, page in enumerate(reader.pages):
if i >= max_pages: # 超过最大检查页数,停止检查
break
if page.extract_text().strip(): # 如果有文本
return False # 不是扫描型
return True # 前 max_pages 页都没有文本
done = threading.Event()
result = []
def wrapper():
try:
txt = page.extract_text() # 可能会卡住的操作
result.append(txt)
except Exception as e:
print("提取文本时出错:", e)
result.append("")
finally:
done.set()
# 启动后台线程
thread = threading.Thread(target=wrapper, daemon=True)
thread.start()
# 等待提取结果,超过 timeout 秒则视为超时
finished_in_time = done.wait(timeout)
if not finished_in_time:
print(f"单页文本提取超时(超过 {timeout} 秒)!")
return None
return result[0] if result else ""
def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
    """
    Decide whether a PDF should be treated as a scanned (text-less) document.

    The pages are inspected in a daemon worker thread so that the caller is
    never blocked for longer than ``overall_timeout`` seconds.

    Args:
        file_path: Path of the PDF file to inspect.
        max_pages: Number of leading pages to inspect.
        page_timeout: Seconds allowed for extracting the text of one page.
        overall_timeout: Seconds allowed for the whole inspection.

    Returns:
        True when the PDF is considered scanned, i.e. when
          * extracting a page's text times out,
          * none of the inspected pages yields non-blank text,
          * reading the file raises an exception, or
          * the whole inspection exceeds ``overall_timeout``;
        False as soon as one inspected page yields non-blank text.
    """
    verdict_box = [None]            # slot the worker thread writes its verdict into
    finished = threading.Event()    # set once the worker has stored a verdict

    def _classify():
        """Walk the leading pages and return the scanned / not-scanned verdict."""
        try:
            with open(file_path, 'rb') as pdf_file:
                for page_no, page in enumerate(PdfReader(pdf_file).pages, start=1):
                    if page_no > max_pages:
                        break
                    # Text extraction may hang on some files, hence the per-page timeout.
                    page_text = read_page_text_with_timeout(page, page_timeout)
                    if page_text is None:
                        print(f"{page_no} 页文本提取超时,直接判定为扫描件。")
                        return True
                    if page_text.strip():
                        print(f"{page_no} 页检测到文本,判定为非扫描件。")
                        return False
        except Exception as e:
            print("处理 PDF 文件时发生异常:", e)
            return True
        print(f"{max_pages} 页均未检测到文本,判定为扫描件。")
        return True

    def _worker():
        """Thread target: publish the verdict, then wake the waiting caller."""
        verdict_box[0] = _classify()
        finished.set()

    threading.Thread(target=_worker, daemon=True).start()

    # Give up after overall_timeout seconds; the worker thread is not cancelled.
    if not finished.wait(overall_timeout):
        print(f"整体检查超时(超过 {overall_timeout} 秒),返回默认结果。")
        return True

    verdict = verdict_box[0]
    # Debug trace of the final decision.
    if verdict:
        print("最终结果:认为是扫描件(未检测到有效文本或发生单页超时)")
    else:
        print("最终结果:认为非扫描件(检测到有效文本)")
    return verdict
def is_pure_image(docx_path, percentage=0.3):
    """
    Heuristically decide whether a Word document consists only of images.

    The leading ``percentage`` of the document's body paragraphs (at least
    one) is inspected; if none of them contains non-blank text the document
    is reported as pure-image.

    Args:
        docx_path: Path of the document to inspect.
        percentage: Fraction of the paragraphs, counted from the start of
            the document, that is inspected (default 0.3).

    Returns:
        True if none of the inspected paragraphs contains text, otherwise
        False. False is also returned when the file cannot be opened, so
        that callers fall back to their regular conversion pipeline.
    """
    try:
        document = Document(docx_path)
    except Exception as e:
        # python-docx is documented to read .docx (OOXML) packages only, yet
        # this check is also invoked for legacy .doc uploads. An unreadable
        # file must not abort preprocessing: report "not pure image" so the
        # caller converts the file as it did before this check existed.
        print(f"is_pure_image: cannot open {docx_path} as docx: {e}")
        return False

    paragraphs = document.paragraphs
    # Inspect at least one paragraph even for very short documents.
    check_count = max(1, int(len(paragraphs) * percentage))
    # NOTE(review): only body paragraphs are inspected; text that lives
    # solely inside tables is not seen by this heuristic - confirm whether
    # table-only documents need to be handled.
    return not any(paragraph.text.strip() for paragraph in paragraphs[:check_count])
if __name__ == '__main__':
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
res=is_scanned_pdf(file_path)
pdf_path=r"C:\Users\Administrator\Desktop\ztbfile.pdf"
res=is_scanned_pdf(pdf_path)
if res:
print("扫描型")
else:

View File

@ -1,8 +1,32 @@
import PyPDF2
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
import fitz # PyMuPDF
def extract_text_by_page_fitz(file_path):
    """
    Extract the cleaned text of every page of a PDF using PyMuPDF (fitz).

    Each page's text is passed through ``clean_page_content`` together with
    the header returned by ``extract_common_header``. The cleaned text of
    every page is printed, followed by a separator line carrying the
    zero-based page index, and is also collected into the return value.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned text of all pages that yielded text, concatenated in
        page order. Pages without extractable text are reported on stdout
        and skipped.
    """
    common_header = extract_common_header(file_path)
    page_texts = []
    with fitz.open(file_path) as doc:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")  # plain-text extraction
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Collect the page text; previously this step was commented out,
            # so the function always returned an empty string.
            page_texts.append(cleaned_text)
    return "".join(page_texts)
def extract_text_by_page(file_path):
common_header = extract_common_header(file_path)
# print(common_header)
result = ""
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
@ -117,14 +141,14 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
if __name__ == '__main__':
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
pdf_path=r"C:\Users\Administrator\Downloads\bid_format (1).pdf"
# file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新W改_evaluation_method.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
ress = extract_common_header(pdf_path)
print(ress)
print("-----------------")
res=extract_text_by_page(pdf_path)
# ress = extract_common_header(pdf_path)
# print(ress)
# print("-----------------")
res=extract_text_by_page_fitz(pdf_path)
# print(res)磋商文件_tobidders_notice_part2.pdf
# save_extracted_text_to_txt(file_path,"output.txt")

View File

@ -35,7 +35,7 @@ def judge_zbfile() -> Any: #判断是否是招标文件
start_time = time.time()
downloaded_filename = os.path.join(output_folder, "ztbfile")
logger.info(f"接收到的url:{file_url}")
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
if not downloaded_filepath or file_type == 4:
logger.error("下载地址不存在或不支持的文件类型!")

View File

@ -64,7 +64,7 @@ def download_and_process_file(file_url, zb_type):
return None # 返回 None 以指示失败
# 映射 zb_type如果是 3 则映射为 2
mapped_zb_type = 2 if zb_type == 3 else zb_type
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
if downloaded_filepath is None or file_type == 4:
return None

View File

@ -58,7 +58,7 @@ def process_and_stream(file_url, zb_type):
start_time = time.time()
try:
downloaded = download_file(file_url, downloaded_filename)
downloaded = download_file(file_url, downloaded_filename,True)
if not downloaded:
logger.error("下载文件失败或不支持的文件类型")
log_error_unique_id(unique_id,1) # 记录失败的 unique_id

View File

@ -9,6 +9,7 @@ from docx import Document
from flask_app.general.insert_del_pagemark import insert_mark,delete_mark
from flask_app.general.截取pdf_main import truncate_pdf_multiple
from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.general.读取文件.clean_pdf import is_pure_image
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.general.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values
@ -24,58 +25,75 @@ from flask_app.general.format_change import pdf2docx, docx2pdf
executor = ThreadPoolExecutor()
def preprocess_files(output_folder, file_path, file_type,logger):
logger.info("starting 文件预处理...")
logger.info("output_folder..." + output_folder)
start_time=time.time()
is_pure_image_flag = False
pdf_path = ""
# 根据文件类型处理文件路径
if file_type == 1: # docx
# docx_path = file_path
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
if is_pure_image(file_path):
is_pure_image_flag = True
else:
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
elif file_type == 2: # pdf
pdf_path = file_path
# docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
elif file_type == 3: #doc
pdf_path=docx2pdf(file_path)
# docx_path=doc2docx(downloaded_file_path)
if is_pure_image(file_path):
is_pure_image_flag = True
else:
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
else:
logger.error("Unsupported file type provided. Preprocessing halted.")
return None
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
if not is_pure_image_flag: # 大多数情况 不是纯图片doc/docx
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
else:
truncate_files = ['', '', '', '', '', file_path, '']
# print("切割出的文件:"+str(truncate_files))
# 处理各个部分
notice_path=truncate_files[0] #招标公告
notice_path = truncate_files[0] # 招标公告
evaluation_method = truncate_files[1] #评标方法
evaluation_method = truncate_files[1] # 评标方法
qualification = truncate_files[2] #资格审查
qualification = truncate_files[2] # 资格审查
tobidders_notice_table = truncate_files[3] #投标人须知前附表
tobidders_notice=truncate_files[4] #投标人须知正文
tobidders_notice_table = truncate_files[3] # 投标人须知前附表
tobidders_notice = truncate_files[4] # 投标人须知正文
invalid_path=truncate_files[5] if truncate_files[5] != "" else pdf_path #无效标
invalid_path = truncate_files[5] if truncate_files[5] != "" else pdf_path # 无效标
merged_baseinfo_path = truncate_files[-1]
more_path = [merged_baseinfo_path, tobidders_notice]
merged_baseinfo_path_more = os.path.join(output_folder, "merged_baseinfo_path_more.pdf")
merged_baseinfo_path_more = merge_pdfs(more_path, merged_baseinfo_path_more)
clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json
# invalid_docpath = copy_docx(docx_path) # docx截取无效标部分
# invalid_docpath=pdf2docx(invalid_path)
invalid_added_pdf = insert_mark(invalid_path)
invalid_added_docx = pdf2docx(invalid_added_pdf) #有标记的invalid_path
try:
# 尝试加载 .docx 文件
doc = Document(invalid_added_docx)
print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
merged_baseinfo_path=truncate_files[-1]
more_path=[merged_baseinfo_path,tobidders_notice]
truncate_endtime=time.time()
if not is_pure_image_flag:
invalid_added_pdf = insert_mark(invalid_path)
logger.info(f"文件切分CPU耗时{truncate_endtime - start_time:.2f}")
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
merged_baseinfo_path_more=merge_pdfs(more_path,merged_baseinfo_path_more)
clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json
invalid_added_docx = pdf2docx(invalid_added_pdf) #有标记的invalid_path
try:
# 尝试加载 .docx 文件
doc = Document(invalid_added_docx)
print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
else:
invalid_deleted_docx = file_path
invalid_added_docx = ''
end_time=time.time()
logger.info(f"文件预处理 done耗时{end_time - start_time:.2f}")
logger.info(f"文件预处理耗时:{end_time - start_time:.2f}")
# 返回包含预处理后文件路径的字典
return {

View File

@ -4,6 +4,7 @@ from docx import Document
from flask_app.general.format_change import docx2pdf, pdf2docx
from flask_app.general.insert_del_pagemark import insert_mark, delete_mark
from flask_app.general.json_utils import transform_json_values
from flask_app.general.读取文件.clean_pdf import is_pure_image
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.货物标.基础信息解析货物标版 import combine_basic_info
from flask_app.general.投标人须知正文提取指定内容 import extract_from_notice
@ -17,51 +18,64 @@ from flask_app.general.商务技术评分提取 import combine_evaluation_standa
def preprocess_files(output_folder, file_path, file_type,logger):
logger.info("starting 文件预处理...")
start_time = time.time()
logger.info("output_folder..." + output_folder)
is_pure_image_flag=False #判断是否为纯图片类型的docx
pdf_path=""
# 根据文件类型处理文件路径
if file_type == 1: # docx
# docx_path = file_path
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
if is_pure_image(file_path):
is_pure_image_flag=True
else:
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
elif file_type == 2: # pdf
pdf_path = file_path
# docx_path = pdf2docx(pdf_path)
elif file_type == 3: # doc
pdf_path = docx2pdf(file_path)
# docx_path = doc2docx(file_path)
if is_pure_image(file_path):
is_pure_image_flag = True
else:
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
else:
logger.error("Unsupported file type provided. Preprocessing halted.")
return None
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')
if not is_pure_image_flag: #大多数情况 不是纯图片doc/docx
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')
else:
truncate_files=['','','','','','',file_path,''] #纯图片,无需切片
# print("切割出的文件:"+str(truncate_files))
# 处理各个部分
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
invalid_added_pdf = insert_mark(invalid_path)
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path
try:
# 尝试加载 .docx 文件
doc = Document(invalid_added_docx)
print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
# invalid_docpath = invalid_added_docx # docx截取无效标部分
procurement_path = truncate_files[5] # 采购需求
evaluation_method_path = truncate_files[1] # 评标办法
qualification_path = truncate_files[2] # 资格审查
tobidders_notice_path = truncate_files[4] # 投标人须知正文
notice_path = truncate_files[0] #招标公告
notice_path = truncate_files[0] # 招标公告
merged_baseinfo_path = truncate_files[7] # 合并封面+招标公告+投标人须知前附表+须知正文
clause_path = convert_clause_to_json(tobidders_notice_path, output_folder) # 投标人须知正文条款pdf->json
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
truncate_endtime = time.time()
logger.info(f"文件切分CPU耗时{truncate_endtime - start_time:.2f}")
if not is_pure_image_flag:
invalid_added_pdf = insert_mark(invalid_path)
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path用于废标项提取使用正则。
try:
# 尝试加载 .docx 文件
doc = Document(invalid_added_docx)
# print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
else: #主要是节约了pdf2docx的一块钱
invalid_deleted_docx=file_path
invalid_added_docx='' #由于传入的docx是纯图片型正则是提取不到的需要调用大模型。
end_time = time.time()
logger.info(f"文件预处理 done耗时{end_time - start_time:.2f}")
logger.info(f"文件预处理耗时:{end_time - start_time:.2f}")
# 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象
return {
@ -281,15 +295,16 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
if __name__ == "__main__":
# 配置日志器
unique_id = "uuidzyzy11"
# logger = get_global_logger(unique_id)
logger = get_global_logger(unique_id)
output_folder = "flask_app/static/output/zytest1"
file_type = 1 # 1:docx 2:pdf 3:其他
input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf"
file_type = 2 # 1:docx 2:pdf 3:其他
input_file = r"C:\Users\Administrator\Desktop\fsdownload\e5c8ca13-6043-49e5-a156-685bc1aabb58\ztbfile.pdf"
start_time = time.time()
# preprocess_files(output_folder, input_file, file_type, logger)
# 创建生成器
generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
# 迭代生成器,逐步获取和处理结果
for output in generator:
print(output)

View File

@ -29,7 +29,7 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
# 判断路径是否一致一致表示一开始procurement_path截取为空
if proc_path == invalid_path:
# 读取 PDF 页码数
page_count = get_pdf_page_count(procurement_path)
page_count = get_pdf_page_count(procurement_path) #注意这里的procurement_path可能是docx的invalid_path
if page_count > 60: # 如果页码数大于60,不转markdown
tech_model_type= 2 #long
busi_model_type=3 #long-stream