2.13 处理一些pdf文件无法被pypdf2处理导致死循环的问题,以及纯图片的docx文档解析出现的问题
This commit is contained in:
parent
2a25c28e20
commit
140ef263a9
@ -3,14 +3,17 @@ import os
|
||||
import mimetypes
|
||||
import requests
|
||||
from PyPDF2 import PdfReader
|
||||
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf
|
||||
def download_file(url, local_filename):
|
||||
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf, is_pure_image
|
||||
|
||||
|
||||
def download_file(url, local_filename,enable=False):
|
||||
"""
|
||||
下载文件并保存到本地,基于Content-Type设置文件扩展名。
|
||||
|
||||
参数:
|
||||
- url (str): 文件的URL地址。
|
||||
- local_filename (str): 本地保存的文件名(不含扩展名)。
|
||||
- enable: 是否需要判断为扫描型/纯图片文件
|
||||
|
||||
返回:
|
||||
- tuple: (完整文件名, 文件类型代码)
|
||||
@ -50,7 +53,7 @@ def download_file(url, local_filename):
|
||||
}
|
||||
file_code = extension_mapping.get(extension.lower(), 4)
|
||||
# 如果是 PDF,判断是否为扫描型
|
||||
if extension.lower() == '.pdf':
|
||||
if enable and extension.lower() == '.pdf':
|
||||
print(f"Checking if the PDF is scanned: {full_filename}")
|
||||
if is_scanned_pdf(full_filename):
|
||||
print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
|
||||
@ -154,6 +157,7 @@ def doc2docx(local_path_in):
|
||||
downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
|
||||
print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
|
||||
return downloaded_filepath
|
||||
|
||||
def docx2pdf(local_path_in,force=False):
|
||||
"""
|
||||
将 DOCX 文件转换为 PDF。
|
||||
@ -174,7 +178,7 @@ def docx2pdf(local_path_in,force=False):
|
||||
pdf_file_path = os.path.join(folder, f"{filename}.pdf")
|
||||
if os.path.exists(pdf_file_path):
|
||||
if force:
|
||||
print(f"强制转换,覆盖已存在的文件: {pdf_file_path}")
|
||||
print(f"强制转换,覆盖已存在的文件: {pdf_file_path}") #覆盖掉原来的扫描型pdf
|
||||
else:
|
||||
print(f"跳过转换,文件已存在: {pdf_file_path}")
|
||||
return pdf_file_path # 跳过转换
|
||||
@ -207,11 +211,13 @@ if __name__ == '__main__':
|
||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
||||
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
||||
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
|
||||
local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\有问题的.doc"
|
||||
# downloaded_file=pdf2docx(local_path_in)
|
||||
local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
|
||||
intermediate_docx = pdf2docx(local_path_in)
|
||||
if intermediate_docx:
|
||||
normal_pdf = docx2pdf(intermediate_docx, force=True)
|
||||
# # downloaded_file=pdf2docx(local_path_in)
|
||||
downloaded_file=docx2pdf(local_path_in)
|
||||
print(downloaded_file)
|
||||
# downloaded_file=docx2pdf(local_path_in)
|
||||
# print(downloaded_file)
|
||||
|
||||
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"
|
||||
# local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'
|
||||
@ -224,8 +230,8 @@ if __name__ == '__main__':
|
||||
# # 检查文件类型
|
||||
# if file_type == 4:
|
||||
# print("error")
|
||||
res=pdf2docx(local_path_in)
|
||||
print(res)
|
||||
# res=pdf2docx(local_path_in)
|
||||
# print(res)
|
||||
|
||||
|
||||
|
||||
|
@ -95,7 +95,7 @@ def delete_mark(docx_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
|
||||
input=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(107国道).pdf'
|
||||
input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\ztbfile_invalid.pdf'
|
||||
output=insert_mark(input)
|
||||
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
|
||||
# res=delete_mark(doc_path)
|
||||
|
@ -239,46 +239,15 @@ def qianwen_long_stream(file_id, user_query, max_retries=2, backoff_factor=1.0,
|
||||
if __name__ == "__main__":
|
||||
# Example file path - replace with your actual file path
|
||||
|
||||
file_path = r"C:\Users\Administrator\Desktop\fsdownload\29457826-1e99-4e98-9c90-1cfb5d175579\invalid_del.docx"
|
||||
file_path = r"C:\Users\Administrator\Downloads\2022-广东-鹏华基金管理有限公司深圳深业上城办公室装修项目.pdf"
|
||||
file_id = upload_file(file_path)
|
||||
# print(file_id)
|
||||
user_query1 ="""该招标文件对响应文件(投标文件)偏离项的要求或内容是怎样的?请不要回答具体的技术参数,也不要回答具体的评分要求。请以json格式给我提供信息,外层键名为'偏离',若存在嵌套信息,嵌套内容键名为文件中对应字段或是你的总结,键值为原文对应内容。若文中没有关于偏离项的相关内容,在键值中填'未知'。
|
||||
禁止内容:
|
||||
确保键值内容均基于提供的实际招标文件内容,禁止使用任何预设的示例作为回答。
|
||||
禁止返回markdown格式,请提取具体的偏离相关内容。
|
||||
示例1,嵌套键值对情况:
|
||||
{
|
||||
"偏离":{
|
||||
"技术要求":"以★标示的内容不允许负偏离",
|
||||
"商务要求":"以★标示的内容不允许负偏离"
|
||||
}
|
||||
}
|
||||
示例2,无嵌套键值对情况:
|
||||
{
|
||||
"偏离":"所有参数需在技术响应偏离表内响应,如应答有缺项,且无有效证明材料的,评标委员会有权不予认可,视同负偏离处理"
|
||||
}
|
||||
"""
|
||||
user_query1 ="该招标文件的项目编号是什么?"
|
||||
|
||||
# # res1,res2=qianwen_long_stream(file_id,user_query1,2,1,True)
|
||||
# res1,res2= qianwen_long_stream(file_id, user_query1, 2, 1,True)
|
||||
res=qianwen_long(file_id,user_query1)
|
||||
# print(res1)
|
||||
# print(res2)
|
||||
# res=qianwen_plus(user_query1)
|
||||
print(res)
|
||||
#
|
||||
#
|
||||
# user_query2 = ("请提供文件中关于资格审查的具体内容和标准。")
|
||||
# start_time=time.time()
|
||||
# # First query
|
||||
# print("starting qianwen-long...")
|
||||
# result1 ,result2= qianwen_long(file_id, user_query1)
|
||||
# print("First Query Result:", result1)
|
||||
# print(type(result1))
|
||||
# print(result2)
|
||||
# # Second query
|
||||
# print("starting qianwen-long...")
|
||||
# result2 = qianwen_long(file_id, user_query2)
|
||||
# print("Second Query Result:", result2)
|
||||
# end_time=time.time()
|
||||
# print("elapsed time:"+str(end_time-start_time))
|
||||
|
||||
|
@ -313,14 +313,14 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
||||
-请根据你对招投标业务的熟悉,对表格中的评分因素进行准确分类。关键是确保每个评分因素都能被归类到'技术评分'或'商务评分'或'投标报价评分',不要遗漏任何一个评分因素。
|
||||
|
||||
**特殊情况**:
|
||||
1. 缺失评分项:若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为字符串'本项目无xx评分项',例如"技术评分":"本项目无技术评分项",而非默认的字典格式。
|
||||
1. 缺失评分项:若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为字符串'本项目无xx评分项',例如"技术评分":"本项目无技术评分项",而非默认的字典格式,请基于提供的实际招标文件内容,禁止捏造回答。
|
||||
2. 其他评分:默认情况大项评分仅有'技术评分''商务评分''投标报价评分',若在充分归类之后,表格中仍有评分因素未被归类,才添加大项评分'其他评分'保存该内容。
|
||||
3. 多包评分:默认只有一包,最外层键名为各大评分项,而不是'一包';但是如果该招标、采购活动有多个分包且每个分包有独自的评分表,则最外层键名为对应的包名,如'一包''二包',内部才是各大评分项。
|
||||
4. 多张技术评分表:若同一包下有多张技术评分表,请不要遗漏任何一个评分表的信息,此时最外层键名'技术评分'替换为'技术评分-d',d为自然数,从1开始,分别保存每张技术评分表的信息。
|
||||
-例如有'技术评分标准(1)其他项目','技术评分标准(2)施工类',算作两个技术评分表,最外层的键名分别为'技术评分-1''技术评分-2',替换默认的'技术评分'
|
||||
|
||||
**禁止内容**:
|
||||
1. 确保所有输出内容均基于提供的实际招标文件内容(除了最外层的三个评分大项名称),不使用任何预设的示例作为回答。
|
||||
1. 确保所有输出内容均基于提供的实际招标文件内容(除了最外层的三个评分大项名称),不使用任何预设的示例作为回答,也禁止捏造评分标准。
|
||||
2. 不得擅自添加不属于评审因素的键名以及 `'备注'` 之外的其他键名。
|
||||
3. 不得遗漏评分表中的任何评分因素,确保每个评分因素都被正确归类到评分大项下。
|
||||
"""
|
||||
@ -447,12 +447,12 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
||||
|
||||
# 定义用户查询
|
||||
query = (
|
||||
"""根据该文档,你判断它是否有关于技术评分或商务评分或投标报价的具体的评分及要求,如果有,返回'是',否则返回'否'。
|
||||
要求与指南:
|
||||
1. 评分要求主要以表格形式呈现,且有评分因素及评分要求、标准,其中评分因素可以是笼统的评分大项如'技术评分'或'商务评分'。
|
||||
2. 竞争性磋商文件通常无评分要求,但若满足'1.'的内容,也请返回'是'。
|
||||
3. 仅返回'是'或'否',不需要其他解释或内容。
|
||||
"""
|
||||
"""请根据以下指南判断该文档是否包含关于技术评分、商务评分或投标报价的具体评分要求和标准:
|
||||
1. 若文档中以表格形式展示了评分要求,且包含评分因素(如“技术评分”或“商务评分”或更细的评分因素)及相应的评分标准;即使评分方式为定性(无具体分值),也应视为满足要求。
|
||||
2. 如果文档中仅描述了评标流程,但未提供具体的评分标准,则应视为不满足条件。
|
||||
3. 虽然竞争性磋商文件通常不包含评分要求,但若文档满足第1条的内容,也应视为符合要求。
|
||||
请仅返回“是”或“否”,不附加其他解释或内容。
|
||||
"""
|
||||
) # 应对竞争性谈判这种无评分要求的情况
|
||||
|
||||
# 执行查询
|
||||
@ -505,6 +505,7 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
||||
|
||||
try:
|
||||
judge_res, file_id = run_first_query(evaluation_method_path)
|
||||
print(judge_res)
|
||||
eval_path = os.path.abspath(evaluation_method_path)
|
||||
invalid_eval_path = os.path.abspath(invalid_path)
|
||||
# 获取 evaluation_method_path 所在的目录
|
||||
@ -556,8 +557,8 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
||||
if __name__ == "__main__":
|
||||
start_time=time.time()
|
||||
# truncate_file=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2\2024-新疆-塔城地区公安局食药环分局快检实验室项目_evaluation_method.pdf"
|
||||
evaluation_method_path = r'C:\Users\Administrator\Desktop\货物标\output2\招标文件-通产丽星高端化妆品研发生产总部基地高低压配电工程_evaluation_method11.pdf'
|
||||
invalid_path=r'C:\Users\Administrator\Desktop\货物标\output2\招标文件-通产丽星高端化妆品研发生产总部基地高低压配电工程_evaluation_method11.pdf'
|
||||
evaluation_method_path = r'C:\Users\Administrator\Downloads\2022-广东-鹏华基金管理有限公司深圳深业上城办公室装修项目.pdf'
|
||||
invalid_path=r'C:\Users\Administrator\Downloads\2022-广东-鹏华基金管理有限公司深圳深业上城办公室装修项目.pdf'
|
||||
# truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件(统计局智能终端二次招标)_evaluation_method.pdf"
|
||||
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新(W改)_evaluation_method.pdf"
|
||||
# truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\2d481945-1f82-45a5-8e56-7fafea4a7793\\ztbfile_evaluation_method.pdf"
|
||||
|
@ -118,14 +118,14 @@ if __name__ == "__main__":
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\d4f30cc2-1643-4576-bfb1-97a2f1e5ba51\ztbfile.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0aade992-dc8e-4690-9b09-78630a34c6e7\ztbfile.pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\d4f30cc2-1643-4576-bfb1-97a2f1e5ba51\tmp"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0aade992-dc8e-4690-9b09-78630a34c6e7"
|
||||
# selections = [1, 4] # 仅处理 selection 4、1
|
||||
# selections = [1, 2, 3, 5]
|
||||
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
|
||||
print(files)
|
||||
# print(files[-1])
|
||||
# print(files[-2])
|
||||
|
@ -1,7 +1,8 @@
|
||||
import re
|
||||
import threading
|
||||
import fitz
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
from docx import Document
|
||||
def extract_common_header(pdf_path):
|
||||
"""
|
||||
提取 PDF 文件的公共页眉。
|
||||
@ -180,30 +181,119 @@ def clean_page_content(text, common_header):
|
||||
text = re.sub(r'^\s*[—-]\s*(第\s*)?\d{1,3}\s*(页)?\s*[—-]\s*', '', text) # 删除形如 '—2—', '-2-', 或 '-第2页-' 的页码
|
||||
return text
|
||||
|
||||
def is_scanned_pdf(file_path, max_pages=15):
|
||||
def read_page_text_with_timeout(page, timeout=3):
|
||||
"""
|
||||
检查 PDF 是否为扫描件(即前 15 页无文本)。
|
||||
|
||||
参数:
|
||||
- file_path: PDF 文件路径。
|
||||
- max_pages: 最大检查页数,默认为 15 页。
|
||||
|
||||
返回:
|
||||
- True: 如果前 15 页都没有文本,认为是扫描件。
|
||||
- False: 如果有任何页有文本,认为不是扫描件。
|
||||
尝试在子线程里执行 page.extract_text()。
|
||||
如果超过 `timeout` 秒没有完成,就返回 None,表示超时。
|
||||
如果正常完成,就返回提取到的文本。
|
||||
"""
|
||||
with open(file_path, 'rb') as file:
|
||||
reader = PdfReader(file)
|
||||
for i, page in enumerate(reader.pages):
|
||||
if i >= max_pages: # 超过最大检查页数,停止检查
|
||||
break
|
||||
if page.extract_text().strip(): # 如果有文本
|
||||
return False # 不是扫描型
|
||||
return True # 前 max_pages 页都没有文本
|
||||
done = threading.Event()
|
||||
result = []
|
||||
|
||||
def wrapper():
|
||||
try:
|
||||
txt = page.extract_text() # 可能会卡住的操作
|
||||
result.append(txt)
|
||||
except Exception as e:
|
||||
print("提取文本时出错:", e)
|
||||
result.append("")
|
||||
finally:
|
||||
done.set()
|
||||
|
||||
# 启动后台线程
|
||||
thread = threading.Thread(target=wrapper, daemon=True)
|
||||
thread.start()
|
||||
|
||||
# 等待提取结果,超过 timeout 秒则视为超时
|
||||
finished_in_time = done.wait(timeout)
|
||||
if not finished_in_time:
|
||||
print(f"单页文本提取超时(超过 {timeout} 秒)!")
|
||||
return None
|
||||
|
||||
return result[0] if result else ""
|
||||
|
||||
def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
    """
    Decide whether a PDF looks like a scanned (image-only) document.

    The check runs in a daemon thread so the caller never waits longer than
    ``overall_timeout`` seconds, even if the PDF parser hangs.

    Rules, applied page by page to at most ``max_pages`` pages:
      1. If extracting a page's text times out (``page_timeout`` seconds),
         the file is treated as scanned.
      2. If a page yields non-whitespace text, the file is not scanned.
      3. If no inspected page yields text, the file is treated as scanned.
      4. If opening or parsing the file raises, it is treated as scanned.

    Args:
        file_path: Path of the PDF file to inspect.
        max_pages: Maximum number of leading pages to inspect.
        page_timeout: Seconds allowed for extracting one page's text.
        overall_timeout: Seconds allowed for the whole check.

    Returns:
        bool: True when the PDF is considered scanned (also the default when
        the overall timeout expires), False when text was found.
    """
    verdict = [None]  # single slot used to hand the result back from the worker
    finished = threading.Event()

    def inspect_pages():
        # Walks the leading pages and returns the scanned / not-scanned verdict.
        try:
            with open(file_path, 'rb') as handle:
                reader = PdfReader(handle)
                for index, page in enumerate(reader.pages):
                    if index >= max_pages:
                        break

                    # Per-page extraction is bounded by page_timeout.
                    text = read_page_text_with_timeout(page, page_timeout)
                    if text is None:
                        print(f"第 {index+1} 页文本提取超时,直接判定为扫描件。")
                        return True

                    if text.strip():
                        print(f"第 {index+1} 页检测到文本,判定为非扫描件。")
                        return False
        except Exception as e:
            print("处理 PDF 文件时发生异常:", e)
            return True

        print(f"前 {max_pages} 页均未检测到文本,判定为扫描件。")
        return True

    def worker():
        # Publish the verdict first, then signal the waiting caller.
        verdict[0] = inspect_pages()
        finished.set()

    threading.Thread(target=worker, daemon=True).start()

    # Wait for the whole check, but never longer than overall_timeout seconds.
    if not finished.wait(overall_timeout):
        print(f"整体检查超时(超过 {overall_timeout} 秒),返回默认结果。")
        return True  # default verdict when the check does not finish in time

    # Final verdict, printed for debugging.
    if verdict[0]:
        print("最终结果:认为是扫描件(未检测到有效文本或发生单页超时)")
    else:
        print("最终结果:认为非扫描件(检测到有效文本)")
    return verdict[0]
|
||||
|
||||
def is_pure_image(docx_path, percentage=0.3):
    """
    Report whether a docx file appears to contain only images (no text).

    Only the leading ``percentage`` share of the paragraphs exposed by
    ``Document(docx_path).paragraphs`` is sampled, and at least one paragraph
    is always requested. The file counts as pure-image when none of the
    sampled paragraphs contains non-whitespace text.

    Args:
        docx_path: Path of the docx file to open with ``Document``.
        percentage: Fraction of the paragraphs, counted from the start, that
            is inspected. Defaults to 0.3 (30%).

    Returns:
        bool: False if any sampled paragraph has text, True otherwise
        (including when the document exposes no paragraphs at all).
    """
    all_paragraphs = Document(docx_path).paragraphs
    # Sample size: the requested share of the paragraphs, never fewer than one.
    sample_size = max(1, int(len(all_paragraphs) * percentage))
    sampled = all_paragraphs[:sample_size]
    return not any(para.text.strip() for para in sampled)
|
||||
|
||||
if __name__ == '__main__':
|
||||
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
||||
res=is_scanned_pdf(file_path)
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\ztbfile.pdf"
|
||||
res=is_scanned_pdf(pdf_path)
|
||||
if res:
|
||||
print("扫描型")
|
||||
else:
|
||||
|
@ -1,8 +1,32 @@
|
||||
import PyPDF2
|
||||
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
|
||||
import fitz # PyMuPDF
|
||||
|
||||
def extract_text_by_page_fitz(file_path):
    """
    Extract the cleaned text of every page of a PDF using PyMuPDF (fitz).

    Each page's text is passed through ``clean_page_content`` together with
    the header returned by ``extract_common_header``. The cleaned text is
    printed page by page for inspection and also collected for the caller.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The cleaned text of all pages that yielded text, concatenated in
        page order. Pages with no extractable text are reported on stdout and
        contribute nothing.
    """
    common_header = extract_common_header(file_path)
    cleaned_pages = []

    with fitz.open(file_path) as doc:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")  # plain-text extraction
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue

            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Previously the accumulation was commented out, so the function
            # always returned an empty string; collect the text so the return
            # value reflects the document.
            cleaned_pages.append(cleaned_text)

    return "".join(cleaned_pages)
|
||||
|
||||
|
||||
def extract_text_by_page(file_path):
|
||||
common_header = extract_common_header(file_path)
|
||||
# print(common_header)
|
||||
result = ""
|
||||
with open(file_path, 'rb') as file:
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
@ -117,14 +141,14 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||
pdf_path=r"C:\Users\Administrator\Downloads\bid_format (1).pdf"
|
||||
# file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新(W改)_evaluation_method.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
||||
ress = extract_common_header(pdf_path)
|
||||
print(ress)
|
||||
print("-----------------")
|
||||
res=extract_text_by_page(pdf_path)
|
||||
# ress = extract_common_header(pdf_path)
|
||||
# print(ress)
|
||||
# print("-----------------")
|
||||
res=extract_text_by_page_fitz(pdf_path)
|
||||
# print(res)磋商文件_tobidders_notice_part2.pdf
|
||||
# save_extracted_text_to_txt(file_path,"output.txt")
|
||||
|
@ -35,7 +35,7 @@ def judge_zbfile() -> Any: #判断是否是招标文件
|
||||
start_time = time.time()
|
||||
downloaded_filename = os.path.join(output_folder, "ztbfile")
|
||||
logger.info(f"接收到的url:{file_url}")
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
|
||||
|
||||
if not downloaded_filepath or file_type == 4:
|
||||
logger.error("下载地址不存在或不支持的文件类型!")
|
||||
|
@ -64,7 +64,7 @@ def download_and_process_file(file_url, zb_type):
|
||||
return None # 返回 None 以指示失败
|
||||
# 映射 zb_type,如果是 3 则映射为 2
|
||||
mapped_zb_type = 2 if zb_type == 3 else zb_type
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
|
||||
|
||||
if downloaded_filepath is None or file_type == 4:
|
||||
return None
|
||||
|
@ -58,7 +58,7 @@ def process_and_stream(file_url, zb_type):
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
downloaded = download_file(file_url, downloaded_filename)
|
||||
downloaded = download_file(file_url, downloaded_filename,True)
|
||||
if not downloaded:
|
||||
logger.error("下载文件失败或不支持的文件类型")
|
||||
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
|
||||
|
@ -9,6 +9,7 @@ from docx import Document
|
||||
from flask_app.general.insert_del_pagemark import insert_mark,delete_mark
|
||||
from flask_app.general.截取pdf_main import truncate_pdf_multiple
|
||||
from flask_app.general.merge_pdfs import merge_pdfs
|
||||
from flask_app.general.读取文件.clean_pdf import is_pure_image
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.general.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
@ -24,58 +25,75 @@ from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||
executor = ThreadPoolExecutor()
|
||||
def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
logger.info("starting 文件预处理...")
|
||||
logger.info("output_folder..." + output_folder)
|
||||
start_time=time.time()
|
||||
is_pure_image_flag = False
|
||||
pdf_path = ""
|
||||
# 根据文件类型处理文件路径
|
||||
if file_type == 1: # docx
|
||||
# docx_path = file_path
|
||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||
if is_pure_image(file_path):
|
||||
is_pure_image_flag = True
|
||||
else:
|
||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||
elif file_type == 2: # pdf
|
||||
pdf_path = file_path
|
||||
# docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
||||
elif file_type == 3: #doc
|
||||
pdf_path=docx2pdf(file_path)
|
||||
# docx_path=doc2docx(downloaded_file_path)
|
||||
if is_pure_image(file_path):
|
||||
is_pure_image_flag = True
|
||||
else:
|
||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||
else:
|
||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||
return None
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
|
||||
if not is_pure_image_flag: # 大多数情况 不是纯图片doc/docx
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
|
||||
else:
|
||||
truncate_files = ['', '', '', '', '', file_path, '']
|
||||
# print("切割出的文件:"+str(truncate_files))
|
||||
|
||||
# 处理各个部分
|
||||
notice_path=truncate_files[0] #招标公告
|
||||
notice_path = truncate_files[0] # 招标公告
|
||||
|
||||
evaluation_method = truncate_files[1] #评标方法
|
||||
evaluation_method = truncate_files[1] # 评标方法
|
||||
|
||||
qualification = truncate_files[2] #资格审查
|
||||
qualification = truncate_files[2] # 资格审查
|
||||
|
||||
tobidders_notice_table = truncate_files[3] #投标人须知前附表
|
||||
tobidders_notice=truncate_files[4] #投标人须知正文
|
||||
tobidders_notice_table = truncate_files[3] # 投标人须知前附表
|
||||
tobidders_notice = truncate_files[4] # 投标人须知正文
|
||||
|
||||
invalid_path=truncate_files[5] if truncate_files[5] != "" else pdf_path #无效标
|
||||
invalid_path = truncate_files[5] if truncate_files[5] != "" else pdf_path # 无效标
|
||||
|
||||
merged_baseinfo_path = truncate_files[-1]
|
||||
more_path = [merged_baseinfo_path, tobidders_notice]
|
||||
merged_baseinfo_path_more = os.path.join(output_folder, "merged_baseinfo_path_more.pdf")
|
||||
merged_baseinfo_path_more = merge_pdfs(more_path, merged_baseinfo_path_more)
|
||||
|
||||
clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json
|
||||
# invalid_docpath = copy_docx(docx_path) # docx截取无效标部分
|
||||
# invalid_docpath=pdf2docx(invalid_path)
|
||||
invalid_added_pdf = insert_mark(invalid_path)
|
||||
invalid_added_docx = pdf2docx(invalid_added_pdf) #有标记的invalid_path
|
||||
try:
|
||||
# 尝试加载 .docx 文件
|
||||
doc = Document(invalid_added_docx)
|
||||
print("yes")
|
||||
except Exception as e:
|
||||
# 捕获异常并打印错误信息
|
||||
invalid_added_docx=pdf2docx(invalid_path)
|
||||
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
|
||||
if not invalid_deleted_docx:
|
||||
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||
merged_baseinfo_path=truncate_files[-1]
|
||||
more_path=[merged_baseinfo_path,tobidders_notice]
|
||||
truncate_endtime=time.time()
|
||||
if not is_pure_image_flag:
|
||||
invalid_added_pdf = insert_mark(invalid_path)
|
||||
logger.info(f"文件切分CPU耗时:{truncate_endtime - start_time:.2f} 秒")
|
||||
|
||||
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
|
||||
merged_baseinfo_path_more=merge_pdfs(more_path,merged_baseinfo_path_more)
|
||||
clause_path = convert_clause_to_json(tobidders_notice, output_folder) # 投标人须知正文条款pdf->json
|
||||
invalid_added_docx = pdf2docx(invalid_added_pdf) #有标记的invalid_path
|
||||
try:
|
||||
# 尝试加载 .docx 文件
|
||||
doc = Document(invalid_added_docx)
|
||||
print("yes")
|
||||
except Exception as e:
|
||||
# 捕获异常并打印错误信息
|
||||
invalid_added_docx=pdf2docx(invalid_path)
|
||||
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
|
||||
if not invalid_deleted_docx:
|
||||
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||
else:
|
||||
invalid_deleted_docx = file_path
|
||||
invalid_added_docx = ''
|
||||
end_time=time.time()
|
||||
logger.info(f"文件预处理 done,耗时:{end_time - start_time:.2f} 秒")
|
||||
logger.info(f"文件预处理总耗时:{end_time - start_time:.2f} 秒")
|
||||
|
||||
# 返回包含预处理后文件路径的字典
|
||||
return {
|
||||
|
@ -4,6 +4,7 @@ from docx import Document
|
||||
from flask_app.general.format_change import docx2pdf, pdf2docx
|
||||
from flask_app.general.insert_del_pagemark import insert_mark, delete_mark
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.general.读取文件.clean_pdf import is_pure_image
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.货物标.基础信息解析货物标版 import combine_basic_info
|
||||
from flask_app.general.投标人须知正文提取指定内容 import extract_from_notice
|
||||
@ -17,51 +18,64 @@ from flask_app.general.商务技术评分提取 import combine_evaluation_standa
|
||||
def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
logger.info("starting 文件预处理...")
|
||||
start_time = time.time()
|
||||
logger.info("output_folder..." + output_folder)
|
||||
is_pure_image_flag=False #判断是否为纯图片类型的docx
|
||||
pdf_path=""
|
||||
# 根据文件类型处理文件路径
|
||||
if file_type == 1: # docx
|
||||
# docx_path = file_path
|
||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||
if is_pure_image(file_path):
|
||||
is_pure_image_flag=True
|
||||
else:
|
||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||
elif file_type == 2: # pdf
|
||||
pdf_path = file_path
|
||||
# docx_path = pdf2docx(pdf_path)
|
||||
elif file_type == 3: # doc
|
||||
pdf_path = docx2pdf(file_path)
|
||||
# docx_path = doc2docx(file_path)
|
||||
if is_pure_image(file_path):
|
||||
is_pure_image_flag = True
|
||||
else:
|
||||
pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理
|
||||
else:
|
||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||
return None
|
||||
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')
|
||||
|
||||
if not is_pure_image_flag: #大多数情况 不是纯图片doc/docx
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')
|
||||
else:
|
||||
truncate_files=['','','','','','',file_path,''] #纯图片,无需切片
|
||||
# print("切割出的文件:"+str(truncate_files))
|
||||
# 处理各个部分
|
||||
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
|
||||
|
||||
invalid_added_pdf = insert_mark(invalid_path)
|
||||
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path
|
||||
try:
|
||||
# 尝试加载 .docx 文件
|
||||
doc = Document(invalid_added_docx)
|
||||
print("yes")
|
||||
except Exception as e:
|
||||
# 捕获异常并打印错误信息
|
||||
invalid_added_docx=pdf2docx(invalid_path)
|
||||
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
|
||||
if not invalid_deleted_docx:
|
||||
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||
|
||||
# invalid_docpath = invalid_added_docx # docx截取无效标部分
|
||||
procurement_path = truncate_files[5] # 采购需求
|
||||
evaluation_method_path = truncate_files[1] # 评标办法
|
||||
qualification_path = truncate_files[2] # 资格审查
|
||||
tobidders_notice_path = truncate_files[4] # 投标人须知正文
|
||||
notice_path = truncate_files[0] #招标公告
|
||||
notice_path = truncate_files[0] # 招标公告
|
||||
merged_baseinfo_path = truncate_files[7] # 合并封面+招标公告+投标人须知前附表+须知正文
|
||||
clause_path = convert_clause_to_json(tobidders_notice_path, output_folder) # 投标人须知正文条款pdf->json
|
||||
|
||||
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
|
||||
truncate_endtime = time.time()
|
||||
logger.info(f"文件切分CPU耗时:{truncate_endtime - start_time:.2f} 秒")
|
||||
if not is_pure_image_flag:
|
||||
invalid_added_pdf = insert_mark(invalid_path)
|
||||
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path,用于废标项提取,使用正则。
|
||||
try:
|
||||
# 尝试加载 .docx 文件
|
||||
doc = Document(invalid_added_docx)
|
||||
# print("yes")
|
||||
except Exception as e:
|
||||
# 捕获异常并打印错误信息
|
||||
invalid_added_docx=pdf2docx(invalid_path)
|
||||
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
|
||||
if not invalid_deleted_docx:
|
||||
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||
else: #主要是节约了pdf2docx的一块钱
|
||||
invalid_deleted_docx=file_path
|
||||
invalid_added_docx='' #由于传入的docx是纯图片型,正则是提取不到的,需要调用大模型。
|
||||
end_time = time.time()
|
||||
logger.info(f"文件预处理 done,耗时:{end_time - start_time:.2f} 秒")
|
||||
logger.info(f"文件预处理总耗时:{end_time - start_time:.2f} 秒")
|
||||
|
||||
# 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象
|
||||
return {
|
||||
@ -281,15 +295,16 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
if __name__ == "__main__":
|
||||
# 配置日志器
|
||||
unique_id = "uuidzyzy11"
|
||||
# logger = get_global_logger(unique_id)
|
||||
logger = get_global_logger(unique_id)
|
||||
|
||||
output_folder = "flask_app/static/output/zytest1"
|
||||
file_type = 1 # 1:docx 2:pdf 3:其他
|
||||
input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf"
|
||||
file_type = 2 # 1:docx 2:pdf 3:其他
|
||||
input_file = r"C:\Users\Administrator\Desktop\fsdownload\e5c8ca13-6043-49e5-a156-685bc1aabb58\ztbfile.pdf"
|
||||
start_time = time.time()
|
||||
|
||||
# preprocess_files(output_folder, input_file, file_type, logger)
|
||||
# 创建生成器
|
||||
generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
|
||||
|
||||
# 迭代生成器,逐步获取和处理结果
|
||||
for output in generator:
|
||||
print(output)
|
||||
|
@ -29,7 +29,7 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
|
||||
# 判断路径是否一致,一致表示一开始procurement_path截取为空
|
||||
if proc_path == invalid_path:
|
||||
# 读取 PDF 页码数
|
||||
page_count = get_pdf_page_count(procurement_path)
|
||||
page_count = get_pdf_page_count(procurement_path) #注意这里的procurement_path可能是docx的invalid_path
|
||||
if page_count > 60: # 如果页码数大于60,不转markdown
|
||||
tech_model_type= 2 #long
|
||||
busi_model_type=3 #long-stream
|
||||
|
Loading…
x
Reference in New Issue
Block a user