zbparse/flask_app/old_version/extarct_img_2_txt_old.py

123 lines
5.7 KiB
Python
Raw Normal View History

import json
import os
import fitz
import PyPDF2
import tempfile
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
from flask_app.general.llm.doubao import doubao_model
from flask_app.old_version.table_ocr_old import CommonOcr
def extract_img_table_text(ocr_result_pages):
    """Ask the Doubao model to rebuild table text from OCR output.

    The fixed instruction prompt is followed by ``ocr_result_pages``
    serialized as pretty-printed JSON (non-ASCII characters kept as-is),
    and the combined prompt is passed to ``doubao_model``.

    Args:
        ocr_result_pages: JSON-serializable OCR data; the caller in this
            file passes the ``result.pages`` value of the OCR response.

    Returns:
        Whatever ``doubao_model`` returns for the prompt.
    """
    # Runtime prompt text: kept verbatim, it is part of program behavior.
    instructions = '''
任务你负责解析以json形式输入的表格信息根据提供的文件内容来恢复表格信息并输出不要遗漏任何一个文字
要求与指南
1. 请运用文档表格理解能力根据文件内容定位表格的每列和每行列与列之间用丨分隔若某列或者某行没有信息则用/填充
2. 请不要遗漏任何一个文字同时不要打乱行与行之间的顺序也不要打乱列与列之间的顺序严格按照文字的位置信息来恢复表格信息
示例输出
表格标题
|序号|名称|数量|单位|单价()|总价()|技术参数|备注|
|形象展示区|
|1|公园主E题雕塑|1||/|/|根据江夏体育与文化元素定制|/|
..........
'''
    serialized_pages = json.dumps(ocr_result_pages, ensure_ascii=False, indent=4)
    full_prompt = instructions + "\n\n文件内容:\n" + serialized_pages
    return doubao_model(full_prompt)
def has_images(pdf_path):
    """Find the pages of a PDF that contain images.

    Despite the name, this does not return a boolean: it returns a mapping
    that is empty (falsy) when no page has images.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        dict: 1-based page number -> the non-empty list returned by
        ``page.get_images(full=True)`` for that page.
    """
    pages_with_imgs = {}
    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_document.page_count):
            images = pdf_document.load_page(page_num).get_images(full=True)
            if images:
                # Keys are 1-based page numbers.
                pages_with_imgs[page_num + 1] = images
    finally:
        # The document handle was previously leaked; always release it.
        pdf_document.close()
    return pages_with_imgs
def table_ocr_extract(image_path):
    """Run the general table-recognition OCR on a single image file.

    Args:
        image_path: Path of the image, forwarded to ``CommonOcr`` as
            ``img_path``.

    Returns:
        The value produced by ``CommonOcr.recognize()``.
    """
    return CommonOcr(img_path=image_path).recognize()
def extract_images_from_page(pdf_path, image_list, page_num):
    """Extract the raw bytes of the images listed for one PDF page.

    Best-effort: if opening the PDF or extracting an image fails, the error
    is printed and the images collected so far are returned.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.
        image_list: Image entries for the page; element ``[0]`` of each
            entry is used as the xref passed to ``doc.extract_image``.
        page_num: Page index supplied by the caller; it is stored in each
            result as ``page_num + 1``.

    Returns:
        list[dict]: One dict per extracted image with keys ``'data'``
        (image bytes), ``'ext'`` (file extension, ``'png'`` if the
        extraction result has none) and ``'page_num'``.
    """
    images = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        for img in image_list:
            base_image = doc.extract_image(img[0])
            images.append({
                'data': base_image['image'],
                'ext': base_image.get('ext', 'png'),
                'page_num': page_num + 1,
            })
    except Exception as e:
        print(f"提取图片时出错: {e}")
    finally:
        # The document was previously never closed; release it on every path.
        if doc is not None:
            doc.close()
    return images
def _ocr_image_to_table_text(img):
    """OCR one extracted image and return the table text rebuilt from it.

    The image bytes are written to a temporary file, passed through
    ``table_ocr_extract`` and, when the OCR response has ``code == 200`` and
    at least one page, through ``extract_img_table_text``.

    Args:
        img: Dict with ``'data'`` (image bytes) and ``'ext'`` (extension).

    Returns:
        str: The rebuilt table text, or ``""`` when OCR fails, yields no
        pages, or the rebuilt text is blank.
    """
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.' + img['ext']) as temp_image:
            temp_path = temp_image.name
            temp_image.write(img['data'])
        # The file is closed (and therefore flushed) before OCR reads it by
        # name; reopening a still-open named temp file is not portable.
        ocr_result = json.loads(table_ocr_extract(temp_path))
        if ocr_result['code'] == 200 and len(ocr_result['result']['pages']) > 0:
            print("提取成功,图片数据已提取。")
            table_text = extract_img_table_text(ocr_result['result']['pages'])
            if table_text.strip():
                return table_text
        else:
            print("提取失败或没有页面数据。")
    except Exception as e:
        # Best-effort: one failing image must not abort the whole document.
        print(f"OCR处理失败: {e}")
    finally:
        # Only remove a file that was actually created for THIS image.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError as e:
                print(f"删除临时文件失败: {e}")
    return ""


def pdf_image2txt(file_path, img_pdf_list):
    """Extract a PDF's text, append OCR'd table text, and save the result.

    For every page the text from PyPDF2 is cleaned with
    ``clean_page_content`` using the common header from
    ``extract_common_header``. For pages listed in ``img_pdf_list`` each
    embedded image is OCR'd and any rebuilt table text is appended to that
    page's text after a newline. Page texts are concatenated without a
    separator and written to ``extract.txt`` next to the input file.

    Args:
        file_path: Path of the PDF file.
        img_pdf_list: Mapping of 1-based page number -> image list for that
            page (the shape produced by ``has_images``).

    Returns:
        str: Path of ``extract.txt``. It is returned even if writing failed;
        the ``IOError`` is only printed.
    """
    common_header = extract_common_header(file_path)
    page_texts = []
    # NOTE: the original also opened the file with fitz here but never used
    # or closed that handle; the leaked, unused document has been removed.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        for page_num in range(num_pages):
            page_no = page_num + 1  # img_pdf_list is keyed by 1-based page numbers
            text = reader.pages[page_num].extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
            if page_no in img_pdf_list:
                print(f"{page_no} 页含有图片开始提取图片并OCR")
                images = extract_images_from_page(file_path, img_pdf_list[page_no], page_num)
                for img in images:
                    table_text = _ocr_image_to_table_text(img)
                    if table_text:
                        cleaned_text += "\n" + table_text
            page_texts.append(cleaned_text)
    result = "".join(page_texts)

    directory = os.path.dirname(os.path.abspath(file_path))
    output_path = os.path.join(directory, 'extract.txt')
    try:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result)
        print(f"提取内容已保存到: {output_path}")
    except IOError as e:
        print(f"写入文件时发生错误: {e}")
    return output_path