zbparse/flask_app/general/format_change.py

236 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import mimetypes
import requests
from PyPDF2 import PdfReader
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf, is_pure_image
def _extension_from_content_type(content_type):
    """Translate a raw Content-Type header value into a file extension.

    Falls back to '.docx' when the media type is unknown, which is the
    default the downloader has always used.
    """
    # Drop parameters such as "; charset=utf-8": mimetypes only recognises
    # the bare media type, so leaving them in makes every lookup miss.
    media_type = (content_type or '').split(';', 1)[0].strip().lower()
    # Pin the three formats this module cares about so the result does not
    # depend on the platform's MIME database.
    known_types = {
        'application/pdf': '.pdf',
        'application/msword': '.doc',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
    }
    if media_type in known_types:
        return known_types[media_type]
    return mimetypes.guess_extension(media_type, strict=False) or '.docx'


def download_file(url, local_filename, enable=False, timeout=(10, 120)):
    """
    Download a file and save it locally, choosing the extension from Content-Type.

    Parameters:
    - url (str): URL of the file to download.
    - local_filename (str): Local target path; its extension is replaced when
      it does not match the one derived from the response's Content-Type.
    - enable (bool): When True and the download is a PDF, check whether it is
      a scanned PDF and, if so, round-trip it through pdf2docx -> docx2pdf and
      replace the downloaded file with the result.
    - timeout: Passed to requests.get as its ``timeout`` argument
      ((connect, read) seconds, a single number, or None to wait forever).

    Returns:
    - tuple: (full file name, file type code)
      File type codes:
        1 - .docx
        2 - .pdf
        3 - .doc
        4 - anything else
      ("", 4) is returned when the download (or the follow-up conversion) fails.
    """
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # raise on 4xx/5xx responses
            extension = _extension_from_content_type(response.headers.get('Content-Type', ''))
            extension_lower = extension.lower()
            # Build the final file name from the caller-supplied name.
            base, current_ext = os.path.splitext(local_filename)
            if current_ext.lower() != extension_lower:
                full_filename = base + extension
            else:
                full_filename = local_filename
            # Stream into a sibling ".part" file and rename on success, so an
            # interrupted download never leaves a truncated file under the
            # final name (the converters treat an existing file as finished).
            partial_filename = full_filename + '.part'
            try:
                with open(partial_filename, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip empty chunks
                            file.write(chunk)
                os.replace(partial_filename, full_filename)
            finally:
                # Only still present when the download did not complete.
                if os.path.exists(partial_filename):
                    os.remove(partial_filename)
        # Map the extension to the numeric file type code.
        extension_mapping = {
            '.docx': 1,
            '.pdf': 2,
            '.doc': 3
        }
        file_code = extension_mapping.get(extension_lower, 4)
        # For PDFs, optionally detect scanned documents and normalise them.
        if enable and extension_lower == '.pdf':
            print(f"Checking if the PDF is scanned: {full_filename}")
            if is_scanned_pdf(full_filename):
                print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
                # scanned PDF -> DOCX -> normal PDF
                intermediate_docx = pdf2docx(full_filename)
                if intermediate_docx:
                    normal_pdf = docx2pdf(intermediate_docx, force=True)
                    if normal_pdf:
                        # Replace the original PDF with the converted one.
                        os.replace(normal_pdf, full_filename)
                        print(f"Replaced scanned PDF with normal PDF: {full_filename}")
        return full_filename, file_code
    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        print(f"download: 发生错误: {e}")
    # Every failure path ends here so callers can always unpack a 2-tuple.
    return "", 4
def local_file_2_url(file_path, url):
    """
    Upload a local file to a conversion endpoint and return the resulting URL.

    The file is sent as multipart/form-data under the field name "file". On
    an HTTP 200 response the body is parsed as UTF-8 JSON and its "data"
    field is returned.

    Parameters:
    - file_path (str): Path of the local file to upload.
    - url (str): Endpoint that receives the upload.

    Returns:
    - The value of the response's "data" field, or "" when the request fails,
      the status code is not 200, or the body cannot be parsed.

    Raises:
    - OSError: if the local file cannot be opened.
    """
    # os.path.basename handles the platform's separators; splitting on '/'
    # alone leaves the whole path in place for backslash-separated paths.
    filename = os.path.basename(file_path)
    try:
        with open(file_path, 'rb') as f:
            # Send the file as multipart/form-data.
            files = {'file': (filename, f)}
            response = requests.post(url, files=files)
    except requests.RequestException as e:
        # Keep the "empty URL means failure" contract for network errors too.
        print(f"format_change upload request failed: {e}")
        return ""
    if response.status_code != 200:
        print(f"format_change 文件上传失败,状态码: {response.status_code}")
        print(f"format_change {response.text}")
        return ""
    print("format_change 文件上传成功")
    try:
        receive_file_json = json.loads(response.content.decode('utf-8'))
        receive_file_url = receive_file_json["data"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers both invalid UTF-8 and invalid JSON.
        print(f"format_change unexpected upload response: {e}")
        return ""
    # A null "data" field is reported as the same empty-URL failure.
    return receive_file_url or ""
def get_filename_and_folder(file_path):
    """
    Split a path into a dot-free file stem and its containing directory.

    The extension is removed from the file name and any remaining dots in
    the stem are replaced with underscores, so later extension handling
    cannot mistake part of the name for an extension.

    Returns:
    - tuple: (stem with '.' replaced by '_', directory part of the path)
    """
    folder, leaf = os.path.split(file_path)
    stem = os.path.splitext(leaf)[0]
    return stem.replace('.', '_'), folder
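# The converters below take the path of the file to convert; the output file name is derived from it and the extension is filled in automatically.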
def pdf2docx(local_path_in):
    """
    Convert a PDF to DOCX through the remote conversion service.

    The input is uploaded with local_file_2_url and the converted file is
    fetched with download_file into the same folder as the input. If a
    .docx with the same (dot-sanitised) stem already exists there, it is
    reused and no conversion is requested.

    Parameters:
    - local_path_in (str): Path of the PDF to convert.

    Returns:
    - str: Path of the DOCX file, or "" when the input is empty or the
      upload/download fails.
    """
    if not local_path_in:
        return ""
    # Output lives next to the input, named after its sanitised stem.
    filename, folder = get_filename_and_folder(local_path_in)
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
    receive_download_url = local_file_2_url(local_path_in, remote_url)
    if not receive_download_url:
        # The upload failed; do not attempt a download with an empty URL.
        print(f"format_change p2d:no download url returned for: {local_path_in}")
        return ""
    # Target name without extension; download_file appends the extension.
    local_filename = os.path.join(folder, filename)
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def doc2docx(local_path_in):
    """
    Convert a DOC file to DOCX through the remote conversion service.

    The input is uploaded with local_file_2_url and the converted file is
    fetched with download_file into the same folder as the input. If a
    .docx with the same (dot-sanitised) stem already exists there, it is
    reused and no conversion is requested.

    Parameters:
    - local_path_in (str): Path of the DOC file to convert.

    Returns:
    - str: Path of the DOCX file, or "" when the input is empty or the
      upload/download fails.
    """
    if not local_path_in:
        return ""
    # Input and output share a folder; the stem is computed once and reused.
    filename, folder = get_filename_and_folder(local_path_in)
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
    receive_download_url = local_file_2_url(local_path_in, remote_url)
    print(receive_download_url)
    if not receive_download_url:
        # The upload failed; do not attempt a download with an empty URL.
        print(f"format_change d2d:no download url returned for: {local_path_in}")
        return ""
    # Target name without extension; download_file appends the extension.
    local_filename = os.path.join(folder, filename)
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def docx2pdf(local_path_in, force=False):
    """
    Convert a DOCX file to PDF through the remote conversion service.

    Parameters:
    - local_path_in (str): Path of the input DOCX file.
    - force (bool): When True, convert even if a PDF with the same stem
      already exists in the input's folder, overwriting it. Defaults to
      False, in which case an existing PDF is returned as-is.

    Returns:
    - str: Path of the resulting PDF file.
    - "" when the input is empty or the upload/download fails.
    """
    if not local_path_in:
        return ""
    # Input and output share a folder; the stem is computed once and reused.
    filename, folder = get_filename_and_folder(local_path_in)
    pdf_file_path = os.path.join(folder, f"{filename}.pdf")
    if os.path.exists(pdf_file_path):
        if force:
            # Used to overwrite the original scanned PDF.
            print(f"强制转换,覆盖已存在的文件: {pdf_file_path}")
        else:
            print(f"跳过转换,文件已存在: {pdf_file_path}")
            return pdf_file_path
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    receive_download_url = local_file_2_url(local_path_in, remote_url)
    if not receive_download_url:
        # The upload failed; do not attempt a download with an empty URL.
        print(f"format_change d2p:no download url returned for: {local_path_in}")
        return ""
    # Target name without extension; download_file appends the extension.
    local_filename = os.path.join(folder, filename)
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def get_pdf_page_count(file_path):
    """
    Return the number of pages in a PDF file.

    Paths ending in .doc or .docx (case-insensitive) are first converted
    with docx2pdf and the resulting PDF is counted. Any error is printed
    and reported as a page count of 0.
    """
    try:
        is_word_file = file_path.lower().endswith(('.doc', '.docx'))
        target_pdf = docx2pdf(file_path) if is_word_file else file_path
        return len(PdfReader(target_pdf).pages)
    except Exception as exc:
        print(f"读取 PDF 页码时出错:{exc}")
        return 0
if __name__ == '__main__':
    # Manual smoke test: convert one local PDF to DOCX and print the result path.
    # Replace the path below with a file that exists on your machine.
    #
    # Other sample inputs used during development:
    #   "C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
    #   "C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
    #   "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
    #   r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
    #   r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
    local_path_in = r"C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\tmp\invalid_added.pdf"

    # Alternative flows that were exercised from here:
    #   - scanned PDF round trip: pdf2docx(local_path_in), then
    #     docx2pdf(intermediate_docx, force=True) on the result
    #   - DOCX to PDF: docx2pdf(local_path_in)
    #   - direct download: download_file(test_url, local_file_name), then
    #     unpack (downloaded_filepath, file_type); file_type == 4 means the
    #     download failed or the type is unsupported
    converted_path = pdf2docx(local_path_in)
    print(converted_path)