zbparse/flask_app/general/format_change.py

327 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import mimetypes
import requests
from PyPDF2 import PdfReader
from flask_app.general.clean_pdf import is_scanned_pdf
def download_file(url, local_filename):
    """
    Download a file and save it locally, choosing the file extension from the
    response's Content-Type header.

    Args:
        url (str): URL of the file to download.
        local_filename (str): Local path to save to. If its extension differs
            from the one derived from Content-Type, the extension is replaced.

    Returns:
        tuple | None: ``(full_filename, file_code)`` on success, where
        ``file_code`` is 1 for .docx, 2 for .pdf, 3 for .doc and 4 for any
        other extension. ``None`` when the download fails (the error is
        printed, not raised).
    """
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()  # raise on 4xx/5xx responses

            # Content-Type may carry parameters ("application/pdf; charset=utf-8").
            # mimetypes only recognises the bare media type, so strip them first;
            # otherwise every such response would fall back to '.docx'.
            content_type = response.headers.get('Content-Type', '')
            mime_type = content_type.split(';', 1)[0].strip()
            extension = mimetypes.guess_extension(mime_type, strict=False) or '.docx'

            # Swap the extension of the requested name if it does not match.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename

            # Stream the body to disk in chunks.
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)

        # Map the extension to the numeric type code callers expect.
        extension_mapping = {
            '.docx': 1,
            '.pdf': 2,
            '.doc': 3
        }
        file_code = extension_mapping.get(extension.lower(), 4)

        # For PDFs that is_scanned_pdf reports as scanned, round-trip through
        # the remote converters (PDF -> DOCX -> PDF) and overwrite the download.
        if extension.lower() == '.pdf':
            print(f"Checking if the PDF is scanned: {full_filename}")
            if is_scanned_pdf(full_filename):
                print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
                intermediate_docx = pdf2docx(full_filename)
                if intermediate_docx:
                    normal_pdf = docx2pdf(intermediate_docx, force=True)
                    if normal_pdf:
                        # Replace the original scanned PDF with the converted one.
                        os.replace(normal_pdf, full_filename)
                        print(f"Replaced scanned PDF with normal PDF: {full_filename}")
        return full_filename, file_code
    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        print(f"download: 发生错误: {e}")
    return None
def upload_file(file_path, url):
    """
    Upload a local file to ``url`` as multipart/form-data and return the URL
    contained in the service's JSON response.

    Args:
        file_path (str): Path of the local file to upload.
        url (str): Endpoint that receives the POST request.

    Returns:
        str: The value of the ``"data"`` field of the JSON response when the
        server answers with HTTP 200; ``""`` when the status code is not 200
        or the response body cannot be parsed.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        requests.RequestException: If the POST request itself fails.
    """
    # os.path.basename honours the platform's separators; splitting on "/"
    # alone would send the whole path as the filename for Windows paths.
    filename = os.path.basename(file_path)

    # Send the file in binary mode as a multipart/form-data upload.
    with open(file_path, 'rb') as f:
        files = {'file': (filename, f)}
        response = requests.post(url, files=files)

    if response.status_code != 200:
        print(f"format_change 文件上传失败,状态码: {response.status_code}")
        print(f"format_change {response.text}")
        return ""

    print("format_change 文件上传成功")
    try:
        receive_file_json = json.loads(response.content.decode('utf-8'))
        return receive_file_json["data"]
    except (ValueError, KeyError, TypeError) as e:
        # A 200 response with a malformed body is reported like any other
        # upload failure instead of raising into the caller.
        print(f"format_change failed to parse upload response: {e}")
        return ""
def get_filename_and_folder(file_path):
    """
    Split a path into a dot-free base name and its containing directory.

    Args:
        file_path (str): Path to a file.

    Returns:
        tuple: ``(safe_filename, directory)`` where ``safe_filename`` is the
        file name without its extension and with every remaining "." replaced
        by "_", and ``directory`` is the directory part of ``file_path``.
    """
    # os.path.split yields (directory, file name) in a single call.
    folder, name_with_ext = os.path.split(file_path)
    stem = os.path.splitext(name_with_ext)[0]
    # Dots left in the stem are replaced so later splitting on "." is safe.
    return stem.replace('.', '_'), folder
def pdf2docx(local_path_in):
    """
    Convert a PDF to DOCX through the remote conversion service.

    The file is uploaded to the p2d endpoint and the converted result is
    downloaded into the same folder as the input, using the input's sanitized
    base name; the extension is filled in by ``download_file``.

    Args:
        local_path_in (str): Path of the PDF file to convert.

    Returns:
        str: Path of the converted file, the path of an already existing
        same-named .docx (conversion is skipped), or ``""`` when the input
        is empty or the upload/download fails.
    """
    if not local_path_in:
        return ""
    # Base name (without extension) and folder of the input file.
    filename, folder = get_filename_and_folder(local_path_in)
    # Skip the conversion if a same-named .docx already exists.
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
    # The service responds with a download link for the converted file.
    receive_download_url = upload_file(local_path_in, remote_url)
    if not receive_download_url:
        # upload_file returns "" on failure; there is nothing to download.
        print("format_change p2d: upload failed, no download url returned")
        return ""

    local_filename = os.path.join(folder, filename)  # output path without extension
    downloaded = download_file(receive_download_url, local_filename)
    if not downloaded:
        # download_file returns None on failure; unpacking it would raise.
        return ""
    downloaded_filepath, _file_type = downloaded
    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def doc2docx(local_path_in):
    """
    Convert a DOC file to DOCX through the remote conversion service.

    The file is uploaded to the d2d endpoint and the converted result is
    downloaded into the same folder as the input, using the input's sanitized
    base name; the extension is filled in by ``download_file``.

    Args:
        local_path_in (str): Path of the DOC file to convert.

    Returns:
        str: Path of the converted file, the path of an already existing
        same-named .docx (conversion is skipped), or ``""`` when the input
        is empty or the upload/download fails.
    """
    if not local_path_in:
        return ""
    # Base name (without extension) and folder of the input file.
    filename, folder = get_filename_and_folder(local_path_in)
    # Skip the conversion if a same-named .docx already exists.
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
    receive_download_url = upload_file(local_path_in, remote_url)
    print(receive_download_url)
    if not receive_download_url:
        # upload_file returns "" on failure; there is nothing to download.
        return ""

    # Input and output share the same folder and base name.
    local_filename = os.path.join(folder, filename)
    downloaded = download_file(receive_download_url, local_filename)
    if not downloaded:
        # download_file returns None on failure; unpacking it would raise.
        return ""
    downloaded_filepath, _file_type = downloaded
    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def docx2pdf(local_path_in, force=False):
    """
    Convert a DOCX file to PDF through the remote conversion service.

    Args:
        local_path_in (str): Path of the input DOCX file.
        force (bool): When True, convert even if a same-named PDF already
            exists in the input's folder. Defaults to False.

    Returns:
        str: Path of the converted file, the path of the already existing PDF
        when the conversion is skipped, or ``""`` when the input is empty or
        the upload/download fails.
    """
    if not local_path_in:
        return ""
    # Base name (without extension) and folder of the input file.
    filename, folder = get_filename_and_folder(local_path_in)
    # Check whether a same-named .pdf already exists.
    pdf_file_path = os.path.join(folder, f"{filename}.pdf")
    if os.path.exists(pdf_file_path):
        if force:
            print(f"强制转换,覆盖已存在的文件: {pdf_file_path}")
        else:
            print(f"跳过转换,文件已存在: {pdf_file_path}")
            return pdf_file_path  # conversion skipped

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    receive_download_url = upload_file(local_path_in, remote_url)
    if not receive_download_url:
        # upload_file returns "" on failure; there is nothing to download.
        print("format_change d2p: upload failed, no download url returned")
        return ""

    # Input and output share the same folder and base name.
    local_filename = os.path.join(folder, filename)
    downloaded = download_file(receive_download_url, local_filename)
    if not downloaded:
        # download_file returns None on failure; unpacking it would raise.
        return ""
    downloaded_filepath, _file_type = downloaded
    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def get_pdf_page_count(file_path):
    """
    Return the number of pages of a PDF file.

    Paths ending in .doc or .docx (case-insensitive) are first passed to
    ``docx2pdf`` and the resulting file is counted instead.

    Args:
        file_path (str): Path to a .pdf, .doc or .docx file.

    Returns:
        int: The page count, or 0 if anything goes wrong (the error is printed).
    """
    try:
        is_word_document = file_path.lower().endswith(('.doc', '.docx'))
        target_pdf = docx2pdf(file_path) if is_word_document else file_path
        return len(PdfReader(target_pdf).pages)
    except Exception as e:
        print(f"读取 PDF 页码时出错:{e}")
        return 0
# def docx2pdf(file_path):
# """
# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
#
# 参数:
# - file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。
# """
# # 检查文件是否存在
# if not file_path:
# return ""
# # 获取文件名和扩展名
# base_name = os.path.basename(file_path)
# name, ext = os.path.splitext(base_name)
# ext = ext.lower().lstrip('.')
#
# if ext not in ['docx', 'doc']:
# raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")
#
# # 定义转换接口
# endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
# # endpoint = 'http://192.168.0.2:5008/convert_to_pdf'
#
# # 获取文件所在目录
# output_dir = os.path.dirname(file_path)
#
# # 准备上传的文件
# with open(file_path, 'rb') as f:
# files = {'file': (base_name, f)}
# try:
# print(f"正在将 {base_name} 转换为 .pdf 格式...")
# response = requests.post(endpoint, files=files)
# response.raise_for_status() # 检查请求是否成功
# except requests.RequestException as e:
# print(f"转换过程中发生错误: {e}")
# return
#
# # 准备保存转换后文件的路径
# output_file_name = f"{name}.pdf"
# output_path = os.path.join(output_dir, output_file_name)
#
# # 保存转换后的文件
# with open(output_path, 'wb') as out_file:
# out_file.write(response.content)
#
# print(f"文件已成功转换并保存至: {output_path}")
# return output_path
#
#
# def doc2docx(file_path):
# """
# 将本地的 .doc 文件转换为 .docx 文件。
#
# 参数:
# - file_path: str, 本地文件的路径,支持 .doc 格式。
# """
# # 检查文件是否存在
# if not file_path:
# return ""
# # 获取文件名和扩展名
# base_name = os.path.basename(file_path)
# name, ext = os.path.splitext(base_name)
# ext = ext.lower().lstrip('.')
#
# if ext != 'doc':
# raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")
#
# # 定义转换接口
# endpoint = 'http://120.26.236.97:5008/convert_to_docx'
#
# # 获取文件所在目录
# output_dir = os.path.dirname(file_path)
#
# # 准备上传的文件
# with open(file_path, 'rb') as f:
# files = {'file': (base_name, f)}
# try:
# print(f"正在将 {base_name} 转换为 .docx 格式...")
# response = requests.post(endpoint, files=files)
# response.raise_for_status() # 检查请求是否成功
# except requests.RequestException as e:
# print(f"转换过程中发生错误: {e}")
# return
#
# # 准备保存转换后文件的路径
# output_file_name = f"{name}.docx"
# output_path = os.path.join(output_dir, output_file_name)
#
# # 保存转换后的文件
# with open(output_path, 'wb') as out_file:
# out_file.write(response.content)
#
# print(f"文件已成功转换并保存至: {output_path}")
# return output_path
if __name__ == '__main__':
    # Manual smoke test: replace the file paths / URL below with your own.
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
    # local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
    # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
    # local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
    # local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
    # downloaded_file=pdf2docx(local_path_in)
    # # downloaded_file=pdf2docx(local_path_in)
    # # downloaded_file=docx2pdf(local_path_in)
    # print(downloaded_file)
    test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/2022-%E5%B9%BF%E4%B8%9C-%E9%B9%8F%E5%8D%8E%E5%9F%BA%E9%87%91%E7%AE%A1%E7%90%86%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8%E6%B7%B1%E5%9C%B3%E6%B7%B1%E4%B8%9A%E4%B8%8A%E5%9F%8E%E5%8A%9E%E5%85%AC%E5%AE%A4%E8%A3%85%E4%BF%AE%E9%A1%B9%E7%9B%AE.pdf?Expires=1734952142&OSSAccessKeyId=TMP.3KiE75LGW8c68AXJaPpYRFnZXWrLq6zszWkdUCghFWLphdM9YvAMwoCNofeTSYLTBAU3TebtNuwubFH7s3qgTFhCs7q98b&Signature=NnSiQaqhznJ33Q6DhfsxATUa1ls%3D"
    local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'
    downloaded = download_file(test_url, local_file_name)
    if not downloaded:
        # download_file returns None on failure; do not try to unpack it.
        print("下载文件失败或不支持的文件类型")
    else:
        downloaded_filepath, file_type = downloaded
        print(downloaded_filepath)
        print(file_type)
        # File type code 4 means an extension other than .docx/.pdf/.doc.
        if file_type == 4:
            print("error")