zbparse/flask_app/general/format_change.py

232 lines
9.6 KiB
Python
Raw Normal View History

2024-08-29 16:37:09 +08:00
import json
import os
2024-12-05 15:01:37 +08:00
import mimetypes
2024-08-29 16:37:09 +08:00
import requests
2024-12-19 14:39:35 +08:00
from PyPDF2 import PdfReader
from flask_app.general.读取文件.clean_pdf import is_scanned_pdf
2024-12-05 15:01:37 +08:00
def _extension_from_content_type(content_type):
    """Map a Content-Type header value to a file extension ('.docx' if unknown)."""
    # Headers often carry parameters ("application/pdf; charset=utf-8").
    # mimetypes only matches the bare media type, so strip them first;
    # otherwise every such response would fall through to '.docx'.
    media_type = content_type.split(';', 1)[0].strip()
    return mimetypes.guess_extension(media_type, strict=False) or '.docx'


def _replace_scanned_pdf(pdf_path):
    """Replace a scanned PDF in place with a text PDF (PDF -> DOCX -> PDF)."""
    print(f"Checking if the PDF is scanned: {pdf_path}")
    if not is_scanned_pdf(pdf_path):
        return
    print(f"PDF is scanned. Converting to normal PDF: {pdf_path}")
    intermediate_docx = pdf2docx(pdf_path)
    if not intermediate_docx:
        return
    # NOTE(review): docx2pdf itself downloads through download_file, so the
    # converted PDF is scan-checked again; if it is still detected as scanned
    # the conversion is re-entered. Confirm the service always returns a
    # text-based PDF.
    normal_pdf = docx2pdf(intermediate_docx, force=True)
    if normal_pdf:
        # Overwrite the original scanned PDF with the converted one.
        os.replace(normal_pdf, pdf_path)
        print(f"Replaced scanned PDF with normal PDF: {pdf_path}")


def download_file(url, local_filename, timeout=60):
    """
    Download a file and save it locally, choosing the extension from Content-Type.

    Parameters:
    - url (str): URL of the file to download.
    - local_filename (str): Local target path; its extension is replaced when it
      differs from the one derived from the response's Content-Type.
    - timeout (float | tuple): Timeout in seconds passed to requests.get, so a
      stalled server cannot block the caller forever.

    Returns:
    - tuple: (full file name, file type code) on success, where the code is
        1 - .docx
        2 - .pdf
        3 - .doc
        4 - anything else
    - (None, 4) when the download or a follow-up conversion fails.

    A downloaded PDF that is_scanned_pdf reports as scanned is converted to
    DOCX and back to PDF, and the result replaces the downloaded file.
    """
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # turn 4xx/5xx answers into exceptions
            extension = _extension_from_content_type(
                response.headers.get('Content-Type', '')
            )
            # Swap the extension only when the caller's one does not match.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        file.write(chunk)

        extension_mapping = {
            '.docx': 1,
            '.pdf': 2,
            '.doc': 3
        }
        file_code = extension_mapping.get(extension.lower(), 4)
        # The connection is already released here; the scanned-PDF repair may
        # issue further requests of its own.
        if extension.lower() == '.pdf':
            _replace_scanned_pdf(full_filename)
        return full_filename, file_code
    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        print(f"download: 发生错误: {e}")
    return None, 4
2024-08-29 16:37:09 +08:00
def local_file_2_url(file_path, url):
    """
    Upload a local file as multipart/form-data and return the URL from the reply.

    Parameters:
    - file_path (str): Path of the file to upload.
    - url (str): Endpoint that receives the POST request.

    Returns:
    - The value of the "data" field of the JSON response on HTTP 200.
    - "" when the status code is not 200 or the response body is not the
      expected JSON object.

    Network errors raised by requests.post and errors opening the file are
    not caught here and propagate to the caller.
    """
    # os.path.basename understands the platform's separators; splitting on '/'
    # alone kept the whole path as the name for backslash-separated paths.
    filename = os.path.basename(file_path)
    with open(file_path, 'rb') as f:
        files = {'file': (filename, f)}
        response = requests.post(url, files=files)

    if response.status_code != 200:
        print(f"format_change 文件上传失败,状态码: {response.status_code}")
        print(f"format_change {response.text}")
        return ""

    print("format_change 文件上传成功")
    try:
        receive_file_json = json.loads(response.content.decode('utf-8'))
        return receive_file_json["data"]
    except (ValueError, KeyError, TypeError) as e:
        # A 200 answer without a usable JSON body is reported like any other
        # failed upload instead of crashing the caller.
        print(f"format_change unexpected upload response: {e!r}")
        return ""
2024-11-01 14:28:10 +08:00
2024-08-29 16:37:09 +08:00
def get_filename_and_folder(file_path):
    """
    Split a path into a sanitized base name and its directory.

    The extension is dropped and any remaining dots in the base name are
    replaced by underscores.

    Returns:
    - tuple: (sanitized base name without extension, directory part of the path)
    """
    directory, original_filename = os.path.split(file_path)
    stem = os.path.splitext(original_filename)[0]
    # Dots left inside the stem would be mistaken for an extension later on.
    return "_".join(stem.split(".")), directory
2024-08-29 16:37:09 +08:00
def pdf2docx(local_path_in):
    """
    Convert a PDF file to DOCX through the remote p2d conversion endpoint.

    The result is stored next to the input file under the input's sanitized
    base name; download_file picks the extension. An already existing .docx
    with that name is reused without contacting the service.

    Parameters:
    - local_path_in (str): Path of the PDF file to convert.

    Returns:
    - str: Path of the converted (or already existing) file.
    - "" when the input path is empty or the upload/download fails.
    """
    if not local_path_in:
        return ""
    filename, folder = get_filename_and_folder(local_path_in)
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
    receive_download_url = local_file_2_url(local_path_in, remote_url)
    if not receive_download_url:
        # Without a download link there is nothing to fetch.
        print("format_change p2d: upload failed, no download URL returned")
        return ""

    local_filename = os.path.join(folder, filename)  # output path without extension
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    if not downloaded_filepath:
        print("format_change p2d: failed to download the converted file")
        return ""
    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
2024-11-01 14:28:10 +08:00
def doc2docx(local_path_in):
    """
    Convert a DOC file to DOCX through the remote d2d conversion endpoint.

    The result is stored next to the input file under the input's sanitized
    base name; download_file picks the extension. An already existing .docx
    with that name is reused without contacting the service.

    Parameters:
    - local_path_in (str): Path of the DOC file to convert.

    Returns:
    - str: Path of the converted (or already existing) file.
    - "" when the input path is empty or the upload/download fails.
    """
    if not local_path_in:
        return ""
    filename, folder = get_filename_and_folder(local_path_in)
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
    receive_download_url = local_file_2_url(local_path_in, remote_url)
    if not receive_download_url:
        # Without a download link there is nothing to fetch.
        print("format_change d2d: upload failed, no download URL returned")
        return ""

    local_filename = os.path.join(folder, filename)  # output path without extension
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    if not downloaded_filepath:
        print("format_change d2d: failed to download the converted file")
        return ""
    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
2024-12-23 15:47:41 +08:00
def docx2pdf(local_path_in, force=False):
    """
    Convert a DOCX file to PDF through the remote d2p conversion endpoint.

    The result is stored next to the input file under the input's sanitized
    base name; download_file picks the extension.

    Parameters:
    - local_path_in (str): Path of the DOCX file to convert.
    - force (bool): Convert again even if a PDF with the same base name already
      exists. Defaults to False, in which case the existing PDF is returned.

    Returns:
    - str: Path of the converted (or already existing) PDF file.
    - "" when the input path is empty or the upload/download fails.
    """
    if not local_path_in:
        return ""
    filename, folder = get_filename_and_folder(local_path_in)
    pdf_file_path = os.path.join(folder, f"{filename}.pdf")
    if os.path.exists(pdf_file_path):
        if not force:
            print(f"跳过转换,文件已存在: {pdf_file_path}")
            return pdf_file_path
        print(f"强制转换,覆盖已存在的文件: {pdf_file_path}")

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    receive_download_url = local_file_2_url(local_path_in, remote_url)
    if not receive_download_url:
        # Without a download link there is nothing to fetch.
        print("format_change d2p: upload failed, no download URL returned")
        return ""

    local_filename = os.path.join(folder, filename)  # output path without extension
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    if not downloaded_filepath:
        print("format_change d2p: failed to download the converted file")
        return ""
    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
2024-10-24 20:51:48 +08:00
2024-12-19 14:39:35 +08:00
def get_pdf_page_count(file_path):
    """
    Return the number of pages of a PDF file.

    Paths ending in .doc or .docx (case-insensitive) are first converted with
    docx2pdf and the resulting PDF is counted. Any error is printed and
    reported as a page count of 0.
    """
    word_suffixes = ('.doc', '.docx')
    try:
        needs_conversion = file_path.lower().endswith(word_suffixes)
        target_pdf = docx2pdf(file_path) if needs_conversion else file_path
        page_total = len(PdfReader(target_pdf).pages)
    except Exception as e:
        print(f"读取 PDF 页码时出错:{e}")
        return 0
    return page_total
2024-08-29 16:37:09 +08:00
if __name__ == '__main__':
    # Manual smoke test: point sample_path at a local file, then run the
    # module to push it through the DOCX->PDF and PDF->DOCX converters.
    sample_path = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\有问题的.doc"
    for convert in (docx2pdf, pdf2docx):
        print(convert(sample_path))
2024-08-29 16:37:09 +08:00