2024-08-29 16:37:09 +08:00
|
|
|
|
import json
|
|
|
|
|
import os
|
2024-12-05 15:01:37 +08:00
|
|
|
|
import mimetypes
|
2024-08-29 16:37:09 +08:00
|
|
|
|
import requests
|
2024-09-11 12:02:09 +08:00
|
|
|
|
|
2024-12-05 15:01:37 +08:00
|
|
|
|
from flask_app.general.clean_pdf import is_scanned_pdf
|
|
|
|
|
def download_file(url, local_filename):
    """
    Download a file and save it locally, picking the file extension from the
    response's Content-Type header.

    Args:
        url (str): URL of the file to download.
        local_filename (str): Local path to save to. An extension it already
            carries is replaced when it differs from the detected one.

    Returns:
        tuple | None: ``(full_filename, file_code)`` on success, where
        ``file_code`` is 1 for .docx, 2 for .pdf, 3 for .doc and 4 for any
        other extension. ``None`` when the download or the post-processing
        raises.

    Note:
        When the downloaded file is a PDF that ``is_scanned_pdf`` reports as
        scanned, it is round-tripped through ``pdf2docx`` and ``docx2pdf``
        and the result replaces the downloaded file. Both converters call
        ``download_file`` again for their own results.
    """
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()  # raise on 4xx/5xx instead of saving an error page

            # The header may carry parameters ("application/pdf; charset=utf-8");
            # mimetypes only recognises the bare media type, so strip them.
            # Unknown or missing types fall back to .docx.
            content_type = response.headers.get('Content-Type', '')
            media_type = content_type.split(';', 1)[0].strip()
            extension = mimetypes.guess_extension(media_type, strict=False) or '.docx'

            # Swap the extension of local_filename when it does not match.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename

            # Stream the body to disk in chunks.
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        file.write(chunk)

        # Map the extension to the numeric code documented above.
        extension_mapping = {
            '.docx': 1,
            '.pdf': 2,
            '.doc': 3
        }
        file_code = extension_mapping.get(extension.lower(), 4)

        # For PDFs, check whether the file is a scanned document.
        if extension.lower() == '.pdf':
            print(f"Checking if the PDF is scanned: {full_filename}")
            if is_scanned_pdf(full_filename):
                print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
                # Scanned PDF -> DOCX -> regular PDF.
                intermediate_docx = pdf2docx(full_filename)
                if intermediate_docx:
                    normal_pdf = docx2pdf(intermediate_docx)
                    if normal_pdf:
                        # Overwrite the originally downloaded PDF.
                        os.replace(normal_pdf, full_filename)
                        print(f"Replaced scanned PDF with normal PDF: {full_filename}")

        return full_filename, file_code

    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        print(f"download: 发生错误: {e}")

    return None
|
2024-08-29 16:37:09 +08:00
|
|
|
|
|
|
|
|
|
def upload_file(file_path, url):
    """
    Upload a local file to ``url`` as multipart/form-data and return the
    ``data`` field of the JSON response.

    Args:
        file_path (str): Path of the local file to upload.
        url (str): Endpoint that receives the POST request.

    Returns:
        The value of ``"data"`` in the JSON response (the converters in this
        file use it as a download URL), or ``""`` when the request fails, the
        status code is not 200, or the response body cannot be parsed.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    receive_file_url = ""

    # os.path.basename understands the platform's separators, so Windows
    # paths with backslashes yield just the file name rather than the whole
    # path (which is what splitting on '/' produced).
    filename = os.path.basename(file_path)

    try:
        # The file must stay open while requests streams it.
        with open(file_path, 'rb') as f:
            files = {'file': (filename, f)}
            response = requests.post(url, files=files)
    except requests.RequestException as e:
        # Network-level failure: report it the same way as a bad status code.
        print(f"format_change upload request failed: {e}")
        return receive_file_url

    if response.status_code == 200:
        print("format_change 文件上传成功")
        try:
            receive_file_response = response.content.decode('utf-8')
            receive_file_json = json.loads(receive_file_response)
            receive_file_url = receive_file_json["data"]
        except (ValueError, KeyError, TypeError) as e:
            # ValueError covers both invalid UTF-8 and invalid JSON;
            # KeyError/TypeError cover a body without a "data" entry.
            print(f"format_change unexpected upload response: {e}")
            receive_file_url = ""
    else:
        print(f"format_change 文件上传失败,状态码: {response.status_code}")
        print(f"format_change {response.text}")

    return receive_file_url
|
|
|
|
|
|
2024-11-01 14:28:10 +08:00
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
def get_filename_and_folder(file_path):
    """
    Split a path into a dot-free base name and its containing directory.

    Args:
        file_path (str): Path to a file.

    Returns:
        tuple: ``(safe_filename, directory)`` where ``safe_filename`` is the
        file name without its extension and with every remaining ``.``
        replaced by ``_``, and ``directory`` is the directory part of the path.
    """
    # os.path.split returns the same (dirname, basename) pair in one call.
    directory, original_filename = os.path.split(file_path)

    # Drop the extension, then neutralise any dots left in the stem so a
    # later os.path.splitext on the name does not cut it short.
    stem = os.path.splitext(original_filename)[0]
    return stem.replace('.', '_'), directory
|
2024-08-29 16:37:09 +08:00
|
|
|
|
|
|
|
|
|
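# Each converter below takes the path of the file to convert; the output name is derived from it and the extension is filled in automatically.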
|
|
|
|
|
def pdf2docx(local_path_in):
    """
    Convert a local PDF to DOCX through the remote ``p2d`` endpoint.

    The result is stored next to the input file, named after the input with
    dots in the base name replaced by underscores. When that .docx already
    exists the conversion is skipped and the existing path is returned.

    Args:
        local_path_in (str): Path of the PDF to convert.

    Returns:
        str: Path of the DOCX file, or ``""`` when the input is empty or the
        upload/download fails.
    """
    if not local_path_in:
        return ""

    # Base name (without extension) and directory of the input file.
    filename, folder = get_filename_and_folder(local_path_in)

    # Reuse an existing .docx with the same name instead of converting again.
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
    # upload_file returns the download link of the converted file, or "" on failure.
    receive_download_url = upload_file(local_path_in, remote_url)
    if not receive_download_url:
        print("format_change p2d: upload failed, no download link returned")
        return ""

    # Output path without extension; download_file appends the extension.
    local_filename = os.path.join(folder, filename)

    # download_file returns None on failure, so check before unpacking.
    downloaded = download_file(receive_download_url, local_filename)
    if not downloaded:
        print("format_change p2d: failed to download converted file")
        return ""

    downloaded_filepath, _file_type = downloaded
    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
|
|
|
|
|
|
2024-11-01 14:28:10 +08:00
|
|
|
|
def doc2docx(local_path_in):
    """
    Convert a local .doc file to .docx through the remote ``d2d`` endpoint.

    The result is stored in the same directory as the input.

    Args:
        local_path_in (str): Path of the file to convert.

    Returns:
        str: Path of the downloaded file, or ``""`` when the input is empty
        or the upload/download fails.
    """
    if not local_path_in:
        return ""

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
    # upload_file returns the download link of the converted file, or "" on failure.
    receive_download_url = upload_file(local_path_in, remote_url)
    print(receive_download_url)
    if not receive_download_url:
        print("format_change d2d: upload failed, no download link returned")
        return ""

    # Input and output live in the same folder; download_file appends the extension.
    filename, folder = get_filename_and_folder(local_path_in)
    local_filename = os.path.join(folder, filename)

    # download_file returns None on failure, so check before unpacking.
    downloaded = download_file(receive_download_url, local_filename)
    if not downloaded:
        print("format_change d2d: failed to download converted file")
        return ""

    downloaded_filepath, _file_type = downloaded
    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
|
|
|
|
|
def docx2pdf(local_path_in):
    """
    Convert a local .docx file to PDF through the remote ``d2p`` endpoint.

    The result is stored in the same directory as the input.

    Args:
        local_path_in (str): Path of the file to convert.

    Returns:
        str: Path of the downloaded file, or ``""`` when the input is empty
        or the upload/download fails.
    """
    if not local_path_in:
        return ""

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    # upload_file returns the download link of the converted file, or "" on failure.
    receive_download_url = upload_file(local_path_in, remote_url)
    if not receive_download_url:
        print("format_change d2p: upload failed, no download link returned")
        return ""

    # Input and output live in the same folder; download_file appends the extension.
    filename, folder = get_filename_and_folder(local_path_in)
    local_filename = os.path.join(folder, filename)

    # download_file returns None on failure, so check before unpacking.
    downloaded = download_file(receive_download_url, local_filename)
    if not downloaded:
        print("format_change d2p: failed to download converted file")
        return ""

    downloaded_filepath, _file_type = downloaded
    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
|
2024-10-24 20:51:48 +08:00
|
|
|
|
|
2024-11-01 14:28:10 +08:00
|
|
|
|
# def docx2pdf(file_path):
|
|
|
|
|
# """
|
|
|
|
|
# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
|
|
|
|
|
#
|
|
|
|
|
# 参数:
|
|
|
|
|
# - file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。
|
|
|
|
|
# """
|
|
|
|
|
# # 检查文件是否存在
|
|
|
|
|
# if not file_path:
|
|
|
|
|
# return ""
|
|
|
|
|
# # 获取文件名和扩展名
|
|
|
|
|
# base_name = os.path.basename(file_path)
|
|
|
|
|
# name, ext = os.path.splitext(base_name)
|
|
|
|
|
# ext = ext.lower().lstrip('.')
|
|
|
|
|
#
|
|
|
|
|
# if ext not in ['docx', 'doc']:
|
|
|
|
|
# raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")
|
|
|
|
|
#
|
|
|
|
|
# # 定义转换接口
|
|
|
|
|
# endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
|
|
|
|
|
# # endpoint = 'http://192.168.0.2:5008/convert_to_pdf'
|
|
|
|
|
#
|
|
|
|
|
# # 获取文件所在目录
|
|
|
|
|
# output_dir = os.path.dirname(file_path)
|
|
|
|
|
#
|
|
|
|
|
# # 准备上传的文件
|
|
|
|
|
# with open(file_path, 'rb') as f:
|
|
|
|
|
# files = {'file': (base_name, f)}
|
|
|
|
|
# try:
|
|
|
|
|
# print(f"正在将 {base_name} 转换为 .pdf 格式...")
|
|
|
|
|
# response = requests.post(endpoint, files=files)
|
|
|
|
|
# response.raise_for_status() # 检查请求是否成功
|
|
|
|
|
# except requests.RequestException as e:
|
|
|
|
|
# print(f"转换过程中发生错误: {e}")
|
|
|
|
|
# return
|
|
|
|
|
#
|
|
|
|
|
# # 准备保存转换后文件的路径
|
|
|
|
|
# output_file_name = f"{name}.pdf"
|
|
|
|
|
# output_path = os.path.join(output_dir, output_file_name)
|
|
|
|
|
#
|
|
|
|
|
# # 保存转换后的文件
|
|
|
|
|
# with open(output_path, 'wb') as out_file:
|
|
|
|
|
# out_file.write(response.content)
|
|
|
|
|
#
|
|
|
|
|
# print(f"文件已成功转换并保存至: {output_path}")
|
|
|
|
|
# return output_path
|
|
|
|
|
#
|
|
|
|
|
#
|
|
|
|
|
# def doc2docx(file_path):
|
|
|
|
|
# """
|
|
|
|
|
# 将本地的 .doc 文件转换为 .docx 文件。
|
|
|
|
|
#
|
|
|
|
|
# 参数:
|
|
|
|
|
# - file_path: str, 本地文件的路径,支持 .doc 格式。
|
|
|
|
|
# """
|
|
|
|
|
# # 检查文件是否存在
|
|
|
|
|
# if not file_path:
|
|
|
|
|
# return ""
|
|
|
|
|
# # 获取文件名和扩展名
|
|
|
|
|
# base_name = os.path.basename(file_path)
|
|
|
|
|
# name, ext = os.path.splitext(base_name)
|
|
|
|
|
# ext = ext.lower().lstrip('.')
|
|
|
|
|
#
|
|
|
|
|
# if ext != 'doc':
|
|
|
|
|
# raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")
|
|
|
|
|
#
|
|
|
|
|
# # 定义转换接口
|
|
|
|
|
# endpoint = 'http://120.26.236.97:5008/convert_to_docx'
|
|
|
|
|
#
|
|
|
|
|
# # 获取文件所在目录
|
|
|
|
|
# output_dir = os.path.dirname(file_path)
|
|
|
|
|
#
|
|
|
|
|
# # 准备上传的文件
|
|
|
|
|
# with open(file_path, 'rb') as f:
|
|
|
|
|
# files = {'file': (base_name, f)}
|
|
|
|
|
# try:
|
|
|
|
|
# print(f"正在将 {base_name} 转换为 .docx 格式...")
|
|
|
|
|
# response = requests.post(endpoint, files=files)
|
|
|
|
|
# response.raise_for_status() # 检查请求是否成功
|
|
|
|
|
# except requests.RequestException as e:
|
|
|
|
|
# print(f"转换过程中发生错误: {e}")
|
|
|
|
|
# return
|
|
|
|
|
#
|
|
|
|
|
# # 准备保存转换后文件的路径
|
|
|
|
|
# output_file_name = f"{name}.docx"
|
|
|
|
|
# output_path = os.path.join(output_dir, output_file_name)
|
|
|
|
|
#
|
|
|
|
|
# # 保存转换后的文件
|
|
|
|
|
# with open(output_path, 'wb') as out_file:
|
|
|
|
|
# out_file.write(response.content)
|
|
|
|
|
#
|
|
|
|
|
# print(f"文件已成功转换并保存至: {output_path}")
|
|
|
|
|
# return output_path
|
2024-10-23 20:33:41 +08:00
|
|
|
|
|
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test. Replace the paths and URL below with your own.
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
    # local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
    # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
    # local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
    # local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
    # downloaded_file=pdf2docx(local_path_in)
    # # downloaded_file=pdf2docx(local_path_in)
    # # downloaded_file=docx2pdf(local_path_in)
    # print(downloaded_file)

    # NOTE(review): this URL embeds a signed query string with an Expires
    # value; it presumably no longer works and needs replacing before use.
    test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%20-%20%E5%89%AF%E6%9C%AC.PDF?Expires=1733478585&OSSAccessKeyId=TMP.3KhfwZc3kpT9TUmsb46yBDdnRq8bbENcEWBbZP8nLMgmSjVkjg9edpTPUQUsH8VXtvvg839Xbm8N5paYxPKvxCGqx3Vx4m&Signature=RYOo7tMEyahaMA3cSsf2kkf8co8%3D"
    local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
    downloaded = download_file(test_url, local_file_name)
    if not downloaded:
        # download_file returned None; there is nothing to unpack.
        print("下载文件失败或不支持的文件类型")
    else:
        downloaded_filepath, file_type = downloaded
        print(downloaded_filepath)
        print(file_type)
        # Code 4 means the extension is none of .docx/.pdf/.doc.
        if file_type == 4:
            print("error")
|
2024-08-29 16:37:09 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|