"""Download/upload helpers and remote document-format conversion wrappers."""
import json
import mimetypes
import os
import sys

import requests

from flask_app.general.clean_pdf import is_scanned_pdf

# Remote conversion endpoints: a file is uploaded and the JSON reply carries
# the download URL of the converted file in its "data" field.
_PDF_TO_DOCX_URL = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
_DOC_TO_DOCX_URL = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
_DOCX_TO_PDF_URL = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'

# Explicit mapping for the media types this module cares about, so the result
# does not depend on the platform's mimetypes database.
_KNOWN_CONTENT_TYPES = {
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
    'application/pdf': '.pdf',
    'application/msword': '.doc',
}

# File-type codes returned by download_file; anything else maps to 4.
_EXTENSION_CODES = {
    '.docx': 1,
    '.pdf': 2,
    '.doc': 3,
}


def _extension_from_content_type(content_type):
    """Return the file extension for a Content-Type header value.

    Parameters after ';' (for example a charset) are dropped before the
    lookup. Unknown or empty types fall back to '.docx'.
    """
    media_type = content_type.split(';', 1)[0].strip().lower()
    known = _KNOWN_CONTENT_TYPES.get(media_type)
    if known:
        return known
    return mimetypes.guess_extension(media_type, strict=False) or '.docx'


def _replace_scanned_pdf(pdf_path):
    """Round-trip a scanned PDF through DOCX and overwrite it with the result."""
    print(f"PDF is scanned. Converting to normal PDF: {pdf_path}")
    # Scanned PDF -> DOCX -> regular PDF.
    intermediate_docx = pdf2docx(pdf_path)
    if intermediate_docx:
        normal_pdf = docx2pdf(intermediate_docx)
        if normal_pdf:
            # Replace the original PDF file.
            os.replace(normal_pdf, pdf_path)
            print(f"Replaced scanned PDF with normal PDF: {pdf_path}")


def download_file(url, local_filename, check_scanned=True):
    """Download a file and save it locally with an extension derived from Content-Type.

    Args:
        url (str): URL of the file.
        local_filename (str): Local path to save to, without extension. If it
            has an extension different from the detected one, it is replaced.
        check_scanned (bool): When True (default) a downloaded PDF is tested
            with is_scanned_pdf and, if scanned, converted PDF -> DOCX -> PDF
            in place. The conversion helpers pass False so that their own
            downloads cannot trigger another conversion round.

    Returns:
        tuple: (full filename, file type code) where the code is
            1 for .docx, 2 for .pdf, 3 for .doc and 4 for anything else.
        None: if the download (or the scanned-PDF conversion) raised.
    """
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()  # Raise on a non-success status.

            # Derive the extension from the Content-Type header.
            content_type = response.headers.get('Content-Type', '')
            extension = _extension_from_content_type(content_type)

            # Split off any existing extension and swap it if it differs.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename

            # Stream the body to disk.
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Skip empty chunks.
                        file.write(chunk)

        file_code = _EXTENSION_CODES.get(extension.lower(), 4)

        # For PDFs, check whether the document is scanned and normalise it.
        if check_scanned and extension.lower() == '.pdf':
            print(f"Checking if the PDF is scanned: {full_filename}")
            if is_scanned_pdf(full_filename):
                _replace_scanned_pdf(full_filename)
        return full_filename, file_code
    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        print(f"download: 发生错误: {e}")
    return None


def upload_file(file_path, url):
    """Upload a file as multipart/form-data and return the URL from the reply.

    Args:
        file_path (str): Path of the local file to upload.
        url (str): Endpoint that receives the POST request.

    Returns:
        str: The value of the "data" field of the JSON reply, or "" when the
        status code is not 200 or the reply cannot be parsed.
    """
    receive_file_url = ""
    # os.path.basename handles the platform's separators (the previous
    # split('/') left Windows paths untouched).
    filename = os.path.basename(file_path)

    # Read the file in binary mode and send it as multipart/form-data.
    with open(file_path, 'rb') as f:
        files = {'file': (filename, f)}
        response = requests.post(url, files=files)

    if response.status_code == 200:
        print("format_change 文件上传成功")
        try:
            receive_file_response = response.content.decode('utf-8')
            receive_file_json = json.loads(receive_file_response)
            receive_file_url = receive_file_json["data"] or ""
        except (ValueError, KeyError, TypeError) as e:
            # Malformed or unexpected reply: report it and return "".
            print(f"format_change failed to parse upload response: {e}")
    else:
        print(f"format_change 文件上传失败,状态码: {response.status_code}")
        print(f"format_change {response.text}")

    return receive_file_url


def get_filename_and_folder(file_path):
    """Return (file name without extension, containing directory) for a path.

    Dots inside the name are replaced with underscores so that a later
    os.path.splitext on the extension-less name does not cut it short.
    """
    original_filename = os.path.basename(file_path)
    filename_without_ext, ext = os.path.splitext(original_filename)
    safe_filename = filename_without_ext.replace('.', '_')
    directory = os.path.dirname(file_path)
    return safe_filename, directory


def _fetch_converted(local_path_in, receive_download_url, tag):
    """Download a converted file next to the input file.

    Returns the downloaded path, or "" when there is no download URL or the
    download fails.
    """
    if not receive_download_url:
        print(f"format_change {tag}: no download URL returned for {local_path_in}")
        return ""

    # Input and output live in the same folder; the extension is added by
    # download_file.
    filename, folder = get_filename_and_folder(local_path_in)
    local_filename = os.path.join(folder, filename)

    # check_scanned=False: a converted file must not start another
    # scanned-PDF conversion, which could recurse without end.
    result = download_file(receive_download_url, local_filename, check_scanned=False)
    if result is None:
        print(f"format_change {tag}: failed to download converted file")
        return ""

    downloaded_filepath, _file_type = result
    print(f"format_change {tag}:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath


# NOTE: earlier, commented-out variants of docx2pdf/doc2docx posted the file to
# http://120.26.236.97:5008/convert_to_pdf and
# http://120.26.236.97:5008/convert_to_docx and wrote the response body next to
# the input file. They were removed as dead code; see version control history.


# Takes the path of the file to convert; the output extension is filled in
# automatically.
def pdf2docx(local_path_in):
    """Convert a PDF to DOCX via the remote service.

    Returns the path of the .docx file, or "" for an empty input path or a
    failed conversion. If a same-named .docx already exists in the input's
    folder, it is returned without converting again.
    """
    if not local_path_in:
        return ""

    filename, folder = get_filename_and_folder(local_path_in)

    # Skip the conversion when the .docx is already there.
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    # Upload for conversion; the reply carries the download link.
    receive_download_url = upload_file(local_path_in, _PDF_TO_DOCX_URL)
    return _fetch_converted(local_path_in, receive_download_url, "p2d")


def doc2docx(local_path_in):
    """Convert a DOC to DOCX via the remote service.

    Returns the downloaded path, or "" for an empty input path or a failed
    conversion.
    """
    if not local_path_in:
        return ""
    receive_download_url = upload_file(local_path_in, _DOC_TO_DOCX_URL)
    print(receive_download_url)
    return _fetch_converted(local_path_in, receive_download_url, "d2d")


def docx2pdf(local_path_in):
    """Convert a DOCX to PDF via the remote service.

    Returns the downloaded path, or "" for an empty input path or a failed
    conversion.
    """
    if not local_path_in:
        return ""
    receive_download_url = upload_file(local_path_in, _DOCX_TO_PDF_URL)
    return _fetch_converted(local_path_in, receive_download_url, "d2p")


if __name__ == '__main__':
    # Pass the input path as the first argument, or fall back to the sample
    # path below.
    if len(sys.argv) > 1:
        local_path_in = sys.argv[1]
    else:
        local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
    downloaded_file = pdf2docx(local_path_in)
    print(downloaded_file)