import json
import os
import mimetypes

import requests
from PyPDF2 import PdfReader

from flask_app.general.读取文件.clean_pdf import is_scanned_pdf

# Endpoints of the remote format-conversion service.
_TRANSFER_BASE_URL = 'http://47.98.59.178:8000/v3/3rd/files/transfer'
_PDF_TO_DOCX_URL = _TRANSFER_BASE_URL + '/p2d'
_DOC_TO_DOCX_URL = _TRANSFER_BASE_URL + '/d2d'
_DOCX_TO_PDF_URL = _TRANSFER_BASE_URL + '/d2p'

# (connect, read) timeout in seconds. The read timeout is generous because the
# service converts the document before it answers; without any timeout a
# stalled server would block the caller forever.
_REQUEST_TIMEOUT = (10, 600)

# File-type codes returned by download_file; anything else maps to 4.
_EXTENSION_CODES = {
    '.docx': 1,
    '.pdf': 2,
    '.doc': 3,
}
_OTHER_FILE_CODE = 4


def download_file(url, local_filename, convert_scanned=True):
    """Download ``url`` to disk, picking the extension from the Content-Type.

    Args:
        url: Address of the file to download.
        local_filename: Target path. Its extension (if any) is replaced when
            it disagrees with the extension derived from the response's
            Content-Type header.
        convert_scanned: When True (the default) a downloaded PDF is checked
            with ``is_scanned_pdf`` and, if scanned, round-tripped through
            PDF -> DOCX -> PDF. The conversion helpers in this module pass
            False so that the PDF produced by that round trip is not checked
            (and converted) again, which would recurse.

    Returns:
        ``(full_filename, file_code)`` on success, where ``file_code`` is
        1 for .docx, 2 for .pdf, 3 for .doc and 4 for anything else.
        ``(None, 4)`` when the download or the post-processing fails; the
        error is printed, not raised.
    """
    try:
        with requests.get(url, stream=True, timeout=_REQUEST_TIMEOUT) as response:
            response.raise_for_status()

            # The header may carry parameters ("application/pdf; charset=UTF-8").
            # mimetypes only recognises the bare media type, so strip them
            # first; otherwise every such response falls back to '.docx'.
            content_type = response.headers.get('Content-Type', '')
            mime_type = content_type.split(';', 1)[0].strip().lower()
            extension = mimetypes.guess_extension(mime_type, strict=False) or '.docx'

            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename

            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)

        file_code = _EXTENSION_CODES.get(extension.lower(), _OTHER_FILE_CODE)

        if convert_scanned and extension.lower() == '.pdf':
            _replace_scanned_pdf(full_filename)

        return full_filename, file_code

    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        print(f"download: 发生错误: {e}")

    return None, _OTHER_FILE_CODE


def _replace_scanned_pdf(pdf_path):
    """Replace a scanned PDF in place via a PDF -> DOCX -> PDF round trip."""
    print(f"Checking if the PDF is scanned: {pdf_path}")
    if not is_scanned_pdf(pdf_path):
        return

    print(f"PDF is scanned. Converting to normal PDF: {pdf_path}")
    intermediate_docx = pdf2docx(pdf_path)
    if not intermediate_docx:
        return

    normal_pdf = docx2pdf(intermediate_docx, force=True)
    if normal_pdf:
        os.replace(normal_pdf, pdf_path)
        print(f"Replaced scanned PDF with normal PDF: {pdf_path}")


def local_file_2_url(file_path, url):
    """Upload a local file to ``url`` and return the link the service replies with.

    The file is sent as multipart/form-data under the field name ``file``.

    Args:
        file_path: Path of the file to upload.
        url: Endpoint that accepts the upload.

    Returns:
        The value of the ``data`` key of the JSON response on HTTP 200,
        otherwise an empty string.

    Raises:
        OSError: If the file cannot be opened.
        requests.RequestException: If the request itself fails or times out.
        ValueError, KeyError: If a 200 response is not JSON with a ``data`` key.
    """
    receive_file_url = ""
    # os.path.basename understands the platform's separators; splitting on '/'
    # alone left the whole path in place for Windows-style paths.
    filename = os.path.basename(file_path)

    with open(file_path, 'rb') as f:
        files = {'file': (filename, f)}
        response = requests.post(url, files=files, timeout=_REQUEST_TIMEOUT)

    if response.status_code == 200:
        print("format_change 文件上传成功")
        receive_file_response = response.content.decode('utf-8')
        receive_file_json = json.loads(receive_file_response)
        receive_file_url = receive_file_json["data"]
    else:
        print(f"format_change 文件上传失败,状态码: {response.status_code}")
        print(f"format_change {response.text}")

    return receive_file_url


def get_filename_and_folder(file_path):
    """Split ``file_path`` into a dot-free base name and its directory.

    Dots inside the base name are replaced with underscores so that a later
    ``os.path.splitext`` on the extension-less output path (see
    ``download_file``) does not mistake part of the name for an extension.

    Returns:
        ``(safe_filename, directory)``.
    """
    original_filename = os.path.basename(file_path)
    filename_without_ext, _ = os.path.splitext(original_filename)
    safe_filename = filename_without_ext.replace('.', '_')
    directory = os.path.dirname(file_path)
    return safe_filename, directory


def _convert_remotely(local_path_in, remote_url, tag, echo_url=False):
    """Upload a file for conversion and download the result next to the input.

    Args:
        local_path_in: Path of the file to convert.
        remote_url: Conversion endpoint to upload to.
        tag: Short label ("p2d", "d2d", "d2p") used in the log line.
        echo_url: Also print the download link returned by the service.

    Returns:
        Path of the downloaded file, or None when the upload or the download
        failed.
    """
    receive_download_url = local_file_2_url(local_path_in, remote_url)
    if echo_url:
        print(receive_download_url)
    if not receive_download_url:
        # The upload was rejected; requesting an empty URL would only add a
        # misleading "invalid URL" error on top of the real one.
        print(f"format_change {tag}: upload failed, no download link returned")
        return None

    # Output goes into the input's folder; download_file appends the extension.
    filename, folder = get_filename_and_folder(local_path_in)
    local_filename = os.path.join(folder, filename)

    # convert_scanned=False: a file that was just produced by the conversion
    # service must not trigger another scanned-PDF round trip.
    downloaded_filepath, _ = download_file(
        receive_download_url, local_filename, convert_scanned=False
    )
    print(f"format_change {tag}:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath


def pdf2docx(local_path_in):
    """Convert a PDF to DOCX through the remote service.

    Returns:
        Path of the DOCX file; the existing path if a same-named .docx is
        already present in the input's folder (no conversion is done);
        "" for an empty input path; None if the conversion failed.
    """
    if not local_path_in:
        return ""

    filename, folder = get_filename_and_folder(local_path_in)
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    return _convert_remotely(local_path_in, _PDF_TO_DOCX_URL, "p2d")


def doc2docx(local_path_in):
    """Convert a DOC to DOCX through the remote service.

    Returns:
        Path of the DOCX file; the existing path if a same-named .docx is
        already present in the input's folder (no conversion is done);
        "" for an empty input path; None if the conversion failed.
    """
    if not local_path_in:
        return ""

    filename, folder = get_filename_and_folder(local_path_in)
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    return _convert_remotely(local_path_in, _DOC_TO_DOCX_URL, "d2d", echo_url=True)


def docx2pdf(local_path_in, force=False):
    """Convert a DOCX file to PDF through the remote service.

    Args:
        local_path_in: Path of the input DOCX file.
        force: Convert and overwrite even if a same-named .pdf already exists
            in the input's folder. Defaults to False.

    Returns:
        Path of the PDF file; the existing path when the .pdf is already
        present and ``force`` is False; "" for an empty input path; None if
        the conversion failed.
    """
    if not local_path_in:
        return ""

    filename, folder = get_filename_and_folder(local_path_in)
    pdf_file_path = os.path.join(folder, f"{filename}.pdf")
    if os.path.exists(pdf_file_path):
        if force:
            print(f"强制转换,覆盖已存在的文件: {pdf_file_path}")
        else:
            print(f"跳过转换,文件已存在: {pdf_file_path}")
            return pdf_file_path

    return _convert_remotely(local_path_in, _DOCX_TO_PDF_URL, "d2p")


def get_pdf_page_count(file_path):
    """Return the number of pages of a PDF, or of a DOC/DOCX converted to PDF.

    A path ending in .doc or .docx is first converted with ``docx2pdf``.
    Any error (including a failed conversion) is printed and reported as 0.
    """
    try:
        pdf_path = file_path
        if file_path.lower().endswith(('.doc', '.docx')):
            pdf_path = docx2pdf(file_path)
        reader = PdfReader(pdf_path)
        return len(reader.pages)
    except Exception as e:
        print(f"读取 PDF 页码时出错:{e}")
        return 0


if __name__ == '__main__':
    # Manual smoke test: point this at a local file of your own.
    local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\有问题的.doc"

    downloaded_file = docx2pdf(local_path_in)
    print(downloaded_file)

    res = pdf2docx(local_path_in)
    print(res)