import mimetypes
import os

import requests

from flask_app.general.读取文件.clean_pdf import is_scanned_pdf
from flask_app.general.format_change import pdf2docx, docx2pdf

# Extension used when the Content-Type header is missing or unrecognised.
_DEFAULT_EXTENSION = '.docx'

# File-type codes returned to callers; anything not listed maps to 4 ("other").
_EXTENSION_CODES = {
    '.docx': 1,
    '.pdf': 2,
    '.doc': 3,
}
_OTHER_FILE_CODE = 4


def _extension_from_content_type(content_type):
    """Return the file extension implied by a Content-Type header value.

    Header parameters (for example ``; charset=utf-8``) are stripped and the
    media type is lower-cased before the lookup, so a header that carries
    parameters still resolves to its media type instead of falling through to
    the default. Returns ``'.docx'`` when the type is empty or unknown.
    """
    media_type = content_type.split(';', 1)[0].strip().lower()
    return mimetypes.guess_extension(media_type, strict=False) or _DEFAULT_EXTENSION


def _replace_scanned_pdf(pdf_path):
    """Replace a scanned PDF in place with a PDF -> DOCX -> PDF round trip.

    Best effort: if the file is not detected as scanned, or either conversion
    step returns a falsy result, the original file is left untouched.
    """
    print(f"Checking if the PDF is scanned: {pdf_path}")
    if not is_scanned_pdf(pdf_path):
        return

    print(f"PDF is scanned. Converting to normal PDF: {pdf_path}")
    intermediate_docx = pdf2docx(pdf_path)
    if not intermediate_docx:
        return

    normal_pdf = docx2pdf(intermediate_docx)
    if normal_pdf:
        # Overwrite the downloaded file so callers keep using the same path.
        os.replace(normal_pdf, pdf_path)
        print(f"Replaced scanned PDF with normal PDF: {pdf_path}")


def download_file(url, local_filename, timeout=(10, 120)):
    """Download a file and save it locally with an extension based on Content-Type.

    Args:
        url (str): URL of the file to download.
        local_filename (str): Local target path. If its extension differs from
            the one derived from the response's Content-Type, the extension is
            replaced; a path without an extension simply gets one appended.
        timeout: Passed to ``requests.get``. Either a single number or a
            ``(connect, read)`` tuple of seconds; ``None`` waits forever.

    Returns:
        tuple | None: ``(full_filename, file_code)`` on success, where
        ``file_code`` is 1 for .docx, 2 for .pdf, 3 for .doc and 4 for anything
        else. ``None`` if the download or any later step raised; the error is
        printed rather than propagated.
    """
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.

            extension = _extension_from_content_type(
                response.headers.get('Content-Type', '')
            )

            # Keep the caller's name when its extension already matches.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() == extension:
                full_filename = local_filename
            else:
                full_filename = base + extension

            # Stream to disk in chunks so large files are never held in memory.
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Skip empty chunks.
                        file.write(chunk)

        file_code = _EXTENSION_CODES.get(extension.lower(), _OTHER_FILE_CODE)

        if extension.lower() == '.pdf':
            _replace_scanned_pdf(full_filename)

        return full_filename, file_code
    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        # Boundary handler: callers rely on a None return instead of an exception.
        print(f"download: 发生错误: {e}")
    return None


if __name__ == '__main__':
    # Manual smoke test.
    # NOTE(review): the URL carries an `Expires` query parameter and signed
    # credentials, so it is presumably a pre-signed link that stops working
    # after that time -- replace it before running.
    test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/2020-%E5%AE%89%E5%BE%BD-%E5%AE%89%E5%BE%BD%E7%9C%81%E7%94%9F%E6%80%81%E7%8E%AF%E5%A2%83%E5%8E%85%E7%94%B5%E6%A2%AF%E9%87%87%E8%B4%AD.pdf?Expires=1733410076&OSSAccessKeyId=TMP.3KdesMxbSGgK41BgYKRtckUxdQ2LUh2YMYiH1FnuupFhvRuxMbiuBAsXK9oGvHxaLvsDLQjp3db28cUK5YJSEYTGPnUevo&Signature=1cZOCTJPwOJY%2BwCfnLHc0teGTZQ%3D"
    local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
    downloaded = download_file(test_url, local_file_name)
    if not downloaded:
        print("下载文件失败或不支持的文件类型")
    else:
        # Only unpack on success; download_file returns None on failure.
        downloaded_filepath, file_type = downloaded
        print(downloaded_filepath)
        print(file_type)
        # Code 4 means the file type is not one of .docx / .pdf / .doc.
        if file_type == 4:
            print("error")