zbparse/flask_app/main/download.py

76 lines
2.6 KiB
Python
Raw Normal View History

2024-10-14 10:52:31 +08:00
import os
2024-08-29 16:37:09 +08:00
import requests
import mimetypes
2024-08-29 16:37:09 +08:00
def download_file(url, local_filename, timeout=30):
    """
    Download a file and save it locally, picking the extension from Content-Type.

    The extension is guessed from the media type in the response's
    Content-Type header; when no extension can be guessed, '.docx' is used.
    If ``local_filename`` already carries a different extension, that
    extension is replaced by the guessed one.

    Args:
        url (str): URL of the file to download.
        local_filename (str): Local path to save to, normally without an
            extension.
        timeout (float | tuple): Timeout in seconds passed to
            ``requests.get`` so a stalled server cannot block forever.

    Returns:
        tuple | None: ``(full_filename, file_code)`` on success, where
        ``file_code`` is 1 for .docx, 2 for .pdf, 3 for .doc and 4 for
        anything else. ``None`` if the download failed; the error is
        printed rather than raised.
    """
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # raise on 4xx/5xx responses

            # Content-Type may carry parameters ("application/pdf; charset=UTF-8").
            # mimetypes only matches the bare media type, so drop everything
            # after the first ';' before guessing.
            content_type = response.headers.get('Content-Type', '')
            mime_type = content_type.split(';', 1)[0].strip().lower()
            extension = mimetypes.guess_extension(mime_type, strict=False) or '.docx'

            # Replace any existing extension that disagrees with the guessed one.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename

            # Stream the body to disk in chunks to keep memory use bounded.
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        file.write(chunk)

            # Map the extension to the numeric type code returned to callers.
            extension_mapping = {
                '.docx': 1,
                '.pdf': 2,
                '.doc': 3,
            }
            file_code = extension_mapping.get(extension.lower(), 4)
            return full_filename, file_code
    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        # Best-effort boundary: report any other failure (e.g. file-system
        # errors) and signal it through the None return value.
        print(f"download: 发生错误: {e}")
    return None
2024-08-29 16:37:09 +08:00
if __name__ == '__main__':
    # Manual smoke test for download_file.
    # NOTE(review): the URL below looks like a time-limited signed link (it
    # carries an Expires parameter) and embeds an access key id -- confirm it
    # is safe to keep in source control and refresh it before running.
    test_url = "http://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/444d68db95c3438ca481f029500f4f9b.pdf?Expires=1729325467&OSSAccessKeyId=LTAI5tJmL4Nvg95VjgUNvVL6&Signature=2%2Bax895ik7DoFs%2FHVYeN29%2BEiTo%3D&x-oss-process=doc%2Fedit"
    local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload'
    downloaded = download_file(test_url, local_file_name)
    if not downloaded:
        print("下载文件失败或不支持的文件类型")
    else:
        # Unpack only on success; download_file returns None on failure and
        # unpacking None would raise TypeError.
        downloaded_filepath, file_type = downloaded
        print(downloaded_filepath)
        print(file_type)
        # Type code 4 means the extension is not .docx, .pdf or .doc.
        if file_type == 4:
            print("error")