2024-10-14 10:52:31 +08:00
|
|
|
|
import os
|
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
import requests
|
|
|
|
|
import mimetypes
|
|
|
|
|
|
2024-12-05 15:01:37 +08:00
|
|
|
|
from flask_app.general.clean_pdf import is_scanned_pdf
|
|
|
|
|
from flask_app.general.format_change import pdf2docx, docx2pdf
|
|
|
|
|
|
2024-09-11 12:02:09 +08:00
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
|
def _extension_from_content_type(content_type):
    """Translate a Content-Type header value into a file extension.

    Anything after the first ';' (e.g. "; charset=UTF-8") is dropped before
    the lookup, because the mimetypes table is keyed on the bare
    "type/subtype" and a value carrying parameters may not match.

    Returns the guessed extension (with leading dot), or '.docx' when the
    media type is empty or unknown.
    """
    media_type = content_type.split(';', 1)[0].strip().lower()
    return mimetypes.guess_extension(media_type, strict=False) or '.docx'


def download_file(url, local_filename, timeout=(10, 60)):
    """Download a file and save it locally, choosing the extension from Content-Type.

    Args:
        url (str): URL of the file to download.
        local_filename (str): Local path to save to. Any extension it already
            carries is replaced when it differs from the one derived from the
            response's Content-Type header.
        timeout: Passed straight to ``requests.get``; a
            ``(connect, read)`` tuple or a single number of seconds. Without
            a timeout a stalled server would block the caller indefinitely.

    Returns:
        tuple | None: ``(full_filename, file_code)`` on success, where
        ``file_code`` is:
            1 - .docx
            2 - .pdf
            3 - .doc
            4 - anything else
        ``None`` when the request, the file write, or the PDF post-processing
        raises.

    Side effects:
        Writes the downloaded bytes to ``full_filename``. When the result is
        a PDF that ``is_scanned_pdf`` reports as scanned, the file is run
        through ``pdf2docx`` then ``docx2pdf`` and, if both return a path,
        the original PDF is overwritten with the converted one.
    """
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # non-2xx -> requests.HTTPError

            extension = _extension_from_content_type(
                response.headers.get('Content-Type', '')
            )

            # Keep the caller's name untouched when it already ends with the
            # detected extension; otherwise swap the extension in.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename

            # Stream to disk in chunks so large files are never held in memory.
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        file.write(chunk)

        # The connection is released at this point; the steps below only
        # touch the local file.
        extension_mapping = {
            '.docx': 1,
            '.pdf': 2,
            '.doc': 3,
        }
        file_code = extension_mapping.get(extension.lower(), 4)

        # For PDFs, check whether the document is a scanned one.
        if extension.lower() == '.pdf':
            print(f"Checking if the PDF is scanned: {full_filename}")
            if is_scanned_pdf(full_filename):
                print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
                # Scanned PDF -> DOCX -> regular PDF.
                intermediate_docx = pdf2docx(full_filename)
                if intermediate_docx:
                    normal_pdf = docx2pdf(intermediate_docx)
                    if normal_pdf:
                        # Overwrite the original download with the converted PDF.
                        os.replace(normal_pdf, full_filename)
                        print(f"Replaced scanned PDF with normal PDF: {full_filename}")

        return full_filename, file_code

    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        # Best-effort boundary: any other failure (disk write, conversion
        # helpers) is reported and turned into a None result.
        print(f"download: 发生错误: {e}")

    return None
|
2024-08-29 16:37:09 +08:00
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test for download_file.
    # NOTE(review): the URL below carries Expires/OSSAccessKeyId/Signature
    # query parameters, so it looks like a time-limited signed link that has
    # to be regenerated before this can be re-run — confirm.
    test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/2020-%E5%AE%89%E5%BE%BD-%E5%AE%89%E5%BE%BD%E7%9C%81%E7%94%9F%E6%80%81%E7%8E%AF%E5%A2%83%E5%8E%85%E7%94%B5%E6%A2%AF%E9%87%87%E8%B4%AD.pdf?Expires=1733410076&OSSAccessKeyId=TMP.3KdesMxbSGgK41BgYKRtckUxdQ2LUh2YMYiH1FnuupFhvRuxMbiuBAsXK9oGvHxaLvsDLQjp3db28cUK5YJSEYTGPnUevo&Signature=1cZOCTJPwOJY%2BwCfnLHc0teGTZQ%3D"
    local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'

    downloaded = download_file(test_url, local_file_name)
    if not downloaded:
        # download_file returns None on failure; unpacking it would raise
        # TypeError, so the success path lives in the else branch.
        print("下载文件失败或不支持的文件类型")
    else:
        downloaded_filepath, file_type = downloaded
        print(downloaded_filepath)
        print(file_type)
        # File-type code 4 means the extension was not .docx/.pdf/.doc.
        if file_type == 4:
            print("error")
|