12.6
This commit is contained in:
parent
3c85ce0c98
commit
95dde18aab
@ -1,8 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
def extract_common_header(pdf_path):
|
def extract_common_header(pdf_path):
|
||||||
from PyPDF2 import PdfReader
|
|
||||||
|
|
||||||
def get_headers(pdf_document, start_page, pages_to_read):
|
def get_headers(pdf_document, start_page, pages_to_read):
|
||||||
headers = []
|
headers = []
|
||||||
@ -80,4 +78,20 @@ def clean_page_content(text, common_header):
|
|||||||
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
|
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
|
||||||
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
|
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
|
||||||
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def is_scanned_pdf(file_path):
    """
    Report whether a PDF appears to be a scanned (image-only) document.

    Pages are inspected in order and the check stops at the first page that
    yields any non-whitespace text.

    Args:
        file_path (str): Path of the PDF file to inspect.

    Returns:
        bool: False as soon as one page yields extractable text; True when no
        page does (this includes a PDF that has no pages at all).
    """
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page in reader.pages:
            # Defensive: coerce a missing extraction result to an empty string
            # so that .strip() cannot fail on a non-string value.
            page_text = page.extract_text() or ""
            if page_text.strip():  # the page carries a text layer
                return False  # not a scanned PDF
    return True  # no page produced any text
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test: classify a single local sample PDF and print the verdict.
    file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
    res = is_scanned_pdf(file_path)
    # "扫描型" = scanned PDF, "普通型" = regular (text-based) PDF.
    print("扫描型" if res else "普通型")
|
@ -1,11 +1,77 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import time
|
import mimetypes
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from flask_app.main.download import download_file
|
from flask_app.general.clean_pdf import is_scanned_pdf
|
||||||
|
def download_file(url, local_filename):
    """
    Download a file and save it locally, choosing the extension from Content-Type.

    When the saved file is a PDF for which is_scanned_pdf() reports no
    extractable text, it is converted PDF -> DOCX -> PDF through pdf2docx()
    and docx2pdf(), and the converted PDF replaces the downloaded one.

    Args:
        url (str): URL of the file to download.
        local_filename (str): Local target path. If its extension differs from
            the one derived from the response's Content-Type, the extension is
            replaced with the derived one.

    Returns:
        tuple | None: ``(full_filename, file_code)`` on success, where
        ``file_code`` is 1 for .docx, 2 for .pdf, 3 for .doc and 4 for any
        other extension. None when the download or post-processing raised.
    """
    try:
        # A timeout keeps a stalled server from blocking the caller forever;
        # values are (connect seconds, seconds between received bytes).
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()  # turn HTTP error statuses into exceptions

            # Derive the extension from the media type only: a header such as
            # "application/pdf; charset=UTF-8" is not recognised by
            # mimetypes.guess_extension() while the parameters are attached.
            content_type = response.headers.get('Content-Type', '')
            media_type = content_type.split(';', 1)[0].strip()
            extension = mimetypes.guess_extension(media_type, strict=False) or '.docx'

            # Split the requested name into base and existing extension.
            # NOTE(review): os.path.splitext treats everything after the last dot
            # as the extension, so an extension-less name that itself contains a
            # dot would be truncated here - confirm against callers.
            base, ext = os.path.splitext(local_filename)
            if ext.lower() != extension:
                full_filename = base + extension
            else:
                full_filename = local_filename

            # Stream the body to disk chunk by chunk.
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        file.write(chunk)

        # The HTTP connection is released at this point, so the (possibly slow)
        # conversion below does not keep it open.

        # Map the extension to the numeric file-type code.
        extension_mapping = {
            '.docx': 1,
            '.pdf': 2,
            '.doc': 3
        }
        file_code = extension_mapping.get(extension.lower(), 4)

        # For PDFs, check whether the file is scanned and, if so, rebuild it.
        if extension.lower() == '.pdf':
            print(f"Checking if the PDF is scanned: {full_filename}")
            if is_scanned_pdf(full_filename):
                print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
                # Scanned PDF -> DOCX -> regular PDF.
                intermediate_docx = pdf2docx(full_filename)
                if intermediate_docx:
                    normal_pdf = docx2pdf(intermediate_docx)
                    if normal_pdf:
                        # Replace the downloaded PDF with the converted one.
                        os.replace(normal_pdf, full_filename)
                        print(f"Replaced scanned PDF with normal PDF: {full_filename}")
        return full_filename, file_code

    except requests.HTTPError as e:
        print(f"download: HTTP 错误: {e}")
    except requests.RequestException as e:
        print(f"download: 下载文件时出错: {e}")
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller via None.
        print(f"download: 发生错误: {e}")

    return None
|
||||||
|
|
||||||
def upload_file(file_path, url):
|
def upload_file(file_path, url):
|
||||||
receive_file_url = ""
|
receive_file_url = ""
|
||||||
@ -50,12 +116,22 @@ def get_filename_and_folder(file_path):
|
|||||||
def pdf2docx(local_path_in):
    """
    Convert a PDF to DOCX through the remote conversion service.

    The DOCX is written next to the input file under the same base name. If a
    file with that name and a .docx extension already exists, the conversion
    is skipped and the existing path is returned.

    Args:
        local_path_in (str): Path of the PDF to convert.

    Returns:
        str: Path of the resulting DOCX file, or "" when no input path was
        given or the converted file could not be downloaded.
    """
    if not local_path_in:
        return ""

    # Base name and containing folder; input and output share the same folder.
    filename, folder = get_filename_and_folder(local_path_in)

    # Reuse an already converted .docx instead of calling the service again.
    docx_file_path = os.path.join(folder, f"{filename}.docx")
    if os.path.exists(docx_file_path):
        print(f"Skipping conversion, {docx_file_path} already exists.")
        return docx_file_path

    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
    # Upload for conversion; the service answers with a download link.
    receive_download_url = upload_file(local_path_in, remote_url)

    # Output path without extension; download_file picks the extension itself.
    local_filename = os.path.join(folder, filename)

    # download_file returns None on failure, so check before unpacking the
    # (path, type code) tuple instead of failing with a TypeError.
    downloaded = download_file(receive_download_url, local_filename)
    if not downloaded:
        print(f"format_change p2d: failed to download converted file for: {local_path_in}")
        return ""

    downloaded_filepath, file_type = downloaded
    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
|
||||||
|
|
||||||
@ -183,11 +259,22 @@ if __name__ == '__main__':
|
|||||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
||||||
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
||||||
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
|
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
|
||||||
local_path_in = r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile.pdf"
|
local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
|
||||||
downloaded_file=pdf2docx(local_path_in)
|
downloaded_file=pdf2docx(local_path_in)
|
||||||
# downloaded_file=pdf2docx(local_path_in)
|
# # downloaded_file=pdf2docx(local_path_in)
|
||||||
# downloaded_file=docx2pdf(local_path_in)
|
# # downloaded_file=docx2pdf(local_path_in)
|
||||||
print(downloaded_file)
|
print(downloaded_file)
|
||||||
|
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf?Expires=1733410874&OSSAccessKeyId=TMP.3KdesMxbSGgK41BgYKRtckUxdQ2LUh2YMYiH1FnuupFhvRuxMbiuBAsXK9oGvHxaLvsDLQjp3db28cUK5YJSEYTGPnUevo&Signature=EjLj3KeLtj337lS1DEJO56471Tg%3D"
|
||||||
|
# local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
|
||||||
|
# downloaded = download_file(test_url, local_file_name)
|
||||||
|
# if not downloaded:
|
||||||
|
# print("下载文件失败或不支持的文件类型")
|
||||||
|
# downloaded_filepath, file_type = downloaded
|
||||||
|
# print(downloaded_filepath)
|
||||||
|
# print(file_type)
|
||||||
|
# # 检查文件类型
|
||||||
|
# if file_type == 4:
|
||||||
|
# print("error")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -116,13 +116,13 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||||
file_path=r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件(广水市教育局封闭管理)_procurement.pdf"
|
file_path=r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||||
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
||||||
# ress = extract_common_header(file_path)
|
# ress = extract_common_header(file_path)
|
||||||
# print(ress)
|
# print(ress)
|
||||||
# res=extract_text_by_page(file_path)
|
res=extract_text_by_page(file_path)
|
||||||
# print(res)磋商文件_tobidders_notice_part2.pdf
|
# print(res)磋商文件_tobidders_notice_part2.pdf
|
||||||
save_extracted_text_to_txt(file_path,"output.txt")
|
# save_extracted_text_to_txt(file_path,"output.txt")
|
@ -3,6 +3,9 @@ import os
|
|||||||
import requests
|
import requests
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
|
from flask_app.general.clean_pdf import is_scanned_pdf
|
||||||
|
from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||||
|
|
||||||
|
|
||||||
def download_file(url, local_filename):
|
def download_file(url, local_filename):
|
||||||
"""
|
"""
|
||||||
@ -49,6 +52,19 @@ def download_file(url, local_filename):
|
|||||||
'.doc': 3
|
'.doc': 3
|
||||||
}
|
}
|
||||||
file_code = extension_mapping.get(extension.lower(), 4)
|
file_code = extension_mapping.get(extension.lower(), 4)
|
||||||
|
# 如果是 PDF,判断是否为扫描型
|
||||||
|
if extension.lower() == '.pdf':
|
||||||
|
print(f"Checking if the PDF is scanned: {full_filename}")
|
||||||
|
if is_scanned_pdf(full_filename):
|
||||||
|
print(f"PDF is scanned. Converting to normal PDF: {full_filename}")
|
||||||
|
# 转换扫描型 PDF -> DOCX -> 普通 PDF
|
||||||
|
intermediate_docx = pdf2docx(full_filename)
|
||||||
|
if intermediate_docx:
|
||||||
|
normal_pdf = docx2pdf(intermediate_docx)
|
||||||
|
if normal_pdf:
|
||||||
|
# 替换原始 PDF 文件
|
||||||
|
os.replace(normal_pdf, full_filename)
|
||||||
|
print(f"Replaced scanned PDF with normal PDF: {full_filename}")
|
||||||
return full_filename, file_code
|
return full_filename, file_code
|
||||||
|
|
||||||
except requests.HTTPError as e:
|
except requests.HTTPError as e:
|
||||||
@ -62,8 +78,8 @@ def download_file(url, local_filename):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# 测试下载的URL
|
# 测试下载的URL
|
||||||
test_url ="http://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/444d68db95c3438ca481f029500f4f9b.pdf?Expires=1729325467&OSSAccessKeyId=LTAI5tJmL4Nvg95VjgUNvVL6&Signature=2%2Bax895ik7DoFs%2FHVYeN29%2BEiTo%3D&x-oss-process=doc%2Fedit"
|
test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/2020-%E5%AE%89%E5%BE%BD-%E5%AE%89%E5%BE%BD%E7%9C%81%E7%94%9F%E6%80%81%E7%8E%AF%E5%A2%83%E5%8E%85%E7%94%B5%E6%A2%AF%E9%87%87%E8%B4%AD.pdf?Expires=1733410076&OSSAccessKeyId=TMP.3KdesMxbSGgK41BgYKRtckUxdQ2LUh2YMYiH1FnuupFhvRuxMbiuBAsXK9oGvHxaLvsDLQjp3db28cUK5YJSEYTGPnUevo&Signature=1cZOCTJPwOJY%2BwCfnLHc0teGTZQ%3D"
|
||||||
local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload'
|
local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
|
||||||
downloaded = download_file(test_url, local_file_name)
|
downloaded = download_file(test_url, local_file_name)
|
||||||
if not downloaded:
|
if not downloaded:
|
||||||
print("下载文件失败或不支持的文件类型")
|
print("下载文件失败或不支持的文件类型")
|
@ -3,7 +3,7 @@
|
|||||||
from flask import Blueprint, jsonify, Response, g
|
from flask import Blueprint, jsonify, Response, g
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from flask_app.main.download import download_file
|
from flask_app.general.format_change import download_file
|
||||||
from flask_app.routes.接口_技术偏离表 import get_tech_and_business_deviation
|
from flask_app.routes.接口_技术偏离表 import get_tech_and_business_deviation
|
||||||
from flask_app.routes.utils import generate_deviation_response, validate_and_setup_logger
|
from flask_app.routes.utils import generate_deviation_response, validate_and_setup_logger
|
||||||
from flask_app.ConnectionLimiter import require_connection_limit
|
from flask_app.ConnectionLimiter import require_connection_limit
|
||||||
|
@ -5,7 +5,7 @@ import os
|
|||||||
from flask import Blueprint, jsonify, g
|
from flask import Blueprint, jsonify, g
|
||||||
|
|
||||||
from flask_app.ConnectionLimiter import require_connection_limit
|
from flask_app.ConnectionLimiter import require_connection_limit
|
||||||
from flask_app.main.download import download_file
|
from flask_app.general.format_change import download_file
|
||||||
from flask_app.routes.接口_小解析 import little_parse_main
|
from flask_app.routes.接口_小解析 import little_parse_main
|
||||||
from flask_app.routes.utils import validate_and_setup_logger
|
from flask_app.routes.utils import validate_and_setup_logger
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from flask_app.main.download import download_file
|
from flask_app.general.format_change import download_file
|
||||||
from flask_app.routes.工程标解析main import engineering_bid_main
|
from flask_app.routes.工程标解析main import engineering_bid_main
|
||||||
from flask_app.routes.货物标解析main import goods_bid_main
|
from flask_app.routes.货物标解析main import goods_bid_main
|
||||||
from flask_app.general.post_processing import outer_post_processing
|
from flask_app.general.post_processing import outer_post_processing
|
||||||
|
@ -45,8 +45,8 @@ def little_parse_goods(output_folder, pdf_path):
|
|||||||
# 上传文件并获取文件 ID
|
# 上传文件并获取文件 ID
|
||||||
file_id = upload_file(baseinfo_file_path)
|
file_id = upload_file(baseinfo_file_path)
|
||||||
# 注意:以下路径被硬编码,确保该路径存在并且正确
|
# 注意:以下路径被硬编码,确保该路径存在并且正确
|
||||||
baseinfo_prompt_file_path='flask_app/static/提示词/小解析基本信息货物标.txt'
|
# baseinfo_prompt_file_path='flask_app/static/提示词/小解析基本信息货物标.txt'
|
||||||
# baseinfo_prompt_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\小解析基本信息货物标.txt'
|
baseinfo_prompt_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\小解析基本信息货物标.txt'
|
||||||
# 从提示词文件中读取问题
|
# 从提示词文件中读取问题
|
||||||
questions = read_questions_from_file(baseinfo_prompt_file_path)
|
questions = read_questions_from_file(baseinfo_prompt_file_path)
|
||||||
# 多线程处理问题,使用指定的处理模式(2 代表使用 qianwen-long)
|
# 多线程处理问题,使用指定的处理模式(2 代表使用 qianwen-long)
|
||||||
@ -151,10 +151,10 @@ if __name__ == "__main__":
|
|||||||
# zb_type=2 #1:工程标 2:货物标
|
# zb_type=2 #1:工程标 2:货物标
|
||||||
# input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
|
# input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
|
||||||
|
|
||||||
output_folder=r"C:\Users\Administrator\Desktop\fsdownload\865a5d46-a5f8-467a-8374-c71c415d0af9\tmp"
|
output_folder=r"C:\Users\Administrator\Desktop\货物标\zbfiles\tmp"
|
||||||
zb_type=1
|
zb_type=2 #1:工程 2:货物
|
||||||
# input_file=r"C:\Users\Administrator\Desktop\fsdownload\865a5d46-a5f8-467a-8374-c71c415d0af9\ztbfile.pdf"
|
# input_file=r"C:\Users\Administrator\Desktop\fsdownload\865a5d46-a5f8-467a-8374-c71c415d0af9\ztbfile.pdf"
|
||||||
input_file=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
|
input_file=r"C:\Users\Administrator\Desktop\货物标\zbfiles\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
||||||
final_json_path=little_parse_main(output_folder, input_file, file_type, zb_type,"122334")
|
final_json_path=little_parse_main(output_folder, input_file, file_type, zb_type,"122334")
|
||||||
with open(final_json_path, 'r', encoding='utf-8') as f:
|
with open(final_json_path, 'r', encoding='utf-8') as f:
|
||||||
# logger.info('final_json_path:' + final_json_path)
|
# logger.info('final_json_path:' + final_json_path)
|
||||||
|
@ -29,14 +29,13 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
|||||||
logger.info("starting 文件预处理...")
|
logger.info("starting 文件预处理...")
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
logger.info("output_folder..." + output_folder)
|
logger.info("output_folder..." + output_folder)
|
||||||
|
|
||||||
# 根据文件类型处理文件路径
|
# 根据文件类型处理文件路径
|
||||||
if file_type == 1: # docx
|
if file_type == 1: # docx
|
||||||
docx_path = file_path
|
docx_path = file_path
|
||||||
pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理
|
pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理
|
||||||
elif file_type == 2: # pdf
|
elif file_type == 2: # pdf
|
||||||
pdf_path = file_path
|
pdf_path = file_path
|
||||||
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
docx_path = pdf2docx(pdf_path)
|
||||||
elif file_type == 3: # doc
|
elif file_type == 3: # doc
|
||||||
pdf_path = docx2pdf(file_path)
|
pdf_path = docx2pdf(file_path)
|
||||||
docx_path = doc2docx(file_path)
|
docx_path = doc2docx(file_path)
|
||||||
@ -44,8 +43,6 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
|||||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# # 异步上传知识库
|
|
||||||
# future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id)
|
|
||||||
# 调用截取PDF多次
|
# 调用截取PDF多次
|
||||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger) # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
|
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger) # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ def create_app():
|
|||||||
output_folder = getattr(g, 'output_folder', None)
|
output_folder = getattr(g, 'output_folder', None)
|
||||||
if output_folder:
|
if output_folder:
|
||||||
# 执行与output_folder相关的清理操作(例如删除临时文件)
|
# 执行与output_folder相关的清理操作(例如删除临时文件)
|
||||||
logger = app.logger # 使用 app 的 logger
|
logger = g.logger # 使用 app 的 logger
|
||||||
logger.info(f"正在清理输出文件夹: {output_folder}")
|
logger.info(f"正在清理输出文件夹: {output_folder}")
|
||||||
file_ids = read_file_ids(output_folder)
|
file_ids = read_file_ids(output_folder)
|
||||||
failed_file_ids = delete_file_by_ids(file_ids)
|
failed_file_ids = delete_file_by_ids(file_ids)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user