2.18 解决文档转换失败就报错的问题
This commit is contained in:
parent
d4a9d4edae
commit
d0dc142547
@ -20,7 +20,7 @@ def download_file(url, local_filename,enable=False):
|
||||
2 - .pdf
|
||||
3 - .doc
|
||||
4 - 其他
|
||||
- None: 下载失败
|
||||
- "": 下载失败
|
||||
"""
|
||||
try:
|
||||
with requests.get(url, stream=True) as response:
|
||||
@ -72,7 +72,7 @@ def download_file(url, local_filename,enable=False):
|
||||
except Exception as e:
|
||||
print(f"download: 发生错误: {e}")
|
||||
|
||||
return None,4
|
||||
return "",4
|
||||
|
||||
def local_file_2_url(file_path, url):
|
||||
receive_file_url = ""
|
||||
|
@ -67,6 +67,9 @@ def delete_mark(docx_path):
|
||||
如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落,也一并删除。
|
||||
修改后的文档保存为 invalid_del.docx,并返回新文件路径。
|
||||
"""
|
||||
if not docx_path:
|
||||
print("Invalid input: docx_path is None or empty.")
|
||||
return ""
|
||||
try:
|
||||
doc = Document(docx_path)
|
||||
except KeyError as e:
|
||||
|
@ -173,19 +173,24 @@ def parse_text_by_heading(text):
|
||||
data[f"{main_number}."] = temp_title
|
||||
temp_title = None # 重置临时标题
|
||||
|
||||
# 保存阿拉伯数字的标题内容
|
||||
last_char = current_content[-1][-1] if current_content and current_content[-1] else None
|
||||
|
||||
if current_key is None or (compare_headings(current_key, new_key) and (
|
||||
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
||||
not current_content or last_char != '第')):
|
||||
if current_key is not None:
|
||||
content_string = ''.join(current_content).strip()
|
||||
data[current_key] = data.get(current_key, '') + content_string
|
||||
current_key = new_key
|
||||
current_content = [line_content]
|
||||
append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(re.search(pattern, line_content) for pattern in special_section_keywords)
|
||||
append_newline = (
|
||||
len(new_key.rstrip('.').split('.')) <= 2 or
|
||||
any(re.search(pattern, line_content) for pattern in special_section_keywords)
|
||||
)
|
||||
last_main_number = new_key.split('.')[0]
|
||||
else:
|
||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||
in_special_section)
|
||||
append_newline = handle_content_append(
|
||||
current_content, line_stripped, append_newline, keywords, in_special_section
|
||||
)
|
||||
|
||||
elif dot_match:
|
||||
if in_double_hash_mode:
|
||||
@ -440,54 +445,58 @@ def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern
|
||||
|
||||
|
||||
def convert_clause_to_json(file_path, output_folder, type=1):
|
||||
if not os.path.exists(file_path):
|
||||
print(f"The specified file does not exist: 返回空的clause_path")
|
||||
return ""
|
||||
if type == 2:
|
||||
start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$'
|
||||
start_pattern_2 = None
|
||||
end_pattern = (
|
||||
r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|' # 第一部分
|
||||
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前瞻部分
|
||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$' # 第二部分
|
||||
)
|
||||
else:
|
||||
start_pattern_1 = (
|
||||
r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
|
||||
r'[一二12][、..]+|' # 匹配包含“一、二、12”以及逗号、点号等的部分
|
||||
r'[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' # 匹配“说明、总则、名词解释”这类关键词
|
||||
)
|
||||
start_pattern_2 = (
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前面的“见”,“与”,“”等字符
|
||||
r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|' # 匹配“投标人”,“磋商”,“谈判”,“供应商”,“应答人”后跟“须知”
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 添加“第X章/部分”部分
|
||||
)
|
||||
try:
|
||||
if not os.path.exists(file_path):
|
||||
print(f"The specified file does not exist: 返回空的clause_path")
|
||||
return ""
|
||||
if type == 2:
|
||||
start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$'
|
||||
start_pattern_2 = None
|
||||
end_pattern = (
|
||||
r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|' # 第一部分
|
||||
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前瞻部分
|
||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$' # 第二部分
|
||||
)
|
||||
else:
|
||||
start_pattern_1 = (
|
||||
r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
|
||||
r'[一二12][、..]+|' # 匹配包含“一、二、12”以及逗号、点号等的部分
|
||||
r'[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' # 匹配“说明、总则、名词解释”这类关键词
|
||||
)
|
||||
start_pattern_2 = (
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前面的“见”,“与”,“”等字符
|
||||
r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|' # 匹配“投标人”,“磋商”,“谈判”,“供应商”,“应答人”后跟“须知”
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 添加“第X章/部分”部分
|
||||
)
|
||||
|
||||
end_pattern = (
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
||||
r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[::]|'
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[::]|'
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[::]'
|
||||
)
|
||||
if file_path.endswith('.pdf'):
|
||||
text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
|
||||
# print(text)
|
||||
else:
|
||||
raise ValueError("Unsupported file format")
|
||||
parsed_data = parse_text_by_heading(text)
|
||||
# result = convert_to_json(input_path, start_word, end_pattern)
|
||||
# 检查输出文件夹是否存在,如果不存在则创建
|
||||
if not os.path.exists(output_folder):
|
||||
os.makedirs(output_folder)
|
||||
print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
|
||||
file_name = "clause1.json" if type == 1 else "clause2.json"
|
||||
# file_name = f"clause{suffix_counter}.json"
|
||||
output_path = os.path.join(output_folder, file_name)
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(parsed_data, f, indent=4, ensure_ascii=False)
|
||||
print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
|
||||
return output_path
|
||||
end_pattern = (
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
||||
r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[::]|'
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[::]|'
|
||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[::]'
|
||||
)
|
||||
if file_path.endswith('.pdf'):
|
||||
text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
|
||||
# print(text)
|
||||
else:
|
||||
raise ValueError("Unsupported file format")
|
||||
parsed_data = parse_text_by_heading(text)
|
||||
# result = convert_to_json(input_path, start_word, end_pattern)
|
||||
# 检查输出文件夹是否存在,如果不存在则创建
|
||||
if not os.path.exists(output_folder):
|
||||
os.makedirs(output_folder)
|
||||
print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
|
||||
file_name = "clause1.json" if type == 1 else "clause2.json"
|
||||
# file_name = f"clause{suffix_counter}.json"
|
||||
output_path = os.path.join(output_folder, file_name)
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(parsed_data, f, indent=4, ensure_ascii=False)
|
||||
print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
|
||||
return output_path
|
||||
except Exception as e:
|
||||
print(f"Error in convert_clause_to_json: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -195,14 +195,15 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||
pdf_path=r'C:\Users\Administrator\Downloads\(完全没有解析进度)1_南网超高压公司2024年第四批服务公开招标项目(2024-FW-4-S-ZB2) (1).pdf'
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新(W改)_evaluation_method.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
||||
# ress = extract_common_header(pdf_path)
|
||||
# print(ress)
|
||||
# print("-----------------")
|
||||
res=extract_text_by_page_fitz(pdf_path)
|
||||
ress = extract_common_header(pdf_path)
|
||||
print(ress)
|
||||
print("-----------------")
|
||||
extract_text_by_page(pdf_path)
|
||||
# res=extract_text_by_page_fitz(pdf_path)
|
||||
# print(res)磋商文件_tobidders_notice_part2.pdf
|
||||
# save_extracted_text_to_txt(file_path,"output.txt")
|
||||
|
@ -41,7 +41,7 @@ def get_deviation(): #提供商务、技术偏离的数据
|
||||
downloaded_filename = os.path.join(output_folder, filename)
|
||||
# 下载文件
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
|
||||
if downloaded_filepath is None or file_type == 4:
|
||||
if not downloaded_filepath or file_type == 4:
|
||||
logger.error("下载地址不存在或不支持的文件类型!")
|
||||
log_error_unique_id(unique_id, 3)
|
||||
response = create_response(
|
||||
|
@ -67,7 +67,7 @@ def download_and_process_file(file_url, zb_type):
|
||||
mapped_zb_type = 2 if zb_type == 3 else zb_type
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
|
||||
|
||||
if downloaded_filepath is None or file_type == 4:
|
||||
if not downloaded_filepath or file_type == 4:
|
||||
return None
|
||||
|
||||
logger.info(f"Local file path: {downloaded_filepath}")
|
||||
|
@ -59,25 +59,12 @@ def process_and_stream(file_url, zb_type):
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
downloaded = download_file(file_url, downloaded_filename,True)
|
||||
if not downloaded:
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True)
|
||||
if not downloaded_filepath or file_type == 4:
|
||||
logger.error("下载文件失败或不支持的文件类型")
|
||||
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
|
||||
log_error_unique_id(unique_id, 1)
|
||||
error_response = create_response(
|
||||
message='文件处理失败',
|
||||
status='error',
|
||||
data=''
|
||||
)
|
||||
yield sse_format(error_response)
|
||||
return
|
||||
|
||||
downloaded_filepath, file_type = downloaded
|
||||
|
||||
if file_type == 4:
|
||||
logger.error("不支持的文件类型")
|
||||
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
|
||||
error_response = create_response(
|
||||
message='不支持的文件类型',
|
||||
message='下载文件失败或不支持的文件类型',
|
||||
status='error',
|
||||
data=''
|
||||
)
|
||||
|
@ -288,13 +288,14 @@ def engineering_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
#TODO:基本信息,判断是否这里,打勾逻辑取消了。
|
||||
if __name__ == "__main__":
|
||||
start_time = time.time()
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
|
||||
file_type = 2 #1:docx 2:pdf 3:其他
|
||||
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
|
||||
input_file = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
|
||||
print("yes")
|
||||
# for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
|
||||
# print(output)
|
||||
preprocess_files(output_folder,input_file,2,"121")
|
||||
for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
|
||||
print(output)
|
||||
# preprocess_files(output_folder,input_file,2,"121")
|
||||
# engineering_bid_main(output_folder,input_file,2,"111")
|
||||
end_time = time.time()
|
||||
elapsed_time = end_time - start_time # 计算耗时
|
||||
print(f"Function execution took {elapsed_time} seconds.")
|
||||
|
@ -310,17 +310,17 @@ if __name__ == "__main__":
|
||||
unique_id = "uuidzyzy11"
|
||||
logger = get_global_logger(unique_id)
|
||||
|
||||
output_folder = "flask_app/static/output/zytest1"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\货物标\little_parse"
|
||||
file_type = 2 # 1:docx 2:pdf 3:其他
|
||||
input_file = r"C:\Users\Administrator\Desktop\fsdownload\e5c8ca13-6043-49e5-a156-685bc1aabb58\ztbfile.pdf"
|
||||
input_file = r"C:\Users\Administrator\Desktop\货物标\little_parse\ztbfile.pdf"
|
||||
start_time = time.time()
|
||||
|
||||
# preprocess_files(output_folder, input_file, file_type, logger)
|
||||
# 创建生成器
|
||||
generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
|
||||
# 迭代生成器,逐步获取和处理结果
|
||||
for output in generator:
|
||||
print(output)
|
||||
preprocess_files(output_folder, input_file, file_type, logger)
|
||||
# # 创建生成器
|
||||
# generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
|
||||
# # 迭代生成器,逐步获取和处理结果
|
||||
# for output in generator:
|
||||
# print(output)
|
||||
|
||||
end_time = time.time()
|
||||
elapsed_time = end_time - start_time # 计算耗时
|
||||
|
@ -311,11 +311,11 @@ if __name__ == "__main__":
|
||||
logger = get_global_logger("123")
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
|
||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
|
||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||
selection = 4 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
|
||||
selection = 2 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
|
||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||
print(generated_files)
|
||||
|
Loading…
x
Reference in New Issue
Block a user