2.18 解决文档转换失败就报错的问题

This commit is contained in:
zy123 2025-02-18 21:11:37 +08:00
parent d4a9d4edae
commit d0dc142547
10 changed files with 95 additions and 94 deletions

View File

@ -20,7 +20,7 @@ def download_file(url, local_filename,enable=False):
2 - .pdf
3 - .doc
4 - 其他
- None: 下载失败
- "": 下载失败
"""
try:
with requests.get(url, stream=True) as response:
@ -72,7 +72,7 @@ def download_file(url, local_filename,enable=False):
except Exception as e:
print(f"download: 发生错误: {e}")
return None,4
return "",4
def local_file_2_url(file_path, url):
receive_file_url = ""

View File

@ -67,6 +67,9 @@ def delete_mark(docx_path):
如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落也一并删除
修改后的文档保存为 invalid_del.docx并返回新文件路径
"""
if not docx_path:
print("Invalid input: docx_path is None or empty.")
return ""
try:
doc = Document(docx_path)
except KeyError as e:

View File

@ -173,19 +173,24 @@ def parse_text_by_heading(text):
data[f"{main_number}."] = temp_title
temp_title = None # 重置临时标题
# 保存阿拉伯数字的标题内容
last_char = current_content[-1][-1] if current_content and current_content[-1] else None
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')):
not current_content or last_char != '')):
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string
current_key = new_key
current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(re.search(pattern, line_content) for pattern in special_section_keywords)
append_newline = (
len(new_key.rstrip('.').split('.')) <= 2 or
any(re.search(pattern, line_content) for pattern in special_section_keywords)
)
last_main_number = new_key.split('.')[0]
else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
in_special_section)
append_newline = handle_content_append(
current_content, line_stripped, append_newline, keywords, in_special_section
)
elif dot_match:
if in_double_hash_mode:
@ -440,54 +445,58 @@ def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern
def convert_clause_to_json(file_path, output_folder, type=1):
if not os.path.exists(file_path):
print(f"The specified file does not exist: 返回空的clause_path")
return ""
if type == 2:
start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$'
start_pattern_2 = None
end_pattern = (
r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|' # 第一部分
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定后顾(negative lookbehind)部分
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$' # 第二部分
)
else:
start_pattern_1 = (
r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
r'[一二12][、..]+|' # 匹配包含“一、二、12”以及逗号、点号等的部分
r'[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' # 匹配“说明、总则、名词解释”这类关键词
)
start_pattern_2 = (
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前面的“见”,“与”,“”等字符
r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|' # 匹配“投标人”,“磋商”,“谈判”,“供应商”,“应答人”后跟“须知”
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 添加“第X章/部分”部分
)
try:
if not os.path.exists(file_path):
print(f"The specified file does not exist: 返回空的clause_path")
return ""
if type == 2:
start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$'
start_pattern_2 = None
end_pattern = (
r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|' # 第一部分
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定后顾(negative lookbehind)部分
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$' # 第二部分
)
else:
start_pattern_1 = (
r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
r'[一二12][、..]+|' # 匹配包含“一、二、12”以及逗号、点号等的部分
r'[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' # 匹配“说明、总则、名词解释”这类关键词
)
start_pattern_2 = (
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前面的“见”,“与”,“”等字符
r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|' # 匹配“投标人”,“磋商”,“谈判”,“供应商”,“应答人”后跟“须知”
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 添加“第X章/部分”部分
)
end_pattern = (
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[:]|'
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[:]|'
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[:]'
)
if file_path.endswith('.pdf'):
text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
# print(text)
else:
raise ValueError("Unsupported file format")
parsed_data = parse_text_by_heading(text)
# result = convert_to_json(input_path, start_word, end_pattern)
# 检查输出文件夹是否存在,如果不存在则创建
if not os.path.exists(output_folder):
os.makedirs(output_folder)
print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
file_name = "clause1.json" if type == 1 else "clause2.json"
# file_name = f"clause{suffix_counter}.json"
output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(parsed_data, f, indent=4, ensure_ascii=False)
print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
return output_path
end_pattern = (
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[:]|'
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[:]|'
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[:]'
)
if file_path.endswith('.pdf'):
text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
# print(text)
else:
raise ValueError("Unsupported file format")
parsed_data = parse_text_by_heading(text)
# result = convert_to_json(input_path, start_word, end_pattern)
# 检查输出文件夹是否存在,如果不存在则创建
if not os.path.exists(output_folder):
os.makedirs(output_folder)
print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
file_name = "clause1.json" if type == 1 else "clause2.json"
# file_name = f"clause{suffix_counter}.json"
output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(parsed_data, f, indent=4, ensure_ascii=False)
print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
return output_path
except Exception as e:
print(f"Error in convert_clause_to_json: {e}")
return ""
if __name__ == "__main__":

View File

@ -195,14 +195,15 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
if __name__ == '__main__':
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
pdf_path=r'C:\Users\Administrator\Downloads\完全没有解析进度1_南网超高压公司2024年第四批服务公开招标项目2024-FW-4-S-ZB2 (1).pdf'
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新W改_evaluation_method.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
# ress = extract_common_header(pdf_path)
# print(ress)
# print("-----------------")
res=extract_text_by_page_fitz(pdf_path)
ress = extract_common_header(pdf_path)
print(ress)
print("-----------------")
extract_text_by_page(pdf_path)
# res=extract_text_by_page_fitz(pdf_path)
# print(res)磋商文件_tobidders_notice_part2.pdf
# save_extracted_text_to_txt(file_path,"output.txt")

View File

@ -41,7 +41,7 @@ def get_deviation(): #提供商务、技术偏离的数据
downloaded_filename = os.path.join(output_folder, filename)
# 下载文件
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
if downloaded_filepath is None or file_type == 4:
if not downloaded_filepath or file_type == 4:
logger.error("下载地址不存在或不支持的文件类型!")
log_error_unique_id(unique_id, 3)
response = create_response(

View File

@ -67,7 +67,7 @@ def download_and_process_file(file_url, zb_type):
mapped_zb_type = 2 if zb_type == 3 else zb_type
downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
if downloaded_filepath is None or file_type == 4:
if not downloaded_filepath or file_type == 4:
return None
logger.info(f"Local file path: {downloaded_filepath}")

View File

@ -59,25 +59,12 @@ def process_and_stream(file_url, zb_type):
start_time = time.time()
try:
downloaded = download_file(file_url, downloaded_filename,True)
if not downloaded:
downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True)
if not downloaded_filepath or file_type == 4:
logger.error("下载文件失败或不支持的文件类型")
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
log_error_unique_id(unique_id, 1)
error_response = create_response(
message='文件处理失败',
status='error',
data=''
)
yield sse_format(error_response)
return
downloaded_filepath, file_type = downloaded
if file_type == 4:
logger.error("不支持的文件类型")
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
error_response = create_response(
message='不支持的文件类型',
message='下载文件失败或不支持的文件类型',
status='error',
data=''
)

View File

@ -288,13 +288,14 @@ def engineering_bid_main(output_folder, file_path, file_type, unique_id):
#TODO:基本信息,判断是否这里,打勾逻辑取消了。
if __name__ == "__main__":
start_time = time.time()
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
file_type = 2 #1:docx 2:pdf 3:其他
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
input_file = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
print("yes")
# for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
# print(output)
preprocess_files(output_folder,input_file,2,"121")
for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
print(output)
# preprocess_files(output_folder,input_file,2,"121")
# engineering_bid_main(output_folder,input_file,2,"111")
end_time = time.time()
elapsed_time = end_time - start_time # 计算耗时
print(f"Function execution took {elapsed_time} seconds.")

View File

@ -310,17 +310,17 @@ if __name__ == "__main__":
unique_id = "uuidzyzy11"
logger = get_global_logger(unique_id)
output_folder = "flask_app/static/output/zytest1"
output_folder = r"C:\Users\Administrator\Desktop\货物标\little_parse"
file_type = 2 # 1:docx 2:pdf 3:其他
input_file = r"C:\Users\Administrator\Desktop\fsdownload\e5c8ca13-6043-49e5-a156-685bc1aabb58\ztbfile.pdf"
input_file = r"C:\Users\Administrator\Desktop\货物标\little_parse\ztbfile.pdf"
start_time = time.time()
# preprocess_files(output_folder, input_file, file_type, logger)
# 创建生成器
generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
# 迭代生成器,逐步获取和处理结果
for output in generator:
print(output)
preprocess_files(output_folder, input_file, file_type, logger)
# # 创建生成器
# generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
# # 迭代生成器,逐步获取和处理结果
# for output in generator:
# print(output)
end_time = time.time()
elapsed_time = end_time - start_time # 计算耗时

View File

@ -311,11 +311,11 @@ if __name__ == "__main__":
logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
selection = 4 # 例如1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
selection = 2 # 例如1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
print(generated_files)