diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 33d2da0..531b5fe 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -20,7 +20,7 @@ def download_file(url, local_filename,enable=False): 2 - .pdf 3 - .doc 4 - 其他 - - None: 下载失败 + - "": 下载失败 """ try: with requests.get(url, stream=True) as response: @@ -72,7 +72,7 @@ def download_file(url, local_filename,enable=False): except Exception as e: print(f"download: 发生错误: {e}") - return None,4 + return "",4 def local_file_2_url(file_path, url): receive_file_url = "" diff --git a/flask_app/general/insert_del_pagemark.py b/flask_app/general/insert_del_pagemark.py index 9cdc67f..3faa449 100644 --- a/flask_app/general/insert_del_pagemark.py +++ b/flask_app/general/insert_del_pagemark.py @@ -67,6 +67,9 @@ def delete_mark(docx_path): 如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落,也一并删除。 修改后的文档保存为 invalid_del.docx,并返回新文件路径。 """ + if not docx_path: + print("Invalid input: docx_path is None or empty.") + return "" try: doc = Document(docx_path) except KeyError as e: diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py index 318b544..bc7936f 100644 --- a/flask_app/general/投标人须知正文条款提取成json文件.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -173,19 +173,24 @@ def parse_text_by_heading(text): data[f"{main_number}."] = temp_title temp_title = None # 重置临时标题 - # 保存阿拉伯数字的标题内容 + last_char = current_content[-1][-1] if current_content and current_content[-1] else None + if current_key is None or (compare_headings(current_key, new_key) and ( - len(current_content) == 0 or current_content[-1][-1] != '第')): + not current_content or last_char != '第')): if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string current_key = new_key current_content = [line_content] - append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(re.search(pattern, line_content) for pattern in special_section_keywords) + append_newline = ( + len(new_key.rstrip('.').split('.')) <= 2 or + any(re.search(pattern, line_content) for pattern in special_section_keywords) + ) last_main_number = new_key.split('.')[0] else: - append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, - in_special_section) + append_newline = handle_content_append( + current_content, line_stripped, append_newline, keywords, in_special_section + ) elif dot_match: if in_double_hash_mode: @@ -440,54 +445,58 @@ def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern def convert_clause_to_json(file_path, output_folder, type=1): - if not os.path.exists(file_path): - print(f"The specified file does not exist: 返回空的clause_path") - return "" - if type == 2: - start_pattern_1 = r'.*(?