From d0dc1425474246eeacbffb5acf7e9a3fab509571 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 18 Feb 2025 21:11:37 +0800 Subject: [PATCH] =?UTF-8?q?2.18=20=E8=A7=A3=E5=86=B3=E6=96=87=E6=A1=A3?= =?UTF-8?q?=E8=BD=AC=E6=8D=A2=E5=A4=B1=E8=B4=A5=E5=B0=B1=E6=8A=A5=E9=94=99?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/format_change.py | 4 +- flask_app/general/insert_del_pagemark.py | 3 + .../投标人须知正文条款提取成json文件.py | 113 ++++++++++-------- flask_app/general/读取文件/按页读取pdf.py | 11 +- flask_app/routes/get_deviation.py | 2 +- flask_app/routes/little_zbparse.py | 2 +- flask_app/routes/upload.py | 21 +--- flask_app/routes/工程标解析main.py | 11 +- flask_app/routes/货物标解析main.py | 16 +-- flask_app/货物标/截取pdf货物标版.py | 6 +- 10 files changed, 95 insertions(+), 94 deletions(-) diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 33d2da0..531b5fe 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -20,7 +20,7 @@ def download_file(url, local_filename,enable=False): 2 - .pdf 3 - .doc 4 - 其他 - - None: 下载失败 + - "": 下载失败 """ try: with requests.get(url, stream=True) as response: @@ -72,7 +72,7 @@ def download_file(url, local_filename,enable=False): except Exception as e: print(f"download: 发生错误: {e}") - return None,4 + return "",4 def local_file_2_url(file_path, url): receive_file_url = "" diff --git a/flask_app/general/insert_del_pagemark.py b/flask_app/general/insert_del_pagemark.py index 9cdc67f..3faa449 100644 --- a/flask_app/general/insert_del_pagemark.py +++ b/flask_app/general/insert_del_pagemark.py @@ -67,6 +67,9 @@ def delete_mark(docx_path): 如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落,也一并删除。 修改后的文档保存为 invalid_del.docx,并返回新文件路径。 """ + if not docx_path: + print("Invalid input: docx_path is None or empty.") + return "" try: doc = Document(docx_path) except KeyError as e: diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py index 318b544..bc7936f 100644 --- a/flask_app/general/投标人须知正文条款提取成json文件.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -173,19 +173,24 @@ def parse_text_by_heading(text): data[f"{main_number}."] = temp_title temp_title = None # 重置临时标题 - # 保存阿拉伯数字的标题内容 + last_char = current_content[-1][-1] if current_content and current_content[-1] else None + if current_key is None or (compare_headings(current_key, new_key) and ( - len(current_content) == 0 or current_content[-1][-1] != '第')): + not current_content or last_char != '第')): if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string current_key = new_key current_content = [line_content] - append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(re.search(pattern, line_content) for pattern in special_section_keywords) + append_newline = ( + len(new_key.rstrip('.').split('.')) <= 2 or + any(re.search(pattern, line_content) for pattern in special_section_keywords) + ) last_main_number = new_key.split('.')[0] else: - append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, - in_special_section) + append_newline = handle_content_append( + current_content, line_stripped, append_newline, keywords, in_special_section + ) elif dot_match: if in_double_hash_mode: @@ -440,54 +445,58 @@ def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern def convert_clause_to_json(file_path, output_folder, type=1): - if not os.path.exists(file_path): - print(f"The specified file does not exist: 返回空的clause_path") - return "" - if type == 2: - start_pattern_1 = r'.*(?