From d0dc1425474246eeacbffb5acf7e9a3fab509571 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Tue, 18 Feb 2025 21:11:37 +0800
Subject: [PATCH] =?UTF-8?q?2.18=20=E8=A7=A3=E5=86=B3=E6=96=87=E6=A1=A3?=
 =?UTF-8?q?=E8=BD=AC=E6=8D=A2=E5=A4=B1=E8=B4=A5=E5=B0=B1=E6=8A=A5=E9=94=99?=
 =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/format_change.py            |   4 +-
 flask_app/general/insert_del_pagemark.py      |   3 +
 .../投标人须知正文条款提取成json文件.py       | 113 ++++++++++--------
 flask_app/general/读取文件/按页读取pdf.py     |  11 +-
 flask_app/routes/get_deviation.py             |   2 +-
 flask_app/routes/little_zbparse.py            |   2 +-
 flask_app/routes/upload.py                    |  21 +---
 flask_app/routes/工程标解析main.py            |  11 +-
 flask_app/routes/货物标解析main.py            |  16 +--
 flask_app/货物标/截取pdf货物标版.py           |   6 +-
 10 files changed, 95 insertions(+), 94 deletions(-)

diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py
index 33d2da0..531b5fe 100644
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@@ -20,7 +20,7 @@ def download_file(url, local_filename,enable=False):
                  2 - .pdf
                  3 - .doc
                  4 - 其他
-    - None: 下载失败
+    - "": 下载失败
     """
     try:
         with requests.get(url, stream=True) as response:
@@ -72,7 +72,7 @@ def download_file(url, local_filename,enable=False):
     except Exception as e:
         print(f"download: 发生错误: {e}")
 
-    return None,4
+    return "",4
 
 def local_file_2_url(file_path, url):
     receive_file_url = ""
diff --git a/flask_app/general/insert_del_pagemark.py b/flask_app/general/insert_del_pagemark.py
index 9cdc67f..3faa449 100644
--- a/flask_app/general/insert_del_pagemark.py
+++ b/flask_app/general/insert_del_pagemark.py
@@ -67,6 +67,9 @@ def delete_mark(docx_path):
     如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落，也一并删除。
     修改后的文档保存为 invalid_del.docx，并返回新文件路径。
     """
+    if not docx_path:
+        print("Invalid input: docx_path is None or empty.")
+        return ""
     try:
         doc = Document(docx_path)
     except KeyError as e:
diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py
index 318b544..bc7936f 100644
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@@ -173,19 +173,24 @@ def parse_text_by_heading(text):
                 data[f"{main_number}."] = temp_title
                 temp_title = None  # 重置临时标题
 
-            # 保存阿拉伯数字的标题内容
+            last_char = current_content[-1][-1] if current_content and current_content[-1] else None
+
             if current_key is None or (compare_headings(current_key, new_key) and (
-                    len(current_content) == 0 or current_content[-1][-1] != '第')):
+                    not current_content or last_char != '第')):
                 if current_key is not None:
                     content_string = ''.join(current_content).strip()
                     data[current_key] = data.get(current_key, '') + content_string
                 current_key = new_key
                 current_content = [line_content]
-                append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(re.search(pattern, line_content) for pattern in special_section_keywords)
+                append_newline = (
+                        len(new_key.rstrip('.').split('.')) <= 2 or
+                        any(re.search(pattern, line_content) for pattern in special_section_keywords)
+                )
                 last_main_number = new_key.split('.')[0]
             else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
-                                                       in_special_section)
+                append_newline = handle_content_append(
+                    current_content, line_stripped, append_newline, keywords, in_special_section
+                )
 
         elif dot_match:
             if in_double_hash_mode:
@@ -440,54 +445,58 @@ def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern
 
 
 def convert_clause_to_json(file_path, output_folder, type=1):
-    if not os.path.exists(file_path):
-        print(f"The specified file does not exist: 返回空的clause_path")
-        return ""
-    if type == 2:
-        start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$'
-        start_pattern_2 = None
-        end_pattern = (
-            r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # 第一部分
-            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前瞻部分
-            r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$'  # 第二部分
-        )
-    else:
-        start_pattern_1 = (
-            r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|'  # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
-            r'[一二12][、．.]+|'  # 匹配包含“一、二、12”以及逗号、点号等的部分
-            r'[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # 匹配“说明、总则、名词解释”这类关键词
-        )
-        start_pattern_2 = (
-            r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前面的“见”，“与”，“”等字符
-            r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # 匹配“投标人”，“磋商”，“谈判”，“供应商”，“应答人”后跟“须知”
-            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # 添加“第X章/部分”部分
-        )
+    try:
+        if not os.path.exists(file_path):
+            print(f"The specified file does not exist: 返回空的clause_path")
+            return ""
+        if type == 2:
+            start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$'
+            start_pattern_2 = None
+            end_pattern = (
+                r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # 第一部分
+                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # negative lookbehinds (not lookaheads): reject a match preceded by 对应/根据/按照/见/同/与 or a quotation mark
+                r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$'  # 第二部分
+            )
+        else:
+            start_pattern_1 = (
+                r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|'  # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
+                r'[一二12][、．.]+|'  # 匹配包含“一、二、12”以及逗号、点号等的部分
+                r'[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # 匹配“说明、总则、名词解释”这类关键词
+            )
+            start_pattern_2 = (
+                r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前面的“见”，“与”，“”等字符
+                r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # 匹配“投标人”，“磋商”，“谈判”，“供应商”，“应答人”后跟“须知”
+                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # 添加“第X章/部分”部分
+            )
 
-        end_pattern = (
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
-            r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$|'
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[：:]|'
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[：:]|'
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[：:]'
-        )
-    if file_path.endswith('.pdf'):
-        text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
-        # print(text)
-    else:
-        raise ValueError("Unsupported file format")
-    parsed_data = parse_text_by_heading(text)
-    # result = convert_to_json(input_path, start_word, end_pattern)
-    # 检查输出文件夹是否存在，如果不存在则创建
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
-        print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
-    file_name = "clause1.json" if type == 1 else "clause2.json"
-    # file_name = f"clause{suffix_counter}.json"
-    output_path = os.path.join(output_folder, file_name)
-    with open(output_path, 'w', encoding='utf-8') as f:
-        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
-    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
-    return output_path
+            end_pattern = (
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
+                r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$|'
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[：:]|'
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[：:]|'
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[：:]'
+            )
+        if file_path.endswith('.pdf'):
+            text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
+            # print(text)
+        else:
+            raise ValueError("Unsupported file format")
+        parsed_data = parse_text_by_heading(text)
+        # result = convert_to_json(input_path, start_word, end_pattern)
+        # 检查输出文件夹是否存在，如果不存在则创建
+        if not os.path.exists(output_folder):
+            os.makedirs(output_folder)
+            print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
+        file_name = "clause1.json" if type == 1 else "clause2.json"
+        # file_name = f"clause{suffix_counter}.json"
+        output_path = os.path.join(output_folder, file_name)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(parsed_data, f, indent=4, ensure_ascii=False)
+        print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
+        return output_path
+    except Exception as e:
+        print(f"Error in convert_clause_to_json: {e}")
+        return ""
 
 
 if __name__ == "__main__":
diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py
index 2076306..e44a222 100644
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@@ -195,14 +195,15 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
 
 if __name__ == '__main__':
     # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    pdf_path=r'C:\Users\Administrator\Downloads\（完全没有解析进度）1_南网超高压公司2024年第四批服务公开招标项目（2024-FW-4-S-ZB2） (1).pdf'
+    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新（W改）_evaluation_method.pdf"
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
     # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
     # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
-    # ress = extract_common_header(pdf_path)
-    # print(ress)
-    # print("-----------------")
-    res=extract_text_by_page_fitz(pdf_path)
+    ress = extract_common_header(pdf_path)
+    print(ress)
+    print("-----------------")
+    extract_text_by_page(pdf_path)
+    # res=extract_text_by_page_fitz(pdf_path)
     # print(res)磋商文件_tobidders_notice_part2.pdf
     # save_extracted_text_to_txt(file_path,"output.txt")
diff --git a/flask_app/routes/get_deviation.py b/flask_app/routes/get_deviation.py
index bb46ca2..faef76a 100644
--- a/flask_app/routes/get_deviation.py
+++ b/flask_app/routes/get_deviation.py
@@ -41,7 +41,7 @@ def get_deviation():                  #提供商务、技术偏离的数据
             downloaded_filename = os.path.join(output_folder, filename)
             # 下载文件
             downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
-            if downloaded_filepath is None or file_type == 4:
+            if not downloaded_filepath or file_type == 4:
                 logger.error("下载地址不存在或不支持的文件类型！")
                 log_error_unique_id(unique_id, 3)
                 response = create_response(
diff --git a/flask_app/routes/little_zbparse.py b/flask_app/routes/little_zbparse.py
index 521a260..8645c0f 100644
--- a/flask_app/routes/little_zbparse.py
+++ b/flask_app/routes/little_zbparse.py
@@ -67,7 +67,7 @@ def download_and_process_file(file_url, zb_type):
     mapped_zb_type = 2 if zb_type == 3 else zb_type
     downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)
 
-    if downloaded_filepath is None or file_type == 4:
+    if not downloaded_filepath or file_type == 4:
         return None
 
     logger.info(f"Local file path: {downloaded_filepath}")
diff --git a/flask_app/routes/upload.py b/flask_app/routes/upload.py
index cf08d46..4fa4b05 100644
--- a/flask_app/routes/upload.py
+++ b/flask_app/routes/upload.py
@@ -59,25 +59,12 @@ def process_and_stream(file_url, zb_type):
     start_time = time.time()
 
     try:
-        downloaded = download_file(file_url, downloaded_filename,True)
-        if not downloaded:
+        downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True)
+        if not downloaded_filepath or file_type == 4:
             logger.error("下载文件失败或不支持的文件类型")
-            log_error_unique_id(unique_id,1)  # 记录失败的 unique_id
+            log_error_unique_id(unique_id, 1)
             error_response = create_response(
-                message='文件处理失败',
-                status='error',
-                data=''
-            )
-            yield sse_format(error_response)
-            return
-
-        downloaded_filepath, file_type = downloaded
-
-        if file_type == 4:
-            logger.error("不支持的文件类型")
-            log_error_unique_id(unique_id,1)  # 记录失败的 unique_id
-            error_response = create_response(
-                message='不支持的文件类型',
+                message='下载文件失败或不支持的文件类型',
                 status='error',
                 data=''
             )
diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py
index 5174b71..27771e7 100644
--- a/flask_app/routes/工程标解析main.py
+++ b/flask_app/routes/工程标解析main.py
@@ -288,13 +288,14 @@ def engineering_bid_main(output_folder, file_path, file_type, unique_id):
 #TODO:基本信息，判断是否这里，打勾逻辑取消了。
 if __name__ == "__main__":
     start_time = time.time()
-    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
+    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
     file_type = 2        #1:docx 2:pdf 3:其他
-    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
+    input_file = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
     print("yes")
-    # for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
-    #     print(output)
-    preprocess_files(output_folder,input_file,2,"121")
+    for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
+        print(output)
+    # preprocess_files(output_folder,input_file,2,"121")
+    # engineering_bid_main(output_folder,input_file,2,"111")
     end_time = time.time()
     elapsed_time = end_time - start_time  # 计算耗时
     print(f"Function execution took {elapsed_time} seconds.")
diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py
index a33b6d0..4fa4f44 100644
--- a/flask_app/routes/货物标解析main.py
+++ b/flask_app/routes/货物标解析main.py
@@ -310,17 +310,17 @@ if __name__ == "__main__":
     unique_id = "uuidzyzy11"
     logger = get_global_logger(unique_id)
 
-    output_folder = "flask_app/static/output/zytest1"
+    output_folder = r"C:\Users\Administrator\Desktop\货物标\little_parse"
     file_type = 2  # 1:docx 2:pdf 3:其他
-    input_file = r"C:\Users\Administrator\Desktop\fsdownload\e5c8ca13-6043-49e5-a156-685bc1aabb58\ztbfile.pdf"
+    input_file = r"C:\Users\Administrator\Desktop\货物标\little_parse\ztbfile.pdf"
     start_time = time.time()
 
-    # preprocess_files(output_folder, input_file, file_type, logger)
-    # 创建生成器
-    generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
-    # 迭代生成器，逐步获取和处理结果
-    for output in generator:
-        print(output)
+    preprocess_files(output_folder, input_file, file_type, logger)
+    # # 创建生成器
+    # generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
+    # # 迭代生成器，逐步获取和处理结果
+    # for output in generator:
+    #     print(output)
 
     end_time = time.time()
     elapsed_time = end_time - start_time  # 计算耗时
diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py
index 0e08b9f..87e33df 100644
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -311,11 +311,11 @@ if __name__ == "__main__":
     logger = get_global_logger("123")
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
     # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
     # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"
+    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
     # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 4  # 例如：1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 procurement  6-invalid
+    selection = 2  # 例如：1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 procurement  6-invalid
     generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
     print(generated_files)