2.18 解决文档转换失败就报错的问题

2025-02-18 21:11:37 +08:00 · 2025-02-18 21:11:37 +08:00 · d0dc142547
commit d0dc142547
parent d4a9d4edae
10 changed files with 95 additions and 94 deletions
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@@ -20,7 +20,7 @@ def download_file(url, local_filename,enable=False):
                 2 - .pdf
                 3 - .doc
                 4 - 其他
-    - None: 下载失败
+    - "": 下载失败
    """
    try:
        with requests.get(url, stream=True) as response:
@@ -72,7 +72,7 @@ def download_file(url, local_filename,enable=False):
    except Exception as e:
        print(f"download: 发生错误: {e}")

-    return None,4
+    return "",4

 def local_file_2_url(file_path, url):
    receive_file_url = ""
--- a/flask_app/general/insert_del_pagemark.py
+++ b/flask_app/general/insert_del_pagemark.py
@@ -67,6 +67,9 @@ def delete_mark(docx_path):
    如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落，也一并删除。
    修改后的文档保存为 invalid_del.docx，并返回新文件路径。
    """
+    if not docx_path:
+        print("Invalid input: docx_path is None or empty.")
+        return ""
    try:
        doc = Document(docx_path)
    except KeyError as e:
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@@ -173,19 +173,24 @@ def parse_text_by_heading(text):
                data[f"{main_number}."] = temp_title
                temp_title = None  # 重置临时标题

-            # 保存阿拉伯数字的标题内容
+            last_char = current_content[-1][-1] if current_content and current_content[-1] else None
+
            if current_key is None or (compare_headings(current_key, new_key) and (
-                    len(current_content) == 0 or current_content[-1][-1] != '第')):
+                    not current_content or last_char != '第')):
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string
                current_key = new_key
                current_content = [line_content]
-                append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(re.search(pattern, line_content) for pattern in special_section_keywords)
+                append_newline = (
+                        len(new_key.rstrip('.').split('.')) <= 2 or
+                        any(re.search(pattern, line_content) for pattern in special_section_keywords)
+                )
                last_main_number = new_key.split('.')[0]
            else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
-                                                       in_special_section)
+                append_newline = handle_content_append(
+                    current_content, line_stripped, append_newline, keywords, in_special_section
+                )

        elif dot_match:
            if in_double_hash_mode:
@@ -440,54 +445,58 @@ def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern


 def convert_clause_to_json(file_path, output_folder, type=1):
-    if not os.path.exists(file_path):
-        print(f"The specified file does not exist: 返回空的clause_path")
-        return ""
-    if type == 2:
-        start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$'
-        start_pattern_2 = None
-        end_pattern = (
-            r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # 第一部分
-            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前瞻部分
-            r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$'  # 第二部分
-        )
-    else:
-        start_pattern_1 = (
-            r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|'  # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
-            r'[一二12][、．.]+|'  # 匹配包含“一、二、12”以及逗号、点号等的部分
-            r'[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # 匹配“说明、总则、名词解释”这类关键词
-        )
-        start_pattern_2 = (
-            r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前面的“见”，“与”，“”等字符
-            r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # 匹配“投标人”，“磋商”，“谈判”，“供应商”，“应答人”后跟“须知”
-            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # 添加“第X章/部分”部分
-        )
+    try:
+        if not os.path.exists(file_path):
+            print(f"The specified file does not exist: 返回空的clause_path")
+            return ""
+        if type == 2:
+            start_pattern_1 = r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$'
+            start_pattern_2 = None
+            end_pattern = (
+                r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # 第一部分
+                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前瞻部分
+                r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$'  # 第二部分
+            )
+        else:
+            start_pattern_1 = (
+                r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|'  # 匹配开头为圆括号或者中文括号并含有数字和逗号、句号等的部分
+                r'[一二12][、．.]+|'  # 匹配包含“一、二、12”以及逗号、点号等的部分
+                r'[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'  # 匹配“说明、总则、名词解释”这类关键词
+            )
+            start_pattern_2 = (
+                r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前面的“见”，“与”，“”等字符
+                r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|'  # 匹配“投标人”，“磋商”，“谈判”，“供应商”，“应答人”后跟“须知”
+                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)'  # 添加“第X章/部分”部分
+            )

-        end_pattern = (
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
-            r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$|'
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[：:]|'
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[：:]|'
-            r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[：:]'
-        )
-    if file_path.endswith('.pdf'):
-        text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
-        # print(text)
-    else:
-        raise ValueError("Unsupported file format")
-    parsed_data = parse_text_by_heading(text)
-    # result = convert_to_json(input_path, start_word, end_pattern)
-    # 检查输出文件夹是否存在，如果不存在则创建
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
-        print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
-    file_name = "clause1.json" if type == 1 else "clause2.json"
-    # file_name = f"clause{suffix_counter}.json"
-    output_path = os.path.join(output_folder, file_name)
-    with open(output_path, 'w', encoding='utf-8') as f:
-        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
-    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
-    return output_path
+            end_pattern = (
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
+                r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$|'
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附录(?:一)?[：:]|'
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附件(?:一)?[：:]|'
+                r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)附表(?:一)?[：:]'
+            )
+        if file_path.endswith('.pdf'):
+            text = extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern_2)
+            # print(text)
+        else:
+            raise ValueError("Unsupported file format")
+        parsed_data = parse_text_by_heading(text)
+        # result = convert_to_json(input_path, start_word, end_pattern)
+        # 检查输出文件夹是否存在，如果不存在则创建
+        if not os.path.exists(output_folder):
+            os.makedirs(output_folder)
+            print(f"convert_clause_to_json:创建输出文件夹: {output_folder}")
+        file_name = "clause1.json" if type == 1 else "clause2.json"
+        # file_name = f"clause{suffix_counter}.json"
+        output_path = os.path.join(output_folder, file_name)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(parsed_data, f, indent=4, ensure_ascii=False)
+        print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
+        return output_path
+    except Exception as e:
+        print(f"Error in convert_clause_to_json: {e}")
+        return ""


 if __name__ == "__main__":
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@@ -195,14 +195,15 @@ def save_extracted_text_to_txt(pdf_path, txt_path):

 if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    pdf_path=r'C:\Users\Administrator\Downloads\（完全没有解析进度）1_南网超高压公司2024年第四批服务公开招标项目（2024-FW-4-S-ZB2） (1).pdf'
+    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新（W改）_evaluation_method.pdf"
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
-    # ress = extract_common_header(pdf_path)
-    # print(ress)
-    # print("-----------------")
-    res=extract_text_by_page_fitz(pdf_path)
+    ress = extract_common_header(pdf_path)
+    print(ress)
+    print("-----------------")
+    extract_text_by_page(pdf_path)
+    # res=extract_text_by_page_fitz(pdf_path)
    # print(res)磋商文件_tobidders_notice_part2.pdf
    # save_extracted_text_to_txt(file_path,"output.txt")
--- a/flask_app/routes/get_deviation.py
+++ b/flask_app/routes/get_deviation.py
@@ -41,7 +41,7 @@ def get_deviation():                  #提供商务、技术偏离的数据
            downloaded_filename = os.path.join(output_folder, filename)
            # 下载文件
            downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
-            if downloaded_filepath is None or file_type == 4:
+            if not downloaded_filepath or file_type == 4:
                logger.error("下载地址不存在或不支持的文件类型！")
                log_error_unique_id(unique_id, 3)
                response = create_response(
--- a/flask_app/routes/little_zbparse.py
+++ b/flask_app/routes/little_zbparse.py
@@ -67,7 +67,7 @@ def download_and_process_file(file_url, zb_type):
    mapped_zb_type = 2 if zb_type == 3 else zb_type
    downloaded_filepath, file_type = download_file(file_url, downloaded_filename,True)

-    if downloaded_filepath is None or file_type == 4:
+    if not downloaded_filepath or file_type == 4:
        return None

    logger.info(f"Local file path: {downloaded_filepath}")
--- a/flask_app/routes/upload.py
+++ b/flask_app/routes/upload.py
@@ -59,25 +59,12 @@ def process_and_stream(file_url, zb_type):
    start_time = time.time()

    try:
-        downloaded = download_file(file_url, downloaded_filename,True)
-        if not downloaded:
+        downloaded_filepath, file_type = download_file(file_url, downloaded_filename, True)
+        if not downloaded_filepath or file_type == 4:
            logger.error("下载文件失败或不支持的文件类型")
-            log_error_unique_id(unique_id,1)  # 记录失败的 unique_id
+            log_error_unique_id(unique_id, 1)
            error_response = create_response(
-                message='文件处理失败',
-                status='error',
-                data=''
-            )
-            yield sse_format(error_response)
-            return
-
-        downloaded_filepath, file_type = downloaded
-
-        if file_type == 4:
-            logger.error("不支持的文件类型")
-            log_error_unique_id(unique_id,1)  # 记录失败的 unique_id
-            error_response = create_response(
-                message='不支持的文件类型',
+                message='下载文件失败或不支持的文件类型',
                status='error',
                data=''
            )
--- a/flask_app/routes/工程标解析main.py
+++ b/flask_app/routes/工程标解析main.py
@@ -288,13 +288,14 @@ def engineering_bid_main(output_folder, file_path, file_type, unique_id):
 #TODO:基本信息，判断是否这里，打勾逻辑取消了。
 if __name__ == "__main__":
    start_time = time.time()
-    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
+    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
    file_type = 2        #1:docx 2:pdf 3:其他
-    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
+    input_file = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
    print("yes")
-    # for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
-    #     print(output)
-    preprocess_files(output_folder,input_file,2,"121")
+    for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
+        print(output)
+    # preprocess_files(output_folder,input_file,2,"121")
+    # engineering_bid_main(output_folder,input_file,2,"111")
    end_time = time.time()
    elapsed_time = end_time - start_time  # 计算耗时
    print(f"Function execution took {elapsed_time} seconds.")
--- a/flask_app/routes/货物标解析main.py
+++ b/flask_app/routes/货物标解析main.py
@@ -310,17 +310,17 @@ if __name__ == "__main__":
    unique_id = "uuidzyzy11"
    logger = get_global_logger(unique_id)

-    output_folder = "flask_app/static/output/zytest1"
+    output_folder = r"C:\Users\Administrator\Desktop\货物标\little_parse"
    file_type = 2  # 1:docx 2:pdf 3:其他
-    input_file = r"C:\Users\Administrator\Desktop\fsdownload\e5c8ca13-6043-49e5-a156-685bc1aabb58\ztbfile.pdf"
+    input_file = r"C:\Users\Administrator\Desktop\货物标\little_parse\ztbfile.pdf"
    start_time = time.time()

-    # preprocess_files(output_folder, input_file, file_type, logger)
-    # 创建生成器
-    generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
-    # 迭代生成器，逐步获取和处理结果
-    for output in generator:
-        print(output)
+    preprocess_files(output_folder, input_file, file_type, logger)
+    # # 创建生成器
+    # generator = goods_bid_main(output_folder, input_file, file_type, unique_id)
+    # # 迭代生成器，逐步获取和处理结果
+    # for output in generator:
+    #     print(output)

    end_time = time.time()
    elapsed_time = end_time - start_time  # 计算耗时
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -311,11 +311,11 @@ if __name__ == "__main__":
    logger = get_global_logger("123")
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f\zytest.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"
+    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\bb2747ad-5578-4b3b-b60f-3a2d1b672b6f"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 4  # 例如：1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 procurement  6-invalid
+    selection = 2  # 例如：1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 procurement  6-invalid
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
    print(generated_files)