2.15 应该是PyPDF2库的问题

This commit is contained in:
zy123 2025-02-15 22:30:15 +08:00
parent 110d1767f4
commit aea934cc31
7 changed files with 102 additions and 96 deletions

View File

@ -205,7 +205,7 @@ def get_pdf_page_count(file_path):
if __name__ == '__main__':
# 替换为你的文件路径和API URL
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
local_path_in = r"C:\Users\Administrator\Downloads\基础公司枞阳经开区新能源汽车零部件产业园基础设施建设项目二期桩基工程劳务分包.doc"
local_path_in = r"C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\tmp\invalid_added.pdf"
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
@ -213,8 +213,8 @@ if __name__ == '__main__':
# intermediate_docx = pdf2docx(local_path_in)
# if intermediate_docx:
# normal_pdf = docx2pdf(intermediate_docx, force=True)
# # downloaded_file=pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in)
downloaded_file=pdf2docx(local_path_in)
# downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"

View File

@ -108,9 +108,7 @@ if __name__ == '__main__':
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
# input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
# output=insert_mark(input)
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\invalid_added.docx'
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\invalid_added.docx'
res=delete_mark(doc_path)
if res:
print(res)
else:
print("No")
if not res:
print("yes")

View File

@ -1,14 +1,13 @@
import concurrent.futures
import os
import time
from memory_profiler import profile
from flask_app.general.merge_pdfs import merge_selected_pdfs
from flask_app.general.截取pdf通用函数 import check_pdf_pages
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selections=None):
"""
统一的 PDF 截取和合并函数支持 'goods' 'engineering' 两种模式

View File

@ -303,7 +303,7 @@ def is_pure_image(docx_path, percentage=0.3):
return True
if __name__ == '__main__':
pdf_path=r"C:\Users\Administrator\Desktop\ztbfile.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\ztbfile.pdf"
res=is_scanned_pdf(pdf_path)
if res:
print("扫描型")

View File

@ -136,56 +136,56 @@ def process_and_stream(file_url, zb_type):
base_end_time = time.time()
logger.info(f"分段解析完成,耗时:{base_end_time - start_time:.2f}")
# #此时前端已完整接收到解析的所有内容后面的内容与前端展示无关主要是后处理1.extracted_result关键信息存储 2.技术偏离表 3.商务偏离表 4.投标人需提交的证明材料(目前后端存储了,前端还未展示)
# #后处理开始!!!
# output_json_path = os.path.join(output_folder, 'final_result.json')
# extracted_info_path = os.path.join(output_folder, 'extracted_result.json')
# includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
# final_result, extracted_info, tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation, proof_materials = outer_post_processing(
# combined_data, includes, good_list) #后处理 生成 extracted_info、商务 技术偏离数据 以及证明材料返给后端
#
# #后处理完毕!后面都是生成响应返回,不额外修改数据
# tech_deviation_response, tech_deviation_star_response, zigefuhe_deviation_response, shangwu_deviation_response, shangwu_star_deviation_response, proof_materials_response = generate_deviation_response(
# tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation,
# proof_materials, logger) #生成规范的响应
#
# # 使用通用响应函数
# yield sse_format(tech_deviation_response)
# yield sse_format(tech_deviation_star_response)
# yield sse_format(zigefuhe_deviation_response)
# yield sse_format(shangwu_deviation_response)
# yield sse_format(shangwu_star_deviation_response)
# yield sse_format(proof_materials_response)
#
# try:
# with open(extracted_info_path, 'w', encoding='utf-8') as json_file:
# json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)
# logger.info(f"摘取后的数据已保存到 '{extracted_info_path}'")
# except IOError as e:
# logger.error(f"保存JSON文件时出错: {e}")
# log_error_unique_id(unique_id,1) # 记录失败的 unique_id
#
# try:
# with open(output_json_path, 'w', encoding='utf-8') as json_file:
# json.dump(final_result, json_file, ensure_ascii=False, indent=4)
# logger.info(f"合并后的数据已保存到 '{output_json_path}'")
# except IOError as e:
# logger.error(f"保存JSON文件时出错: {e}")
# log_error_unique_id(unique_id,1) # 记录失败的 unique_id
#
# extracted_info_response = create_response(
# message='extracted_info',
# status='success',
# data=json.dumps(extracted_info, ensure_ascii=False)
# )
# yield sse_format(extracted_info_response)
#
# complete_response = create_response(
# message='Combined_data',
# status='success',
# data=json.dumps(final_result, ensure_ascii=False)
# )
# yield sse_format(complete_response)
#此时前端已完整接收到解析的所有内容后面的内容与前端展示无关主要是后处理1.extracted_result关键信息存储 2.技术偏离表 3.商务偏离表 4.投标人需提交的证明材料(目前后端存储了,前端还未展示)
#后处理开始!!!
output_json_path = os.path.join(output_folder, 'final_result.json')
extracted_info_path = os.path.join(output_folder, 'extracted_result.json')
includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
final_result, extracted_info, tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation, proof_materials = outer_post_processing(
combined_data, includes, good_list) #后处理 生成 extracted_info、商务 技术偏离数据 以及证明材料返给后端
#后处理完毕!后面都是生成响应返回,不额外修改数据
tech_deviation_response, tech_deviation_star_response, zigefuhe_deviation_response, shangwu_deviation_response, shangwu_star_deviation_response, proof_materials_response = generate_deviation_response(
tech_deviation, tech_star_deviation, business_deviation, business_star_deviation, zigefuhe_deviation,
proof_materials, logger) #生成规范的响应
# 使用通用响应函数
yield sse_format(tech_deviation_response)
yield sse_format(tech_deviation_star_response)
yield sse_format(zigefuhe_deviation_response)
yield sse_format(shangwu_deviation_response)
yield sse_format(shangwu_star_deviation_response)
yield sse_format(proof_materials_response)
try:
with open(extracted_info_path, 'w', encoding='utf-8') as json_file:
json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)
logger.info(f"摘取后的数据已保存到 '{extracted_info_path}'")
except IOError as e:
logger.error(f"保存JSON文件时出错: {e}")
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
try:
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(final_result, json_file, ensure_ascii=False, indent=4)
logger.info(f"合并后的数据已保存到 '{output_json_path}'")
except IOError as e:
logger.error(f"保存JSON文件时出错: {e}")
log_error_unique_id(unique_id,1) # 记录失败的 unique_id
extracted_info_response = create_response(
message='extracted_info',
status='success',
data=json.dumps(extracted_info, ensure_ascii=False)
)
yield sse_format(extracted_info_response)
complete_response = create_response(
message='Combined_data',
status='success',
data=json.dumps(final_result, ensure_ascii=False)
)
yield sse_format(complete_response)
final_response = create_response( #目前后端的逻辑是读取到'data'中有个'END',就终止连接
message='文件上传并处理成功',

View File

@ -89,6 +89,11 @@ def preprocess_files(output_folder, file_path, file_type,logger):
else:
invalid_deleted_docx = file_path
invalid_added_docx = ''
try:
# 尝试加载 .docx 文件
doc = Document(invalid_deleted_docx)
except Exception as e:
invalid_deleted_docx=pdf_path if pdf_path!="" else file_path #文档转换还是失败!最后的手段保证文件可用!
end_time=time.time()
logger.info(f"文件预处理总耗时:{end_time - start_time:.2f}")

View File

@ -43,7 +43,6 @@ def preprocess_files(output_folder, file_path, file_type,logger):
truncate_files=['','','','','','',file_path,''] #纯图片,无需切片
# print("切割出的文件:"+str(truncate_files))
# 处理各个部分
# invalid_docpath = invalid_added_docx # docx截取无效标部分
procurement_path = truncate_files[5] # 采购需求
evaluation_method_path = truncate_files[1] # 评标办法
qualification_path = truncate_files[2] # 资格审查
@ -55,25 +54,30 @@ def preprocess_files(output_folder, file_path, file_type,logger):
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
truncate_endtime = time.time()
logger.info(f"文件切分CPU耗时{truncate_endtime - start_time:.2f}")
# if not is_pure_image_flag:
# invalid_added_pdf = insert_mark(invalid_path)
# change_start=time.time()
# invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path用于废标项提取使用正则。
# change_end=time.time()
# logger.info(f"文档转换p2d耗时{change_end - change_start:.2f} 秒")
# try:
# # 尝试加载 .docx 文件
# doc = Document(invalid_added_docx)
# # print("yes")
# except Exception as e:
# # 捕获异常并打印错误信息
# invalid_added_docx=pdf2docx(invalid_path)
# invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
# if not invalid_deleted_docx:
# invalid_deleted_docx=pdf2docx(invalid_path)
# else: #主要是节约了pdf2docx的一块钱
invalid_deleted_docx=file_path
invalid_added_docx='' #由于传入的docx是纯图片型正则是提取不到的需要调用大模型。
if not is_pure_image_flag:
invalid_added_pdf = insert_mark(invalid_path)
change_start=time.time()
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path用于废标项提取使用正则。
change_end=time.time()
logger.info(f"文档转换p2d耗时{change_end - change_start:.2f}")
try:
# 尝试加载 .docx 文件
doc = Document(invalid_added_docx)
# print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
else: #主要是节约了pdf2docx的一块钱
invalid_deleted_docx=file_path
invalid_added_docx='' #由于传入的docx是纯图片型正则是提取不到的需要调用大模型。
try:
# 尝试加载 .docx 文件
doc = Document(invalid_deleted_docx)
except Exception as e:
invalid_deleted_docx=pdf_path if pdf_path!="" else file_path #文档转换还是失败!最后的手段保证文件可用!
end_time = time.time()
logger.info(f"文件预处理总耗时:{end_time - start_time:.2f}")
@ -231,21 +235,21 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
with concurrent.futures.ThreadPoolExecutor() as executor:
# 立即启动不依赖 knowledge_name 和 index 的任务
futures = {
# 'evaluation_standards': executor.submit(fetch_evaluation_standards,processed_data['invalid_deleted_docx'], #技术评分 商务评分
# processed_data['evaluation_method_path'],logger),
#
# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_added_docx'], #无效标与废标项
# output_folder,logger),
'evaluation_standards': executor.submit(fetch_evaluation_standards,processed_data['invalid_deleted_docx'], #技术评分 商务评分
processed_data['evaluation_method_path'],logger),
# 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
# processed_data['clause_path'],logger), #投标文件要求
#
# 'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
# processed_data['clause_path'],logger), #开评定标流程
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_added_docx'], #无效标与废标项
output_folder,logger),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
processed_data['clause_path'],logger), #投标文件要求
'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],
processed_data['clause_path'],logger), #开评定标流程
'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'], #基础信息
processed_data['procurement_path'],processed_data['clause_path'],logger),
# 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'], #基础信息
# processed_data['procurement_path'],processed_data['clause_path'],logger),
#
'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_deleted_docx'], #资格审查
processed_data['qualification_path'],
processed_data['notice_path'],logger),
@ -280,8 +284,8 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
}
yield json.dumps(default_evaluation, ensure_ascii=False)
# yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
# if collected_good_list is not None:
# yield json.dumps({'good_list': transform_json_values(collected_good_list)}, ensure_ascii=False)
if collected_good_list is not None:
yield json.dumps({'good_list': transform_json_values(collected_good_list)}, ensure_ascii=False)
#TODO:小解析考虑提速1直接pdf转文本再切分。后期考虑。