# zbparse/flask_app/货物标/提取采购需求main.py

import concurrent.futures
import json
import time
from  flask_app.general.format_change import get_pdf_page_count
from flask_app.general.doubao import pdf2txt
from flask_app.general.file2markdown import convert_file_to_markdown
from flask_app.general.format_change import pdf2docx
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements


#TODO:目前若截取不到采购需求，默认上传整份procurement_path=invalid_path,再转md格式，后续可以判断，这种可以就转docx，不再转md了
def fetch_procurement_reqs(procurement_path, invalid_path):
    """Collect the procurement requirements of a tender document.

    Args:
        procurement_path: Path of the procurement-requirements file. The
            original author notes it may be a PDF or a DOCX. A falsy value
            (empty string, ``None``) short-circuits the whole function.
        invalid_path: Path that is handed to ``get_technical_requirements``.
            When it equals ``procurement_path`` the page count of the file
            decides whether a markdown conversion is attempted.
            NOTE(review): presumably this is the full tender file used when
            no procurement section could be cut out -- confirm with callers.

    Returns:
        A dict whose ``"采购需求"`` entry comes from the technical extractor
        and whose remaining entries are whatever the business extractor
        returned (merged in with ``dict.update``). If ``procurement_path`` is
        falsy, or if any exception is raised during processing, a fresh dict
        with the five default keys and empty values is returned instead; in
        the exception case the error is also printed to stdout.
    """
    # Fallback structure; always handed out as a copy.
    empty_reqs = {
        "采购需求": {},
        "技术要求": "",
        "商务要求": "",
        "服务要求": "",
        "其他要求": ""
    }

    # Nothing to analyse: return the empty structure right away.
    if not procurement_path:
        return dict(empty_reqs)

    try:
        # The page count is only read when both paths are identical; files
        # with more than 80 pages skip the markdown conversion.
        skip_markdown = (
            procurement_path == invalid_path
            and get_pdf_page_count(procurement_path) > 80
        )
        if skip_markdown:
            model_type, processed_filepath = 0, ""
        else:
            model_type, processed_filepath = 1, convert_file_to_markdown(procurement_path)

        # Run both extractors concurrently on a thread pool.
        with concurrent.futures.ThreadPoolExecutor() as pool:
            technical_job = pool.submit(
                get_technical_requirements, invalid_path, processed_filepath, model_type
            )
            time.sleep(0.5)  # stagger the second submission, as the original does
            business_job = pool.submit(
                get_business_requirements, procurement_path, processed_filepath, model_type
            )
            technical_result = technical_job.result()
            business_result = business_job.result()

        # Start from the technical "采购需求" entry, then merge the business
        # result on top. Default keys are NOT back-filled on this path.
        merged = {"采购需求": technical_result.get("采购需求", {})}
        merged.update(business_result)
        return merged

    except Exception as exc:
        print(f"Error in fetch_procurement_reqs: {exc}")
        # Any failure degrades to the empty default structure.
        return dict(empty_reqs)


# TODO:技术要求可以在技术参数之后执行，把完整的技术参数输入，问大模型，除了上述内容还有哪些，这样的话把技术标和其他的区分开。

if __name__ == "__main__":
    # Manual smoke run against sample files on a developer machine: extracts
    # the requirements, pretty-prints them as JSON and reports the elapsed
    # wall-clock time.
    started_at = time.time()

    # output_folder and procurement_docpath are not used below; they are kept
    # as reference values from earlier experiments.
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_procurement.pdf"
    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"

    procurement_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf"
    invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"

    result = fetch_procurement_reqs(procurement_path, invalid_path)
    rendered = json.dumps(result, ensure_ascii=False, indent=4)
    print(rendered)

    elapsed = time.time() - started_at
    print(f"耗时:{elapsed}")
-.23测试分段

											
										
										
											2024-09-23 15:49:30 +08:00
+								import concurrent.futures
-.13

											
										
										
											2024-09-13 15:03:55 +08:00
+								import json
-.23测试分段

											
										
										
											2024-09-23 15:49:30 +08:00
+								import time
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:39:35 +08:00
+								from  flask_app.general.format_change import get_pdf_page_count
-.23 修改商务要求bug

											
										
										
											2024-11-23 15:38:52 +08:00
+								from flask_app.general.doubao import pdf2txt
-.17 无效投标、废标

											
										
										
											2024-12-17 14:47:19 +08:00
+								from flask_app.general.file2markdown import convert_file_to_markdown
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								from flask_app.general.format_change import pdf2docx
-.25切分改为多线程

											
										
										
											2024-10-25 17:50:20 +08:00
+								from flask_app.货物标.技术参数要求提取 import get_technical_requirements
-.22代码结构优化

											
										
										
											2024-10-22 10:06:22 +08:00
+								from flask_app.general.通义千问long import upload_file
-.15货物标完整版

											
										
										
											2024-10-15 21:03:02 +08:00
+								from flask_app.货物标.商务服务其他要求提取 import get_business_requirements
-.22代码结构优化

											
										
										
											2024-10-22 10:06:22 +08:00
-.18 截取pdf

											
										
										
											2024-12-19 12:18:50 +08:00
 								#TODO:目前若截取不到采购需求，默认上传整份procurement_path=invalid_path,再转md格式，后续可以判断，这种可以就转docx，不再转md了
-.16

											
										
										
											2024-11-16 16:14:53 +08:00
+								def fetch_procurement_reqs(procurement_path, invalid_path):
-.17 无效投标、废标

											
										
										
											2024-12-17 15:42:20 +08:00
+								    #procurement_path可能是pdf\docx
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								    # 定义默认的 procurement_reqs 字典
 								    DEFAULT_PROCUREMENT_REQS = {
-.18 截取pdf

											
										
										
											2024-12-19 12:18:50 +08:00
+								        "采购需求": {},
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								        "技术要求": "",
 								        "商务要求": "",
 								        "服务要求": "",
 								        "其他要求": ""
 								    }
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
 								    # 如果 procurement_docpath 是空字符串，直接返回包含空字符串的字典
-.17 修复了提取货物标的Bug，无效投标bug修复

											
										
										
											2024-11-17 17:27:05 +08:00
+								    if not procurement_path:
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								        return DEFAULT_PROCUREMENT_REQS.copy()
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								    try:
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:21:33 +08:00
+								        if procurement_path == invalid_path:
 								            # 读取 PDF 页码数
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:13:49 +08:00
+								            page_count = get_pdf_page_count(procurement_path)
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:21:33 +08:00
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:34:05 +08:00
+								            if page_count > 80:  # 如果页码数大于 580
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:21:33 +08:00
+								                model_type = 0
 								                processed_filepath = ""
 								            else:
 								                model_type = 1
 								                processed_filepath = convert_file_to_markdown(procurement_path)  # 转markdown格式
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:13:49 +08:00
+								        else:
 								            model_type = 1
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:21:33 +08:00
+								            processed_filepath = convert_file_to_markdown(procurement_path)  # 转markdown格式
-.12 资格、符合性提示词优化

											
										
										
											2024-12-12 10:22:22 +08:00
+								        # processed_filepath = pdf2txt(procurement_path)  # 纯文本提取
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								        # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
 								        with concurrent.futures.ThreadPoolExecutor() as executor:
 								            # 提交任务给线程池
-.19 修复豆包模型使用的bug

											
										
										
											2024-12-19 14:13:49 +08:00
+								            future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type)
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
+								            time.sleep(0.5)  # 保持原有的延时
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
+								            future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath,model_type)
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								            # 获取并行任务的结果
 								            technical_requirements = future_technical.result()
 								            business_requirements = future_business.result()
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
+								        # 构建最终的采购需求字典
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								        procurement_reqs = {
-.21 解决了复杂、多层的采购需求bug

											
										
										
											2024-11-21 16:22:22 +08:00
+								            "采购需求": technical_requirements.get("采购需求", {})
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								        }
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
+								        procurement_reqs.update(business_requirements)
 								        # 如果需要确保所有默认键存在，可以取消下面的注释
 								        # for key, default_value in DEFAULT_PROCUREMENT_REQS.items():
 								        #     procurement_reqs.setdefault(key, default_value)
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								        return procurement_reqs
-.23测试分段

											
										
										
											2024-09-23 15:49:30 +08:00
-.17 小解析货物标

											
										
										
											2024-10-17 15:33:58 +08:00
+								    except Exception as e:
 								        print(f"Error in fetch_procurement_reqs: {e}")
 								        # 在出错时返回默认的包含空字符串的字典
 								        return DEFAULT_PROCUREMENT_REQS.copy()
-.13

											
										
										
											2024-09-13 15:03:55 +08:00
-.23 修改商务要求bug

											
										
										
											2024-11-23 15:38:52 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								# TODO:技术要求可以在技术参数之后执行，把完整的技术参数输入，问大模型，除了上述内容还有哪些，这样的话把技术标和其他的区分开。
-.28 技术参数提取

											
										
										
											2024-11-28 17:27:44 +08:00
-.13

											
										
										
											2024-09-13 15:03:55 +08:00
+								if __name__ == "__main__":
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    start_time = time.time()
-.13

											
										
										
											2024-09-13 15:03:55 +08:00
+								    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
-.15货物标完整版

											
										
										
											2024-10-15 20:57:58 +08:00
+								    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_procurement.pdf"
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 17:32:39 +08:00
+								    procurement_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf"
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 17:32:39 +08:00
+								    invalid_path = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile.pdf"
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    res = fetch_procurement_reqs(procurement_path, invalid_path)
-.23测试分段

											
										
										
											2024-09-23 15:49:30 +08:00
+								    print(json.dumps(res, ensure_ascii=False, indent=4))
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    end_time = time.time()
 								    print("耗时:" + str(end_time - start_time))