# zbparse/flask_app/general/doubao.py

import os

import PyPDF2
import requests
from flask_app.general.file2markdown import convert_pdf_to_markdown

from flask_app.general.clean_pdf import extract_common_header, clean_page_content


def pdf2txt(file_path):
    """Extract the text of a PDF and save it as ``extract.txt`` next to the PDF.

    The text of every page is passed through ``clean_page_content`` together
    with the value returned by ``extract_common_header(file_path)``, and the
    cleaned pages are concatenated in page order. Pages for which no text is
    extracted are reported on stdout and skipped.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The path of ``extract.txt`` inside the PDF's directory. The path is
        returned even if writing the file failed; in that case the ``IOError``
        is only printed.
    """
    header = extract_common_header(file_path)
    cleaned_pages = []
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Page numbers are 1-based so they can be printed as-is.
        for page_no, pdf_page in enumerate(pdf_reader.pages, start=1):
            raw_text = pdf_page.extract_text()
            if not raw_text:
                print(f"Page {page_no} is empty or text could not be extracted.")
                continue
            cleaned_pages.append(clean_page_content(raw_text, header))
        combined = "".join(cleaned_pages)
        # The output file lives in the same directory as the input PDF.
        pdf_dir = os.path.dirname(os.path.abspath(file_path))
        target = os.path.join(pdf_dir, 'extract.txt')
        try:
            with open(target, 'w', encoding='utf-8') as sink:
                sink.write(combined)
            print(f"提取内容已保存到: {target}")
        except IOError as e:
            # Best effort: a failed write is reported but does not abort.
            print(f"写入文件时发生错误: {e}")
    return target

def read_txt_to_string(file_path):
    """Read a UTF-8 text file and return its entire content as one string.

    The file is read in a single call, so the original layout (line breaks,
    indentation) is kept as-is.

    Args:
        file_path (str): Path of the text file.

    Returns:
        str: The file content. On failure no exception is raised; instead a
        Chinese error-message string is returned (one message for a missing
        file, another that embeds the exception text for any other error).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        return "错误：文件未找到。"
    except Exception as err:
        return f"错误：读取文件时发生错误。详细信息：{err}"
def doubao_model(full_user_query, timeout=(10, 300)):
    """Send a single-turn prompt to the Doubao chat-completions endpoint.

    Args:
        full_user_query: Prompt text, sent as the only ``user`` message.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.post``. Without a timeout a stalled connection would
            block the caller forever; pass a larger value for very long
            generations.

    Returns:
        The ``content`` of the first choice's message when the endpoint
        answers with HTTP 200, otherwise ``None`` (the status code and the
        response body are printed).

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    print("call doubao...")
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    # NOTE(review): this credential is committed to source control. It is kept
    # only as a fallback so existing deployments keep working; prefer setting
    # DOUBAO_API_KEY in the environment and rotating/removing the literal.
    api_key = os.environ.get("DOUBAO_API_KEY") or "ad0c363f-1f23-4b13-aba3-698a4f8c3eb8"
    # ep-20241115114052-2clpd: Doubao Pro 32k model endpoint
    # ep-20241117112931-4zb6n: 128k endpoint
    model_name = "ep-20241115114052-2clpd"

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_key,
    }
    data = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": full_user_query,
            }
        ],
        "temperature": 0.2,
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code != 200:
        print(f"请求失败，状态码：{response.status_code}，错误信息：{response.text}")
        return None

    return response.json()["choices"][0]["message"]["content"]


def generate_full_user_query(file_path, prompt_template):
    """Build the complete user query by inserting a file's text into a template.

    Args:
        file_path (str): Path of the text file whose content is inserted. It is
            loaded with ``read_txt_to_string``.
        prompt_template (str): Template containing a ``{full_text}``
            placeholder. It is rendered with ``str.format``, so any literal
            braces in the template must be doubled (``{{`` / ``}}``).

    Returns:
        str: The template with ``{full_text}`` replaced by the file content.
    """
    document_text = read_txt_to_string(file_path)
    return prompt_template.format(full_text=document_text)

#7.文件内容为markdown格式， 表格特殊情况处理：对于表格数据，可能存在原始pdf转换markdown时跨页导致同一个货物名称（或系统名称）分隔在上下两个单元格内，你需要通过上下文语义判断是否合并之后才是完整且正确的货物名称（或系统名称）。
if __name__ == "__main__":
    # Sample inputs for manual runs; they are only referenced by the
    # commented-out extraction pipeline further down.
    txt_path = r"output.txt"
    pdf_path_1 = "D:/bid_generator/task_folder/9a447eb0-24b8-4f51-8164-d91a62edea25/tmp/bid_format.pdf"
    pdf_path_2 = r"C:\Users\Administrator\Desktop\货物标\output1\竞争性谈判文件_procurement.pdf"
    # Prompt asking the model to extract procurement requirements as nested
    # JSON. Literal braces are doubled because the template is meant to be
    # rendered with str.format via generate_full_user_query.
    prompt_template = '''
        任务：解析采购文件，提取采购需求，并以JSON格式返回。

        要求与指南：
        1. 精准定位：运用文档理解能力，找到文件中的采购需求部分。
        2. 系统归属：若货物明确属于某个系统，则将其作为该系统的二级键。
        3. 非清单形式处理：若未出现采购清单，则从表格或文字中摘取系统和货物信息。
        4. 软件需求：对于软件应用需求，列出系统模块构成，并作为系统键值的一部分。
        5. 系统功能：若文中提及系统功能，则在系统值中添加'系统功能'二级键，不展开具体内容。
        6. 完整性：确保不遗漏系统内的货物，也不添加未提及的内容。

        输出格式：
        1.JSON格式，最外层键名为'采购需求'。
        2.嵌套键名为系统或货物名称，与原文保持一致。
        3.键值应为空对象（{{}}），仅返回名称。
        4.不包含'说明'、'规格'、'技术参数'等列内容。
        5.层次关系用嵌套键值对表示。
        6.最后一级键内值留空或填'未知'（如数量较多或未知内容）。

        特殊情况处理：
            同一层级下同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增。

        示例输出结构：
        {{
            "采购需求": {{
                "交换机-1": {{}},
                "交换机-2": {{}},
                "门禁管理系统": {{
                    // 可包含其他货物或模块
                }},
                "交通监控视频子系统": {{
                    "系统功能": {{}},
                    "高清视频抓拍像机": {{}},
                    "补光灯": {{}}
                }},
                "LED全彩显示屏": {{}}
                // 其他系统和货物
            }}
        }}

        文件内容（已包含）：{full_text}

        注意事项：
        1.严格按照上述要求执行，确保输出准确性和规范性。
        2.如有任何疑问或不确定内容，请保留原文描述，必要时使用'未知'标注。
    '''
    # Disabled full pipeline: convert the PDF, then fill the template.
    # processed_filepath = convert_pdf_to_markdown(pdf_path_2)  # convert to Markdown
    # processed_filepath = pdf2txt(pdf_path_2)                   # plain-text extraction
    # user_query = generate_full_user_query(processed_filepath, prompt_template)

    # Smoke test: send a trivial question and print the model's answer.
    user_query = "一年有多少天？"
    print(doubao_model(user_query))
    # print("--------------------")
    # print(user_query)