125 lines
5.4 KiB
Python
125 lines
5.4 KiB
Python
|
import PyPDF2
|
|||
|
import requests
|
|||
|
|
|||
|
|
|||
|
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
|||
|
|
|||
|
|
|||
|
def extract_text_by_page(file_path):
    """Return the cleaned text of all pages of a PDF as one string.

    The value returned by ``extract_common_header(file_path)`` is passed,
    together with each page's raw text, to ``clean_page_content``. Pages for
    which PyPDF2 yields no text are reported on stdout and contribute
    nothing to the result.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned page texts concatenated in page order, with no
        separator inserted between pages.
    """
    shared_header = extract_common_header(file_path)
    cleaned_pages = []

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for index, pdf_page in enumerate(pdf_reader.pages):
            raw_text = pdf_page.extract_text()
            if not raw_text:
                # The message uses a 1-based page number for readability.
                print(f"Page {index + 1} is empty or text could not be extracted.")
                continue
            cleaned_pages.append(clean_page_content(raw_text, shared_header))

    return "".join(cleaned_pages)
|
|||
|
|
|||
|
def extract_text_json_by_page(file_path):
    """Return the PDF's cleaned text as a JSON object keyed by page number.

    ``db_call_with_title`` calls this name, but the module neither defined
    nor imported it, so the call failed with ``NameError``. The shape of
    the result follows the prompt sent to the model, which describes the
    input as JSON whose keys are page numbers and whose values are the text
    of that page.

    NOTE(review): page numbers are 1-based here and pages without
    extractable text are omitted -- confirm this matches the intended
    numbering of the bid document.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A JSON string mapping page number to cleaned page text.
    """
    import json

    common_header = extract_common_header(file_path)
    pages = {}
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if text:
                pages[page_number] = clean_page_content(text, common_header)
            else:
                print(f"Page {page_number} is empty or text could not be extracted.")
    # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
    return json.dumps(pages, ensure_ascii=False)


def db_call_with_title(file_path):
    """Ask the chat model for the page ranges of a bid PDF's business section.

    Builds a prompt that embeds the page-keyed text of the PDF, posts it to
    the Ark chat-completions endpoint and prints either the model's reply
    or the failing status code and response text.

    Args:
        file_path: Path of the bid-format PDF to analyse.

    Returns:
        The reply text of the first choice when the HTTP status is 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed, including when it exceeds the timeout.
    """
    import os

    # Request parameters.
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    # NOTE(review): a credential is committed in source. ARK_API_KEY takes
    # precedence when set; the literal fallback only keeps existing setups
    # working and should be rotated and removed.
    api_key = os.environ.get("ARK_API_KEY", "ad0c363f-1f23-4b13-aba3-698a4f8c3eb8")
    model_name = "ep-20241115114052-2clpd"  # Doubao Pro 32k model

    # Use the PDF content for the question. (An earlier, commented-out
    # variant read the text from a .txt file instead.)
    full_text_json = extract_text_json_by_page(file_path)

    # Prompt that extracts page numbers. (An earlier, commented-out variant
    # of the prompt extracted the table of contents instead.)
    query = f"""
任务:根据所提供的投标文件json格式文件,将其中商务部分的页码准确提出。
要求:1.所提供的json文件键为页码,值为当前页码的文本,充分理解后提取商务部分的内容页码。
2.如果文件中包含目录,其可能是虚假的目录,一切以正文内容为准。
3.所提供文件可能包含技术标或其他内容,要求仅提取商务标相关的页码。并且在最后将子目录的页码范围整合到"页码范围"中。
4.按所给出的格式输出,不要输出任何其他内容,目录保留正确的层级关系。
5.如果商务部分穿插如技术标的相关内容,在"商务部分"中的page_range中需要将这段跳过。
举例:假如12到15页为技术标部分,其他都是商务标时,按列表包含二元组的样式展示出来,如[(1,11),(16,33)]。
如果这种情况没有发生,则输出完整的页码范围,如[(4,20)]
输出格式:{{
"商务部分": {{
"一级目录": {{
start_page: 开始页码
end_page: 结束页码
children: {{
"二级目录": {{
start_page: 开始页码
end_page: 结束页码
children: {{
"最内层目录": {{
start_page: 开始页码
end_page: 结束页码
}}
}}
}}
}}
}}
}}
"页码范围": [(4,33)]
}}
文件内容:{full_text_json}
"""

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + api_key
    }
    data = {
        "model": model_name,
        "messages": [
            {
                "role": "system",
                "content": "你是一个专业的标书制作人,现在需要你对传入的数据进行充分理解后,回答我的问题。"
            },
            {
                "role": "user",
                "content": query
            }
        ]
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    # (connect, read) in seconds; the read limit is generous because the
    # model has to process the whole document.
    response = requests.post(url, headers=headers, json=data, timeout=(10, 600))

    if response.status_code != 200:
        print(f"请求失败,状态码:{response.status_code},错误信息:{response.text}")
        return None

    content = response.json()["choices"][0]["message"]["content"]
    print(content)
    return content
|
|||
|
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Sample inputs for manual runs; only pdf_path_2 is passed to the call
    # below, the other two paths are kept as alternatives.
    pdf_path_2 = "D:/files/page_test/技术标穿插测试文件.pdf"
    pdf_path_1 = "D:/bid_generator/task_folder/9a447eb0-24b8-4f51-8164-d91a62edea25/tmp/bid_format.pdf"
    txt_path = "D:/files/bid1/bid_format.txt"

    db_call_with_title(pdf_path_2)
|