zbparse/flask_app/货物标/商务服务其他要求提取.py

106 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- encoding:utf-8 -*-
import json
import re
from PyPDF2 import PdfReader
from flask_app.general.json_utils import combine_json_results
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
# Regex-based check for which requirement categories (technical, business,
# service, other) actually appear in the source document.
def _select_present_requirements(relevant_text, required_keys):
    """Return the keys from *required_keys* that occur literally in *relevant_text*.

    *relevant_text* is expected to have had all whitespace removed already.
    "服务要求" gets special treatment: it is skipped when the text contains
    "技术" immediately followed (optionally through punctuation) by "服务要求",
    i.e. when service requirements only show up merged into a combined
    technical/service heading.
    """
    punctuation = r"[,。?!、;:,.?!]*"
    service_key = "服务要求"
    # '技术' directly in front of '服务要求', with only optional punctuation between.
    merged_heading = re.compile(r'技术\s*' + punctuation + re.escape(service_key))

    matched_requirements = []
    for req in required_keys:
        if req not in relevant_text:
            continue
        # The text contains no newlines at this point, so the merged-heading
        # check is deliberately applied to the text as a whole.
        if req == service_key and merged_heading.search(relevant_text):
            continue
        matched_requirements.append(req)
    return matched_requirements


def find_exists(truncate_file, required_keys):
    """Detect which requirement categories are present in a truncated PDF.

    Args:
        truncate_file: Path (or file object) handed to ``PdfReader`` and
            ``extract_common_header``. When falsy, no file is read.
        required_keys: Iterable of category names to look for.

    Returns:
        * The fixed list ["技术要求", "商务要求", "服务要求", "其他要求"] when
          *truncate_file* is falsy (``required_keys`` is ignored in that case).
        * An empty list when no section-start heading is found.
        * Otherwise the members of *required_keys*, in their original order,
          that occur in the located section (see
          ``_select_present_requirements`` for the "服务要求" special case).
    """
    if not truncate_file:
        return ["技术要求","商务要求", "服务要求", "其他要求"]

    common_header = extract_common_header(truncate_file)
    pdf_document = PdfReader(truncate_file)

    # Section start: a "第X章/部分" heading mentioning 服务/项目/商务/技术 ... 要求,
    # a "第X章/部分" heading containing 采购 or 技术标准, or a numbered
    # "X、采购清单" heading.
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
    # Section end: the next "第X章/部分" heading followed by CJK characters.
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)

    # Build the full text in one pass; every cleaned page is newline-terminated.
    text = "".join(
        clean_page_content(page.extract_text() or "", common_header) + '\n'
        for page in pdf_document.pages
    )

    start_match = begin_pattern.search(text)
    if not start_match:
        return []

    # Search for the end heading in the remainder only. The slice (rather than
    # a ``pos`` argument) is intentional: it lets ``^`` anchor at the slice start.
    remainder = text[start_match.end():]
    end_match = end_pattern.search(remainder)
    relevant_text = remainder[:end_match.start()] if end_match else remainder

    # Drop all whitespace (newlines, spaces) so headings broken across lines
    # or padded with spaces still match as plain substrings.
    relevant_text = re.sub(r'\s+', '', relevant_text)

    return _select_present_requirements(relevant_text, required_keys)
def generate_queries(truncate_file, required_keys):
    """Build one extraction prompt per requirement category found in the file.

    The categories come from ``find_exists(truncate_file, required_keys)``.
    Each prompt asks for one category as JSON and, when other categories were
    also found, tells the model not to answer those.

    Returns:
        A list of prompt strings, one per detected category, in detection order.
    """
    user_query_template = "这是一份货物标中采购要求部分的内容,请告诉我\"{}\"是什么,请以json格式返回结果,外层键名是\"{}\",内层键值对中的键名是原文中的标题或者是你对相关子要求的总结,而键值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答采购清单中具体设备的技术参数要求,仅需从正文部分开始提取,"
    present_keys = find_exists(truncate_file, required_keys)

    def build_prompt(target):
        # Start from the template with the category filled in twice
        # (once as the subject, once as the outer JSON key).
        prompt = user_query_template.format(target, target)
        siblings = [candidate for candidate in present_keys if candidate != target]
        if siblings:
            # Joining with '""' renders every sibling inside its own quotes.
            prompt += "也不需要回答\"{}\"中的内容,".format("\"\"".join(siblings))
        return prompt + "若相关要求不存在,在键值中填'未知'"

    return [build_prompt(target) for target in present_keys]
def get_business_requirements(truncate_file, file_id):
    """Ask the model for technical / service / business / other requirements.

    A single combined prompt is sent through ``multi_threading`` and the
    returned answers are merged with ``combine_json_results``.

    Args:
        truncate_file: Not used by this function's body; kept for interface
            compatibility with callers.
        file_id: Identifier forwarded to ``multi_threading`` as its third
            argument (presumably the uploaded-file id — confirm against
            ``upload_file``).

    Returns:
        Whatever ``combine_json_results`` produces for the collected answers
        (called with an empty list when ``multi_threading`` returns nothing).
    """
    # Everything is asked in one question: slower, but the content is more
    # accurate. (Per-category prompts from generate_queries are the alternative.)
    # The sample output must itself be well-formed JSON, since the model is
    # told to use it as the format reference.
    queries = [
        """
请你根据该货物类招标文件中的采购要求部分内容,请告诉我文档中技术要求、服务要求、商务要求、其他要求分别是什么,请以json格式返回结果,可以用嵌套键值对的形式组织回答,默认情况下外层键名是'技术要求','服务要求','商务要求','其他要求',内层键名是原文中的相应子标题或者是你对相关子要求的总结,而键值需要完全与原文保持一致,不可擅自总结删减。注意你无需回答采购清单中具体设备的技术参数要求,仅需从正文部分开始提取。以下是你需要考虑的特殊情况:如果原文中技术要求与服务要求在一块,那么你应该用外键'技术、服务要求'替换默认外键'技术要求','服务要求',若相关要求不存在,对应的键值设为'未知'。以下为示例输出,仅供格式参考:
{
"技术、服务要求":"相关技术要求以及服务要求",
"商务要求":"相关商务要求",
"其他要求":"其他要求内容"
}
"""
    ]
    results = multi_threading(queries, "", file_id, 2)
    # Each result is a pair; only its second element (the answer) is kept.
    business_requirements = [res for _, res in results] if results else []
    final_res = combine_json_results(business_requirements)
    return final_res
if __name__ == "__main__":
    # Manual smoke run: upload a local sample document, extract its
    # requirements, and pretty-print the result without escaping non-ASCII text.
    sample_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx"
    uploaded_id = upload_file(sample_path)
    extracted = get_business_requirements(sample_path, uploaded_id)
    print(json.dumps(extracted, ensure_ascii=False, indent=4))