95 lines
4.3 KiB
Python
95 lines
4.3 KiB
Python
import json
|
||
import re
|
||
from PyPDF2 import PdfReader
|
||
from flask_app.general.json_utils import combine_json_results
|
||
from flask_app.general.多线程提问 import multi_threading
|
||
from flask_app.general.通义千问long import upload_file
|
||
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
|
||
|
||
|
||
# Use regular expressions to detect which of the business / service / other requirement headings appear in the source text.
|
||
def _extract_requirement_section(text):
    """Cut the procurement-requirement section out of ``text``.

    The section begins right after the first heading matched by the start
    pattern and ends just before the next "第X章/部分 ..." heading, or at the
    end of ``text`` when no such heading follows.

    Args:
        text: Concatenated page text, one page per line group.

    Returns:
        The section with all whitespace (including line breaks) removed, or
        ``None`` when no start heading is found.
    """
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)

    start_match = begin_pattern.search(text)
    if not start_match:
        return None

    # Search a slice instead of passing ``pos``: with a slice, ``^`` can also
    # match at the very first character after the start heading.
    remainder = text[start_match.end():]
    end_match = end_pattern.search(remainder)
    if end_match:
        remainder = remainder[:end_match.start()]

    # Drop spaces and line breaks so headings split across lines still match.
    return re.sub(r'\s+', '', remainder)


def find_exists(truncate_file, required_keys):
    """Report which of ``required_keys`` literally occur in the requirement section.

    Args:
        truncate_file: PDF path handed to ``extract_common_header`` and
            ``PdfReader``. A falsy value skips all parsing.
        required_keys: Iterable of requirement names to look for.

    Returns:
        A list of the keys found, in the order given. When ``truncate_file``
        is falsy the fixed list ``["商务要求", "服务要求", "其他要求"]`` is
        returned regardless of ``required_keys``. When no section start
        heading is found the result is ``[]``.
    """
    if not truncate_file:
        return ["商务要求", "服务要求", "其他要求"]

    common_header = extract_common_header(truncate_file)
    pdf_document = PdfReader(truncate_file)

    # ``extract_text()`` may return None for a page; treat that as empty text.
    text = "".join(
        clean_page_content(page.extract_text() or "", common_header) + '\n'
        for page in pdf_document.pages
    )

    relevant_text = _extract_requirement_section(text)
    if relevant_text is None:
        return []

    punctuation = r"[,。?!、;:,.?!]*"
    matched_requirements = []
    for req in required_keys:
        if req not in relevant_text:
            continue
        if req == "服务要求":
            # Skip "服务要求" when "技术" sits directly in front of it, with at
            # most punctuation in between (e.g. "技术、服务要求").
            # NOTE(review): presumably such headings are handled by the
            # technical-requirement extraction elsewhere -- confirm.
            # The section contains no line breaks at this point, so a single
            # search over the whole section is sufficient.
            pattern = r'技术\s*' + punctuation + re.escape(req)
            if re.search(pattern, relevant_text):
                continue
        matched_requirements.append(req)

    return matched_requirements
|
||
|
||
|
||
def generate_queries(truncate_file, required_keys):
    """Build one LLM prompt per requirement key reported by ``find_exists``.

    Args:
        truncate_file: Forwarded unchanged to ``find_exists``.
        required_keys: Forwarded unchanged to ``find_exists``.

    Returns:
        A list of prompt strings, one for each key ``find_exists`` returned,
        in the same order. Each prompt asks for the key's content as JSON,
        tells the model to leave out the other returned keys (when there are
        any), and ends with the instruction to answer '未知' when the
        requirement does not exist.
    """
    present_keys = find_exists(truncate_file, required_keys)
    base_prompt = "这是一份货物标中采购要求部分的内容,请告诉我\"{}\"是什么,请以json格式返回结果,外层键名是\"{}\",内层键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答具体设备的技术要求。"
    closing_hint = "若相关要求不存在,在键值中填'未知'。"

    def build_query(key):
        # Assemble the prompt from its pieces: base question, optional
        # exclusion of the sibling keys, then the fallback instruction.
        pieces = [base_prompt.format(key, key)]
        siblings = [other for other in present_keys if other != key]
        if siblings:
            pieces.append("也不需要回答\"{}\"中的内容,".format("\"和\"".join(siblings)))
        pieces.append(closing_hint)
        return "".join(pieces)

    return [build_query(key) for key in present_keys]
|
||
|
||
|
||
def get_business_requirements(truncate_file, file_id):
    """Query the model for business / service / other requirements and merge the answers.

    Args:
        truncate_file: Forwarded to ``generate_queries``.
        file_id: Forwarded to ``multi_threading`` together with the queries.

    Returns:
        The object produced by ``combine_json_results``, with every key of
        ``["商务要求", "服务要求", "其他要求"]`` guaranteed to be present;
        keys that were missing are filled with an empty string.
    """
    required_keys = ["商务要求", "服务要求", "其他要求"]

    answers = multi_threading(
        generate_queries(truncate_file, required_keys), "", file_id, 2
    )

    # Each result item is unpacked as a pair and only its second element is
    # kept; a falsy result (None / empty) yields no payloads at all.
    if answers:
        payloads = [payload for _, payload in answers]
    else:
        payloads = []

    final_res = combine_json_results(payloads)

    # Fill in any required key the combined result lacks, leaving existing
    # values untouched.
    for key in required_keys:
        final_res.setdefault(key, "")

    return final_res
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual run: upload a local sample PDF, extract its requirements and
    # pretty-print the result as JSON (non-ASCII characters kept readable).
    sample_pdf = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    uploaded_id = upload_file(sample_pdf)
    requirements = get_business_requirements(sample_pdf, uploaded_id)
    print(json.dumps(requirements, ensure_ascii=False, indent=4))
|