zbparse/flask_app/货物标/商务服务其他要求提取.py
2024-10-23 20:33:41 +08:00

93 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
from PyPDF2 import PdfReader
from flask_app.general.json_utils import combine_json_results
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
def find_exists(truncate_file, required_keys):
    """Use regular expressions to report which requirement headings occur in the PDF.

    Every page's text is extracted, passed through ``clean_page_content``
    together with the value returned by ``extract_common_header``, and
    concatenated with a newline after each page. The section that is searched
    starts right after the first start-heading match and stops just before
    the next "第…章/部分" heading, or at the end of the text when no such
    heading follows. All whitespace is removed from that section before the
    keys are looked up.

    Args:
        truncate_file: The PDF to scan; handed to both ``PdfReader`` and
            ``extract_common_header`` (a file path in this module's own usage).
        required_keys: Iterable of heading strings to look for,
            e.g. "商务要求", "服务要求", "其他要求".

    Returns:
        The keys from ``required_keys`` that appear in the section, in input
        order. "服务要求" is reported only when the section has no occurrence
        of "技术" followed by optional punctuation and then "服务要求".
        An empty list is returned when no start heading is found.
    """
    # Start headings: a chapter/part title mentioning 服务/项目/商务 ... 要求,
    # a chapter/part title mentioning 采购 or 技术标准, or "N、采购清单".
    section_start = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
    # End heading: any following chapter/part title.
    section_end = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)

    header = extract_common_header(truncate_file)
    reader = PdfReader(truncate_file)
    full_text = "".join(
        clean_page_content(page.extract_text() or "", header) + '\n'
        for page in reader.pages
    )

    opening = section_start.search(full_text)
    if opening is None:
        return []

    # Keep only what lies between the start heading and the next chapter heading.
    section = full_text[opening.end():]
    closing = section_end.search(section)
    if closing is not None:
        section = section[:closing.start()]

    # Drop newlines and spaces so headings split across lines still match.
    section = re.sub(r'\s+', '', section)

    punctuation = r"[,。?!、;:,.?!]*"
    found = []
    for key in required_keys:
        if key not in section:
            continue
        if key == "服务要求":
            # Skip the key when "技术" sits directly before it (optionally
            # separated by punctuation), i.e. it is part of a technical heading.
            # The section has no line breaks left at this point, so a single
            # search over the whole string covers every occurrence.
            tech_prefixed = r'技术\s*' + punctuation + re.escape(key)
            if re.search(tech_prefixed, section):
                continue
        found.append(key)
    return found
def generate_queries(truncate_file, required_keys):
    """Build one prompt string per requirement heading found in the document.

    ``find_exists`` decides which of ``required_keys`` are present. For each
    present key a prompt is assembled from a fixed template; when other keys
    are present too, a clause listing them (each wrapped in double quotes) is
    added to tell the model not to answer those, and every prompt ends with
    the instruction to fill in '未知' when the requirement does not exist.

    Args:
        truncate_file: The PDF forwarded unchanged to ``find_exists``.
        required_keys: Candidate heading strings forwarded to ``find_exists``.

    Returns:
        A list of prompt strings, one per key returned by ``find_exists`` and
        in the same order; empty when no key was found.
    """
    present_keys = find_exists(truncate_file, required_keys)
    template = "这是一份货物标中采购要求部分的内容,请告诉我\"{}\"是什么请以json格式返回结果外层键名是\"{}\",内层键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答具体设备的技术要求。"

    prompts = []
    for current in present_keys:
        pieces = [template.format(current, current)]
        # Name the sibling requirements so the answer stays limited to this key.
        siblings = [candidate for candidate in present_keys if candidate != current]
        if siblings:
            pieces.append("也不需要回答\"{}\"中的内容,".format("\"\"".join(siblings)))
        pieces.append("若相关要求不存在,在键值中填'未知'")
        prompts.append("".join(pieces))
    return prompts
def get_business_requirements(truncate_file, file_id):
    """Collect the business, service and other requirements of a tender PDF.

    Prompts are produced by ``generate_queries`` for the three fixed keys,
    sent through ``multi_threading``, and the answers are merged with
    ``combine_json_results``. Any of the three keys missing from the merged
    result is added with an empty string as its value.

    Args:
        truncate_file: The PDF forwarded to ``generate_queries``.
        file_id: Identifier forwarded to ``multi_threading`` (in this module it
            is the value returned by ``upload_file``).

    Returns:
        The object returned by ``combine_json_results``, updated in place so
        that "商务要求", "服务要求" and "其他要求" are all present.
    """
    required_keys = ["商务要求", "服务要求", "其他要求"]
    prompts = generate_queries(truncate_file, required_keys)

    # NOTE(review): the meaning of the "" and 2 arguments is defined by
    # multi_threading and is not visible here -- confirm against that helper.
    answers = multi_threading(prompts, "", file_id, 2)

    # Each result is unpacked as a pair; only the second element is kept.
    if answers:
        payloads = [payload for _, payload in answers]
    else:
        payloads = []

    merged = combine_json_results(payloads)

    # Fill in the keys that received no answer with an empty default.
    defaults = {}
    for key in required_keys:
        defaults[key] = merged.get(key, "")
    merged.update(defaults)
    return merged
if __name__ == "__main__":
    # Manual run against a local sample file: upload the PDF, extract the
    # requirements, and pretty-print them without escaping non-ASCII text.
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    file_id = upload_file(truncate_file)
    requirements = get_business_requirements(truncate_file, file_id)
    print(json.dumps(requirements, ensure_ascii=False, indent=4))