zbparse/flask_app/货物标/商务服务其他要求提取.py

93 lines
4.2 KiB
Python
Raw Normal View History

2024-09-18 11:57:17 +08:00
import json
import re
from PyPDF2 import PdfReader
from flask_app.main.json_utils import clean_json_string, combine_json_results
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.通义千问long import qianwen_long, upload_file
from flask_app.货物标.货物标截取pdf import extract_common_header, clean_page_content
# Use regular expressions to check whether the source text contains business, service, or other requirements.
2024-09-18 11:57:17 +08:00
def find_exists(truncate_file, required_keys):
    """Return the entries of ``required_keys`` that occur in the requirements section.

    The text of every page of ``truncate_file`` is extracted, cleaned with
    ``clean_page_content`` and concatenated. The section of interest begins
    right after the first heading matched by the begin pattern and runs up to
    the next chapter/part heading, or to the end of the text when there is
    none. All whitespace is removed from that section before the keys are
    searched for as literal substrings.

    Args:
        truncate_file: The PDF handed to ``extract_common_header`` and
            ``PdfReader`` (the script entry point of this file passes a path).
        required_keys: Iterable of requirement names to look for.

    Returns:
        A list of the keys found in the section, in the order they were
        given. "服务要求" is omitted when the section contains it directly
        preceded by "技术" (optionally separated by punctuation). An empty
        list is returned when no start heading is found.
    """
    header = extract_common_header(truncate_file)
    reader = PdfReader(truncate_file)

    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)

    # One cleaned chunk per page, each terminated by a newline so the
    # line-anchored heading patterns can match at page boundaries.
    text = "".join(
        clean_page_content(page.extract_text() or "", header) + '\n'
        for page in reader.pages
    )

    heading = begin_pattern.search(text)
    if heading is None:
        return []

    # Everything after the start heading, cut at the next chapter/part heading.
    tail = text[heading.end():]
    next_heading = end_pattern.search(tail)
    section = tail[:next_heading.start()] if next_heading else tail
    # Drop all whitespace (newlines and spaces) so keys split across lines still match.
    section = re.sub(r'\s+', '', section)

    separators = r"[,。?!、;:,.?!]*"
    found = []
    for req in required_keys:
        if not re.search(re.escape(req), section):
            continue
        if req == "服务要求":
            # Skip "服务要求" when it appears as part of "技术(punctuation)服务要求".
            # The section contains no newlines at this point, so searching the
            # whole string is the same as checking it line by line.
            technical_prefixed = r'技术\s*' + separators + re.escape(req)
            if re.search(technical_prefixed, section):
                continue
        found.append(req)
    return found
def generate_queries(truncate_file, required_keys):
    """Build one prompt string per requirement key found in ``truncate_file``.

    ``find_exists`` decides which of ``required_keys`` are present. For each
    present key a prompt is assembled from the base template; when other keys
    are present too, a clause naming them is appended, followed by the
    instruction for the missing-requirement case.

    Args:
        truncate_file: Forwarded unchanged to ``find_exists``.
        required_keys: Candidate requirement names.

    Returns:
        A list of prompt strings, one per present key, in the order returned
        by ``find_exists``.
    """
    present_keys = find_exists(truncate_file, required_keys)
    user_query_template = "这是一份货物标中采购要求部分的内容,请告诉我\"{}\"是什么请以json格式返回结果外层键名是\"{}\",内层键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答具体设备的技术要求。"

    queries = []
    for key in present_keys:
        parts = [user_query_template.format(key, key)]
        siblings = [k for k in present_keys if k != key]
        if siblings:
            # Name the other present keys so the answer leaves their content out.
            parts.append("也不需要回答\"{}\"中的内容,".format("\"\"".join(siblings)))
        parts.append("若相关要求不存在,在键值中填'未知'")
        queries.append("".join(parts))
    return queries
2024-09-23 15:49:30 +08:00
def get_business_requirements(truncate_file,file_id):
    """Collect the business, service and other requirements for a document.

    Prompts are generated for whichever of the three fixed keys are found in
    ``truncate_file``, dispatched through ``multi_threading``, and the
    answers are merged with ``combine_json_results``. Any of the three keys
    missing from the merged result is added with an empty string.

    Args:
        truncate_file: Forwarded to ``generate_queries``.
        file_id: Forwarded to ``multi_threading`` (the script entry point of
            this file obtains it from ``upload_file``).

    Returns:
        The merged result, containing at least the keys "商务要求",
        "服务要求" and "其他要求".
    """
    required_keys = ["商务要求", "服务要求", "其他要求"]
    queries = generate_queries(truncate_file, required_keys)
    # NOTE(review): the trailing 2 presumably selects the model/backend inside
    # multi_threading — confirm against that helper.
    results = multi_threading(queries, "", file_id, 2)

    # Each result is a pair; only its second element (the answer) is kept.
    if results:
        answers = [answer for _, answer in results]
    else:
        answers = []

    final_res = combine_json_results(answers)
    # Fill in any required key that is absent from the merged result.
    for key in required_keys:
        final_res.setdefault(key, "")
    return final_res
2024-09-18 11:57:17 +08:00
if __name__ == "__main__":
    # Manual smoke run: upload a local sample PDF, extract its requirements,
    # and pretty-print the result without escaping non-ASCII characters.
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    file_id = upload_file(truncate_file)
    res = get_business_requirements(truncate_file, file_id)
    formatted = json.dumps(res, ensure_ascii=False, indent=4)
    print(formatted)