import json
import re
from typing import List, Optional

from PyPDF2 import PdfReader

from flask_app.main.json_utils import clean_json_string, combine_json_results
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.通义千问long import qianwen_long, upload_file
from flask_app.货物标.货物标截取pdf import extract_common_header, clean_page_content

# Heading that opens the procurement-requirements section.
_BEGIN_PATTERN = re.compile(
    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
    r'^[一二三四五六七八九十百千]+、\s*采购清单',
    re.MULTILINE)

# Next chapter/part heading, which closes the section.
_END_PATTERN = re.compile(
    r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
    re.MULTILINE)

_SERVICE_KEY = "服务要求"
_PUNCTUATION = r"[,。?!、;:,.?!]*"

# "技术" directly in front of "服务要求", optionally separated by punctuation.
_TECH_SERVICE_PATTERN = re.compile(
    r'技术\s*' + _PUNCTUATION + re.escape(_SERVICE_KEY))


def _extract_requirement_section(truncate_file) -> Optional[str]:
    """Return the requirements section of the PDF with all whitespace removed.

    Returns None when no opening heading matches ``_BEGIN_PATTERN``.
    """
    common_header = extract_common_header(truncate_file)
    pdf_document = PdfReader(truncate_file)
    # Every page contributes its cleaned text followed by a newline, so the
    # MULTILINE anchors of the heading patterns see page starts as line starts.
    text = "".join(
        clean_page_content(page.extract_text() or "", common_header) + '\n'
        for page in pdf_document.pages
    )

    start_match = _BEGIN_PATTERN.search(text)
    if not start_match:
        return None

    # The section body starts right after the matched heading text.
    section = text[start_match.end():]
    end_match = _END_PATTERN.search(section)
    if end_match:
        section = section[:end_match.start()]

    # Drop newlines and spaces so keywords broken across lines still match.
    return re.sub(r'\s+', '', section)


def _match_required_keys(relevant_text: str, required_keys) -> List[str]:
    """Return the keys from *required_keys* that occur in *relevant_text*.

    "服务要求" is rejected when the text contains it directly preceded by
    "技术" (optional punctuation in between). Because *relevant_text* has no
    whitespace left, this check covers the whole section at once: a single
    "技术服务要求" anywhere suppresses the key.
    """
    matched_requirements = []
    for req in required_keys:
        if req not in relevant_text:
            continue
        if req == _SERVICE_KEY and _TECH_SERVICE_PATTERN.search(relevant_text):
            continue
        matched_requirements.append(req)
    return matched_requirements


# Use regular expressions to decide whether the source text contains
# business, service, or other requirements.
def find_exists(truncate_file, required_keys):
    """Return the entries of *required_keys* mentioned in the requirements section.

    Args:
        truncate_file: Path of the PDF to scan; passed to
            ``extract_common_header`` and ``PdfReader``.
        required_keys: Iterable of requirement names to look for.

    Returns:
        A list of the keys that were found, in the order of *required_keys*;
        an empty list when the section heading cannot be located.
    """
    relevant_text = _extract_requirement_section(truncate_file)
    if relevant_text is None:
        return []
    return _match_required_keys(relevant_text, required_keys)


def generate_queries(truncate_file, required_keys):
    """Build one prompt per requirement key that exists in the document.

    Each prompt asks for the named requirement as JSON and tells the model
    to ignore the other keys that were found.

    Args:
        truncate_file: Path of the PDF handed to ``find_exists``.
        required_keys: Candidate requirement names.

    Returns:
        A list of prompt strings, one per key returned by ``find_exists``.
    """
    key_list = find_exists(truncate_file, required_keys)
    user_query_template = "这是一份货物标中采购要求部分的内容,请告诉我\"{}\"是什么,请以json格式返回结果,外层键名是\"{}\",内层键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答具体设备的技术要求。"

    queries = []
    for key in key_list:
        query_base = user_query_template.format(key, key)
        other_keys = [k for k in key_list if k != key]
        if other_keys:
            # Name the sibling requirements so the answer does not include them.
            query_base += "也不需要回答\"{}\"中的内容,".format("\"和\"".join(other_keys))
        query_base += "若相关要求不存在,在键值中填'未知'。"
        queries.append(query_base)
    return queries


def get_business_requirements(truncate_file, file_id):
    """Query the model for business, service, and other requirements.

    Args:
        truncate_file: Path of the PDF used to decide which keys to ask about.
        file_id: Identifier forwarded to ``multi_threading``.

    Returns:
        The merged result of ``combine_json_results`` in which every required
        key is present; keys without an answer map to an empty string.
    """
    required_keys = ["商务要求", "服务要求", "其他要求"]
    queries = generate_queries(truncate_file, required_keys)

    # Nothing to ask when no requirement heading was found in the document.
    # NOTE(review): the final argument 2 presumably selects the model backend
    # - confirm against multi_threading.
    results = multi_threading(queries, "", file_id, 2) if queries else []
    business_requirements = [res for _, res in results] if results else []

    # Combine the answers and fill missing keys with a default value.
    final_res = combine_json_results(business_requirements)
    for key in required_keys:
        final_res.setdefault(key, "")
    return final_res


if __name__ == "__main__":
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    file_id = upload_file(truncate_file)
    res = get_business_requirements(truncate_file, file_id)
    print(json.dumps(res, ensure_ascii=False, indent=4))