This commit is contained in:
zy123 2024-12-04 15:31:08 +08:00
parent d8f7718511
commit d15c094578
6 changed files with 93 additions and 68 deletions

View File

@ -2,6 +2,8 @@ import os
import PyPDF2 import PyPDF2
import requests import requests
from ratelimit import sleep_and_retry, limits
from flask_app.general.file2markdown import convert_pdf_to_markdown from flask_app.general.file2markdown import convert_pdf_to_markdown
from flask_app.general.clean_pdf import extract_common_header, clean_page_content from flask_app.general.clean_pdf import extract_common_header, clean_page_content
@ -58,7 +60,8 @@ def read_txt_to_string(file_path):
except Exception as e: except Exception as e:
return f"错误:读取文件时发生错误。详细信息:{e}" return f"错误:读取文件时发生错误。详细信息:{e}"
@sleep_and_retry
@limits(calls=10, period=1) # 每秒最多调用10次
def doubao_model(full_user_query): def doubao_model(full_user_query):
print("call doubao...") print("call doubao...")
# 相关参数 # 相关参数

View File

@ -48,12 +48,12 @@ def convert_pdf_to_markdown(file_path):
resp = textin.recognize_pdf2md(image, { resp = textin.recognize_pdf2md(image, {
'page_start': 0, 'page_start': 0,
'page_count': 50, # 设置解析页数为50页 'page_count': 50, # 设置解析页数为50页
'table_flavor': 'md', # html 按html语法输出表格 'table_flavor': 'html', # html 按html语法输出表格
'parse_mode': 'scan', # 设置解析模式为scan模式 'parse_mode': 'auto', # 设置解析模式为scan模式
'page_details': 0, # 不包含页面细节 'page_details': 0, # 不包含页面细节
'markdown_details': 1, 'markdown_details': 1,
'apply_document_tree': 1, 'apply_document_tree': 1,
'dpi': 144 # 分辨率设置为144 dpi 'dpi': 216 # 分辨率设置默认为144 dpi
}) })
print("request time: ", resp.elapsed.total_seconds()) print("request time: ", resp.elapsed.total_seconds())
data = json.loads(resp.text) data = json.loads(resp.text)

View File

@ -131,11 +131,11 @@ def generate_queries(truncate_file, required_keys):
query_base += "也不需要回答\"{}\"中的内容,".format("\"\"".join(other_keys)) query_base += "也不需要回答\"{}\"中的内容,".format("\"\"".join(other_keys))
query_base += "若相关要求不存在,在键值中填'未知'" query_base += "若相关要求不存在,在键值中填'未知'"
queries.append(query_base) queries.append(query_base)
# print(query_base)
return queries return queries
def generate_template(required_keys, type=1): def generate_template(required_keys,processed_filepath, type=1):
full_text=read_txt_to_string(processed_filepath)
# 定义每个键对应的示例内容 # 定义每个键对应的示例内容
example_content1 = { example_content1 = {
"技术要求": ["相关技术要求1", "相关技术要求2"], "技术要求": ["相关技术要求1", "相关技术要求2"],
@ -249,27 +249,26 @@ def generate_template(required_keys, type=1):
{tech_json_example1_str} {tech_json_example1_str}
示例 2嵌套键值对形式 示例 2嵌套键值对形式
{tech_json_example2_str} {tech_json_example2_str}
文件内容{full_text}
""" """
return user_query_template return user_query_template
def get_business_requirements(procurement_path,procurement_docpath): def get_business_requirements(procurement_path,processed_filepath):
file_id = upload_file(procurement_docpath)
print(file_id)
required_keys = ["\s*术\s*要\s*求", "\s*务\s*要\s*求", "\s*务\s*要\s*求", "\s*他\s*要\s*求","\s*体\s*要\s*求","\s*设\s*要\s*求","\s*度\s*要\s*求","\s*期\s*要\s*求","\s*保\s*要\s*求","\s*训\s*要\s*求","\s*后\s*要\s*求"] required_keys = ["\s*术\s*要\s*求", "\s*务\s*要\s*求", "\s*务\s*要\s*求", "\s*他\s*要\s*求","\s*体\s*要\s*求","\s*设\s*要\s*求","\s*度\s*要\s*求","\s*期\s*要\s*求","\s*保\s*要\s*求","\s*训\s*要\s*求","\s*后\s*要\s*求"]
contained_keys = find_exists(procurement_path, required_keys) contained_keys = find_exists(procurement_path, required_keys)
print(contained_keys) print(contained_keys)
if not contained_keys: if not contained_keys:
return {} return {}
# queries = generate_queries(truncate_file, contained_keys) busi_user_query = generate_template(contained_keys, processed_filepath,1)
busi_user_query = generate_template(contained_keys, 1) tech_user_query = generate_template(contained_keys, processed_filepath,2)
tech_user_query = generate_template(contained_keys, 2)
final_res={} final_res={}
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
futures = [] futures = []
if busi_user_query: if busi_user_query:
futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1)) futures.append(executor.submit(doubao_model,busi_user_query))
if tech_user_query: if tech_user_query:
futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1)) futures.append(executor.submit(doubao_model,tech_user_query))
# 获取结果 # 获取结果
for future in concurrent.futures.as_completed(futures): for future in concurrent.futures.as_completed(futures):
try: try:

View File

@ -353,29 +353,31 @@ def generate_prompt(judge_res, full_text=None):
base_prompt += "\n注意事项:\n1.严格按照上述要求执行,确保输出准确性和规范性。\n" base_prompt += "\n注意事项:\n1.严格按照上述要求执行,确保输出准确性和规范性。\n"
return base_prompt return base_prompt
#文件内容以markdown格式组织其中表格部分若有以html语法组织 def get_technical_requirements(invalid_path,processed_filepath):
def get_technical_requirements(file_path,invalid_path,processed_filepath): file_id=""
# docx_file_path=pdf2docx(file_path) model_type=1 #默认使用豆包
file_id=upload_file(file_path) #目前传入的为docx文档 # 防止截取失败
first_query_template="该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'',否则,回答''" #防止截取失败 first_query_template="""该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'',否则,回答''
judge_res=qianwen_long(file_id,first_query_template) 文件内容{full_text}
"""
judge_query=generate_full_user_query(processed_filepath,first_query_template)
judge_res=doubao_model(judge_query)
if '' in judge_res: if '' in judge_res:
model_type=0 #使用qianwen-long+invalid_path
print("no!调用invalid_path") print("no!调用invalid_path")
file_id=upload_file(invalid_path) file_id=upload_file(invalid_path)
user_query = generate_prompt(judge_res) user_query = generate_prompt(judge_res)
model_res=qianwen_long(file_id,user_query) model_res=qianwen_long(file_id,user_query)
print(model_res) print(model_res)
else: else:
# processed_filepath = convert_pdf_to_markdown(file_path) # 转markdown格式
# processed_filepath=r"C:\Users\Administrator\Desktop\货物标\extract_files\107国道.txt"
full_text = read_txt_to_string(processed_filepath) full_text = read_txt_to_string(processed_filepath)
user_query=generate_prompt(judge_res,full_text) user_query=generate_prompt(judge_res,full_text)
model_res=doubao_model(user_query) model_res=doubao_model(user_query)
print(model_res) print(model_res)
cleaned_res = clean_json_string(model_res) #转字典 cleaned_res = clean_json_string(model_res) #转字典
processed_data=truncate_system_keys(cleaned_res['采购需求']) processed_data=truncate_system_keys(cleaned_res['采购需求']) #防止嵌套层级过长
key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data) # 提取需要采购的货物清单 key_list交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' 输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'} key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data) # 提取需要采购的货物清单 key_list交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' 输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
modified_data=rename_keys(data_copy) modified_data=rename_keys(data_copy) #
user_query_template = """请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果,键名为\"{}\",键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求的字符串,请按原文内容回答,保留三角▲、五角★和序号,不可擅自增删内容,尤其是不可擅自添加序号。 user_query_template = """请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果,键名为\"{}\",键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求的字符串,请按原文内容回答,保留三角▲、五角★和序号,不可擅自增删内容,尤其是不可擅自添加序号。
要求与指南 要求与指南
@ -401,6 +403,8 @@ def get_technical_requirements(file_path,invalid_path,processed_filepath):
"协议routes 接口开放:具备;▲支持标准 ONVIF 协议与第三方厂家设备进行互联;支持 GB/T28181应提供 SDK" "协议routes 接口开放:具备;▲支持标准 ONVIF 协议与第三方厂家设备进行互联;支持 GB/T28181应提供 SDK"
] ]
}} }}
文件内容:{}
""" """
user_query_template_two="""请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。由于该货物存在 {} 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为\"{}-1\";键值为一个列表,列表中包含若干描述\"{}\"的技术参数(或采购要求)的字符串,请按原文内容回答,保留三角▲、五角★和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。 user_query_template_two="""请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。由于该货物存在 {} 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为\"{}-1\";键值为一个列表,列表中包含若干描述\"{}\"的技术参数(或采购要求)的字符串,请按原文内容回答,保留三角▲、五角★和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。
@ -434,31 +438,35 @@ def get_technical_requirements(file_path,invalid_path,processed_filepath):
"支持夜视", "支持云存储" "支持夜视", "支持云存储"
] ]
}} }}
文件内容{}
""" """
queries = [] queries = []
for key in key_paths: for key in key_paths:
# 将键中的 '.' 替换为 '下的' # 将键中的 '.' 替换为 '下的'
modified_key = key.replace('.', '下的') modified_key = key.replace('.', '下的')
# 使用修改后的键填充第一个占位符,原始键填充第二个占位符 # 使用修改后的键填充第一个占位符,原始键填充第二个占位符
# full_text = read_txt_to_string(processed_filepath) if model_type:
# new_query = user_query_template.format(modified_key, key, modified_key,full_text) #转豆包后取消注释 full_text = read_txt_to_string(processed_filepath)
new_query = user_query_template.format(modified_key, key, modified_key,full_text) #转豆包后取消注释
else:
new_query = user_query_template.format(modified_key, key, modified_key) new_query = user_query_template.format(modified_key, key, modified_key)
queries.append(new_query) queries.append(new_query)
# 处理 grouped_paths 中的项,应用 user_query_template_two # 处理 grouped_paths 中的项,应用 user_query_template_two
for grouped_dict in grouped_paths: for grouped_dict in grouped_paths:
for grouped_key, grouped_key_cnt in grouped_dict.items(): for grouped_key, grouped_key_cnt in grouped_dict.items():
# 将键中的 '.' 替换为 '下的' # 将键中的 '.' 替换为 '下的'
modified_grouped_key = grouped_key.replace('.', '下的') modified_grouped_key = grouped_key.replace('.', '下的')
# 使用修改后的键填充第一个占位符,原始键填充第二个占位符 if model_type:
# 如果需要使用 full_text可以取消注释并提供相应的实现 full_text = read_txt_to_string(processed_filepath)
# full_text = read_txt_to_string(processed_filepath) new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt,grouped_key, modified_grouped_key, full_text)
# new_query = user_query_template_two.format(modified_grouped_key, grouped_key, modified_grouped_key, full_text) else:
# 根据您的需求,生成新的查询字符串 new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key,modified_grouped_key)
new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt,grouped_key, modified_grouped_key)
queries.append(new_query) queries.append(new_query)
results = multi_threading(queries, "", file_id, 2) #通义 if model_type:
# results = multi_threading(queries, "", "", 3) #豆包 results = multi_threading(queries, "", "", 3) #豆包
else:
results = multi_threading(queries, "", file_id, 2) # 豆包
technical_requirements = [] technical_requirements = []
if not results: if not results:
print("errror!未获得大模型的回答!") print("errror!未获得大模型的回答!")
@ -505,15 +513,12 @@ if __name__ == "__main__":
# truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx" # truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx"
truncate_docfile=r"C:\Users\Administrator\Desktop\货物标\output1\6_2定版视频会议磋商文件_procurement.docx" truncate_docfile=r"C:\Users\Administrator\Desktop\货物标\output1\6_2定版视频会议磋商文件_procurement.docx"
truncate_file=r'C:\Users\Administrator\Desktop\货物标\output1\6.2定版视频会议磋商文件_procurement.pdf' truncate_file=r'C:\Users\Administrator\Desktop\货物标\output1\6.2定版视频会议磋商文件_procurement.pdf'
# invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
# truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
# output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
# file_id = upload_file(truncate_file)
invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile.pdf" invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile.pdf"
# file_id=upload_file(truncate_file) # file_id=upload_file(truncate_file)
processed_filepath = pdf2txt(truncate_file) # processed_filepath = pdf2txt(truncate_file)
# processed_filepath=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\金水河沿线排涝泵站提档升级项目.txt" # processed_filepath = convert_pdf_to_markdown(truncate_file)
res=get_technical_requirements(truncate_docfile,invalid_path,processed_filepath) processed_filepath=r"C:\Users\Administrator\Desktop\货物标\extract_files\6.2定版视频会议磋商文件html.txt"
res=get_technical_requirements(invalid_path,processed_filepath)
json_string = json.dumps(res, ensure_ascii=False, indent=4) json_string = json.dumps(res, ensure_ascii=False, indent=4)
print(json_string) print(json_string)
# # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1" # # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"

View File

@ -187,12 +187,25 @@ def restructure_data(data):
def get_prefixes(s): def get_prefixes(s):
prefixes = [] prefixes = []
for i in range(len(s)): for i in range(len(s)):
if s[i] == '': if s[i] in ['', ':']:
prefixes.append(s[:i+1]) prefixes.append(s[:i+1])
return prefixes return prefixes
# 定义删除公共前缀的函数 # 定义删除公共前缀的函数
def remove_common_prefixes(string_list): def remove_common_prefixes(string_list, min_occurrence=3):
"""
删除列表中所有满足出现次数>= min_occurrence 的公共前缀
Args:
string_list (list): 字符串列表
min_occurrence (int): 前缀至少出现的次数
Returns:
list: 删除公共前缀后的字符串列表
"""
if not string_list:
return string_list
# 构建前缀到字符串集合的映射 # 构建前缀到字符串集合的映射
prefix_to_strings = {} prefix_to_strings = {}
for s in string_list: for s in string_list:
@ -202,19 +215,30 @@ def remove_common_prefixes(string_list):
if prefix not in prefix_to_strings: if prefix not in prefix_to_strings:
prefix_to_strings[prefix] = set() prefix_to_strings[prefix] = set()
prefix_to_strings[prefix].add(s) prefix_to_strings[prefix].add(s)
# 找出至少在两个字符串中出现的前缀
prefixes_occuring_in_multiple_strings = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >=2] # 找出所有出现次数 >= min_occurrence 的前缀
# 对每个字符串,找到其匹配的最长前缀并删除 qualifying_prefixes = [prefix for prefix, strings in prefix_to_strings.items() if len(strings) >= min_occurrence]
if not qualifying_prefixes:
# 没有满足条件的公共前缀,返回原列表
return string_list
# 为了确保较长的前缀先被匹配,按长度降序排序
qualifying_prefixes.sort(key=len, reverse=True)
# 对每个字符串,循环删除所有匹配的前缀
new_string_list = [] new_string_list = []
for s in string_list: for s in string_list:
applicable_prefixes = [prefix for prefix in prefixes_occuring_in_multiple_strings if s.startswith(prefix)] original_s = s
if applicable_prefixes: changed = True
# 找到最长的前缀 while changed:
longest_prefix = max(applicable_prefixes, key=len) changed = False
# 删除前缀 for prefix in qualifying_prefixes:
new_s = s[len(longest_prefix):] if s.startswith(prefix):
new_string_list.append(new_s) s = s[len(prefix):]
else: changed = True
# 一旦删除一个前缀,重新开始检查,以处理可能的多个前缀
break
new_string_list.append(s) new_string_list.append(s)
return new_string_list return new_string_list

View File

@ -12,7 +12,6 @@ from flask_app.货物标.商务服务其他要求提取 import get_business_requ
# 获取采购清单 # 获取采购清单
def fetch_procurement_reqs(procurement_path, invalid_path): def fetch_procurement_reqs(procurement_path, invalid_path):
procurement_docpath = pdf2docx(procurement_path) # 采购需求docx
# 定义默认的 procurement_reqs 字典 # 定义默认的 procurement_reqs 字典
DEFAULT_PROCUREMENT_REQS = { DEFAULT_PROCUREMENT_REQS = {
"采购需求": "", "采购需求": "",
@ -27,16 +26,15 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
return DEFAULT_PROCUREMENT_REQS.copy() return DEFAULT_PROCUREMENT_REQS.copy()
try: try:
# processed_filepath = convert_pdf_to_markdown(procurement_path) # 转markdown格式 processed_filepath = convert_pdf_to_markdown(procurement_path) # 转markdown格式
processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # processed_filepath = pdf2txt(procurement_path) # 纯文本提取
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
with concurrent.futures.ThreadPoolExecutor() as executor: with concurrent.futures.ThreadPoolExecutor() as executor:
# 提交任务给线程池 # 提交任务给线程池
future_technical = executor.submit(get_technical_requirements, procurement_docpath, invalid_path, future_technical = executor.submit(get_technical_requirements, invalid_path,processed_filepath)
processed_filepath)
time.sleep(0.5) # 保持原有的延时 time.sleep(0.5) # 保持原有的延时
future_business = executor.submit(get_business_requirements, procurement_path, procurement_docpath) future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)
# 获取并行任务的结果 # 获取并行任务的结果
technical_requirements = future_technical.result() technical_requirements = future_technical.result()
@ -49,10 +47,6 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
procurement_reqs.update(business_requirements) procurement_reqs.update(business_requirements)
# 如果需要确保所有默认键存在,可以取消下面的注释
# for key, default_value in DEFAULT_PROCUREMENT_REQS.items():
# procurement_reqs.setdefault(key, default_value)
return procurement_reqs return procurement_reqs
except Exception as e: except Exception as e:
@ -67,7 +61,7 @@ if __name__ == "__main__":
start_time = time.time() start_time = time.time()
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output" output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
# file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf" # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf"
procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf" procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\6.2定版视频会议磋商文件_procurement.pdf"
procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a" procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf" invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"
res = fetch_procurement_reqs(procurement_path, invalid_path) res = fetch_procurement_reqs(procurement_path, invalid_path)