11.5货物标截取优化

This commit is contained in:
zy123 2024-11-05 16:29:32 +08:00
parent d0e7f060c8
commit ccab078ac3
5 changed files with 208 additions and 94 deletions

View File

@ -184,9 +184,9 @@ if __name__ == '__main__':
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\6.2定版视频会议磋商文件.doc"
# downloaded_file=doc2docx(local_path_in)
downloaded_file=doc2docx(local_path_in)
# downloaded_file=pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in)
# downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)

View File

@ -95,7 +95,7 @@ def extract_text_by_page(file_path):
if __name__ == '__main__':
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'

View File

@ -16,8 +16,6 @@ def get_global_logger(unique_id):
logger = None
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header):
try:
# 获取文件基本名称
@ -238,7 +236,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])'
end_pattern = re.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*$|'
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知',
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知',
re.MULTILINE
)
start_page = None

View File

@ -110,12 +110,14 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
if output_suffix == "tobidders_notice":
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
start_page, mid_page, end_page = extract_pages_tobidders_notice(
pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
)
if start_page is None or end_page is None or mid_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header)
if not start_page or not mid_page or not end_page:
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
return "",""
# if start_page is None or end_page is None or mid_page is None:
# print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
# return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder, "tobidders_notice_part1")
path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder, "tobidders_notice_part2")
@ -138,7 +140,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
if start_page is None or end_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page)
elif output_suffix == "qualification1":
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
@ -175,28 +177,33 @@ def get_patterns_for_evaluation_method():
return begin_pattern, end_pattern
# def get_patterns_for_qualification():
# # # 原始匹配逻辑
# # begin_pattern_original = re.compile(
# # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
# # end_pattern_original = re.compile(
# # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
#
# # 新匹配逻辑
# begin_pattern_new = re.compile(
# r'^资格性检查', re.MULTILINE)
# end_pattern_new = re.compile(
# r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
#
# return begin_pattern_new, end_pattern_new
def get_patterns_for_qualification():
    """Return (begin_pattern, end_pattern) regexes locating the '资格性检查' section.

    The start is a line beginning with '资格性检查'; the section ends at the
    first attachment heading ('附件 N') or the next chapter/part heading
    ('第X章/部分 <title>').

    Returns:
        tuple[re.Pattern, re.Pattern]: compiled MULTILINE patterns
        (section start, section end).
    """
    # Start of the qualification-check section: the heading line itself.
    begin_pattern_new = re.compile(
        r'^资格性检查', re.MULTILINE)
    # End at an attachment heading or the next chapter/part heading.
    end_pattern_new = re.compile(
        r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
    return begin_pattern_new, end_pattern_new
def get_patterns_for_qualification2():
    """Return (begin_pattern, end_pattern) regexes for qualification-review tables.

    The begin pattern matches attachment/appendix headings (colon now optional,
    so e.g. '附录3' matches) as well as explicit review-table headings
    ('资格性检查', '资格审查表', '符合性审查表'). The end pattern matches the
    next attachment heading that is NOT about qualification/conformity, or a
    loosely-formatted chapter/part heading.

    Returns:
        tuple[re.Pattern, re.Pattern]: compiled MULTILINE patterns.
    """
    # NOTE: the rendered diff left both the pre- and post-change regex literals
    # in place (two positional string args to re.compile); this keeps only the
    # post-change ("new side") patterns.
    begin_pattern = re.compile(
        r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?|资格性检查|资格审查表|符合性审查表)',  # colon optional so bare '附录3' matches
        re.MULTILINE
    )
    end_pattern = re.compile(
        r'^(?!.*(?:资格|符合))(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$|'
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',  # looser chapter-heading match
        re.MULTILINE
    )
    return begin_pattern, end_pattern
@ -239,7 +246,7 @@ def get_patterns_for_notice_twice():
# break
# return start_page, mid_page, end_page
def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern):
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
"""
从PDF文档中提取起始页中间页和结束页
@ -274,7 +281,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
end_page = None
chapter_type = None # 用于存储“章”或“部分”
combined_mid_pattern = None # 中间页的组合模式
pdf_document = PdfReader(pdf_path)
for i, page in enumerate(pdf_document.pages):
text = page.extract_text() or ""
cleaned_text = clean_page_content(text, common_header)
@ -345,8 +352,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
)
else:
# 如果提供了固定的 end_pattern则使用默认的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则)\s*$' #可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
combined_mid_pattern = re.compile(
rf'{base_mid_pattern}',
re.MULTILINE
@ -355,6 +361,8 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
# 识别中间页
if start_page is not None and mid_page is None and combined_mid_pattern:
if (start_page+1==i) and re.search(local_begin_pattern,cleaned_text):
continue
if re.search(combined_mid_pattern, cleaned_text):
mid_page = i
@ -374,23 +382,83 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
if not (start_page and mid_page and end_page):
# 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = re.compile(
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
re.MULTILINE
)
new_end_pattern = re.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$',
re.MULTILINE
)
# 第二次提取尝试,使用新的模式
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
print(f"第二次尝试 tobidders_notice!{pdf_path}")
pdf_document = PdfReader(pdf_path)
start_page,mid_page,end_page=extract_pages_twice_tobidders_notice(pdf_document,common_header,begin_page)
if start_page and end_page and mid_page:
return start_page, mid_page, end_page
else:
# 定义新的 begin_pattern 和 end_pattern
new_begin_pattern = re.compile(
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
re.MULTILINE
)
new_end_pattern = re.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$',
re.MULTILINE
)
print("第三次尝试 tobidders_notice! ")
# 第二次提取尝试,使用新的模式
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
return start_page, mid_page, end_page
def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header):
#投标人须知分为两个章节
# def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page):
# begin_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
# )
# end_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分
# )
# exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
#
# pdf_document = PdfReader(pdf_path)
# exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
#
# # 提取第一部分
# start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
# if start_page1 is None or end_page1 is None:
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
# return "", ""
#
# # 保存第一部分的路径
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
# "tobidders_notice_part1")
#
# # 提取第二部分
# start_page2 = end_page1
#
# # 检查end_page1页面的内容
# text = pdf_document.pages[end_page1].extract_text() or ""
# cleaned_text = clean_page_content(text, common_header)
# match = end_pattern.search(cleaned_text)
#
# if match:
# # 获取匹配到的中文部分
# chapter_title = match.group(1)
# # 检查是否包含排除关键词
# if any(word in chapter_title for word in exclusion_words):
# # 如果包含排除关键词,直接返回相同的路径
# return path1, path1
#
# # 如果不包含排除关键词,继续提取第二部分
# _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
# exclusion_pattern)
#
# if end_page2 is None:
# print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
# return path1, path1
#
# # 保存第二部分的路径
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
# "tobidders_notice_part2")
#
# return path1, path2
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
)
@ -399,18 +467,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
)
exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
pdf_document = PdfReader(pdf_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
# 提取第一部分
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header)
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
if start_page1 is None or end_page1 is None:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return "", ""
return "", "",""
# 保存第一部分的路径
path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
"tobidders_notice_part1")
# # 保存第一部分的路径
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
# "tobidders_notice_part1")
# 提取第二部分
start_page2 = end_page1
@ -426,52 +492,71 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
# 检查是否包含排除关键词
if any(word in chapter_title for word in exclusion_words):
# 如果包含排除关键词,直接返回相同的路径
return path1, path1
return start_page1, end_page1,end_page1
# 如果不包含排除关键词,继续提取第二部分
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
exclusion_pattern)
if end_page2 is None:
print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
return path1, path1
return start_page1, end_page1,end_page1
# 保存第二部分的路径
path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
"tobidders_notice_part2")
# # 保存第二部分的路径
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
# "tobidders_notice_part2")
return path1, path2
return start_page1, end_page1,end_page2
def extract_pages_qualification(pdf_document, begin_page, begin_pattern, end_pattern, common_header):
    """Locate the start and end pages of the qualification-review section.

    Scans pages from ``begin_page`` onward. A page qualifies as the start page
    when it contains at least one inclusion keyword, none of the exclusion
    keywords, and matches ``begin_pattern``. The end page is the first LATER
    page that matches ``end_pattern``.

    Args:
        pdf_document: an open PdfReader-like object exposing ``.pages`` whose
            elements provide ``extract_text()``.
        begin_page (int): index from which to start scanning.
        begin_pattern (re.Pattern): section-start heading pattern.
        end_pattern (re.Pattern): section-end heading pattern.
        common_header (str): repeated page header stripped by clean_page_content.

    Returns:
        tuple[int | None, int | None]: (start_page, end_page); either may be
        None when not found.
    """
    start_page = None
    end_page = None
    include_keywords = ["资格审查", "资质审查", "符合性审查"]
    exclude_keywords = ["声明函", "承诺函"]
    # Only examine pages at or after begin_page (the chapter start position).
    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_content(text, common_header)
            # Start page: must contain a review keyword, none of the
            # boilerplate exclusions, and match the heading pattern.
            # (Removed a leftover debug print that fired on every candidate.)
            if (
                any(keyword in cleaned_text for keyword in include_keywords) and
                all(keyword not in cleaned_text for keyword in exclude_keywords) and
                start_page is None
            ):
                if begin_pattern.search(cleaned_text):
                    start_page = i
            # End page: first page strictly after start_page matching end_pattern.
            if start_page is not None and end_pattern.search(cleaned_text):
                if i > start_page:
                    end_page = i
                    break  # section bounded; stop scanning
    return start_page, end_page
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
try:
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
pdf_document = PdfReader(pdf_path)
patterns = None
begin_page = 0
start_page = None
end_page = None
if output_suffix == "procurement":
patterns = [get_patterns_for_procurement()]
begin_page = 5
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
patterns = [get_patterns_for_evaluation_method()]
begin_page = 5
elif output_suffix == "qualification1":
patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()]
begin_page = 5
patterns = [get_patterns_for_qualification()]
elif output_suffix == "notice":
patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
begin_page = 0
if patterns:
for pattern_pair in patterns:
# print(pattern_pair[0])
# print(pattern_pair[1])
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header,
exclusion_pattern, output_suffix)
if output_suffix=="qualification1":
start_page,end_page=extract_pages_qualification(pdf_document,begin_page,pattern_pair[0],pattern_pair[1],common_header)
else:
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header,exclusion_pattern, output_suffix)
if start_page is not None and end_page is not None:
break
@ -544,7 +629,7 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
def get_start_and_common_header(input_path):
common_header = extract_common_header(input_path)
last_begin_index = 0
begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请)[\)]?\s*$',re.MULTILINE)
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',re.MULTILINE)
pdf_document = PdfReader(input_path)
for i, page in enumerate(pdf_document.pages):
if i > 10:
@ -604,8 +689,8 @@ def process_input(input_path, output_folder, selection, output_suffix):
# 根据选择设置对应的模式和结束模式
if selection == 1:
begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE)
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE)
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
local_output_suffix = "notice"
elif selection == 2:
begin_pattern = re.compile(
@ -737,9 +822,10 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
return truncate_files
#TODO:截取还有问题
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标 唐山投标只有正文,没有附表
#ztbfile.pdf jiao通 广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_qualification2.pdf 唐山 包头
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
@ -752,6 +838,6 @@ if __name__ == "__main__":
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files)
selection = 1# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
selection = 4# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)

View File

@ -2,12 +2,12 @@
import json
import os
import re
from flask_app.general.通义千问long import upload_file
from flask_app.general.通义千问long import upload_file, qianwen_long
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.json_utils import clean_json_string
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
import copy
import concurrent.futures
# 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除重新组织成一个字典格式的数据你可以考虑用字符串列表来保持部分平级的数据
# 对于同级的键,如果数量>1且键名都统一那么将键名去掉用列表保持它们的键值
@ -379,23 +379,53 @@ def combine_qualification_review(invalid_path, output_folder, qualification_path
}
}
def process_file(file_path, invalid_path):
    """Extract qualification-review ('资格性审查') and conformity-review
    ('符合性审查') criteria from a bid document via the qianwen-long model.

    First asks a yes/no probe question against ``file_path``; for any review
    type the probe reports absent (''), the detailed query falls back to
    ``invalid_path`` (uploaded lazily, at most once). The two detailed queries
    run in parallel and their cleaned JSON results are merged.

    Args:
        file_path (str): primary document to upload and query.
        invalid_path (str): fallback document used when the probe says a
            review section is missing from the primary document.

    Returns:
        dict: merged, post-processed review criteria.
    """
    file_id = upload_file(file_path)
    first_query = """
该文档中是否有关于资格性审查标准的具体内容,是否有关于符合性审查标准的具体内容?请以json格式给出回答,外键分别为'资格性审查''符合性审查',键值仅限于'','',输出格式示例如下:
{
    "资格性审查":"",
    "符合性审查":""
}
    """
    # Probe answer: maps each review key to '是'/'否' (presence in file_path).
    qianwen_ans = clean_json_string(qianwen_long(file_id, first_query))
    user_queries = [
        {
            "key": "资格性审查",
            "query": "该招标文件中规定的资格性审查标准是怎样的请以json格式给出外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。"
        },
        {
            "key": "符合性审查",
            "query": "该招标文件中规定的符合性审查标准是怎样的请以json格式给出外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
        }
    ]
    combined_res = {}
    file_id2 = None  # invalid_path is uploaded lazily, only if a fallback is needed

    def process_single_query(query_info):
        """Run one detailed query against the appropriate file and post-process it."""
        nonlocal file_id2
        key = query_info["key"]
        query = query_info["query"]
        # Probe said this section is absent from file_path -> query the fallback.
        # NOTE(review): file_id2 assignment is not lock-protected; with two
        # workers both needing the fallback, invalid_path could be uploaded
        # twice. Harmless but worth confirming upstream.
        if qianwen_ans.get(key) == "":
            if not file_id2:
                file_id2 = upload_file(invalid_path)
            current_file_id = file_id2
        else:
            current_file_id = file_id
        ans = qianwen_long(current_file_id, query)
        cleaned_data = clean_json_string(ans)
        return process_dict(preprocess_dict(cleaned_data))

    # Run both detailed queries in parallel and merge results as they finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(process_single_query, q) for q in user_queries]
        for future in concurrent.futures.as_completed(futures):
            combined_res.update(future.result())
    return combined_res
try: