1.17 小bug
This commit is contained in:
parent
7e8f368324
commit
94b945e6b1
@ -48,7 +48,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)
|
|||||||
|
|
||||||
# 根据模型类型选择调用的模型
|
# 根据模型类型选择调用的模型
|
||||||
if model_type == 1: # 使用 doubao_model
|
if model_type == 1: # 使用 doubao_model
|
||||||
model_res = doubao_model(continue_query)
|
# model_res = doubao_model(continue_query)
|
||||||
|
model_res=qianwen_plus(continue_query)
|
||||||
elif model_type == 2: # 使用 qianwen_long
|
elif model_type == 2: # 使用 qianwen_long
|
||||||
if file_id is None:
|
if file_id is None:
|
||||||
raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!")
|
raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!")
|
||||||
|
@ -13,7 +13,8 @@ from dashscope import Assistants, Messages, Runs, Threads
|
|||||||
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
|
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
|
||||||
|
|
||||||
from flask_app.general.doubao import doubao_model
|
from flask_app.general.doubao import doubao_model
|
||||||
from flask_app.general.通义千问long import qianwen_long, upload_file
|
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
|
||||||
|
|
||||||
prompt = """
|
prompt = """
|
||||||
# 角色
|
# 角色
|
||||||
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
||||||
@ -165,11 +166,12 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
|
|||||||
result_queue.put((ans_index, (question, qianwen_res)))
|
result_queue.put((ans_index, (question, qianwen_res)))
|
||||||
elif llm_type==3:
|
elif llm_type==3:
|
||||||
# print(f"doubao! question:{question}")
|
# print(f"doubao! question:{question}")
|
||||||
doubao_res=doubao_model(question,need_extra)
|
# doubao_res=doubao_model(question,need_extra)
|
||||||
if not doubao_res:
|
qianwen_plus_res=qianwen_plus(question,need_extra)
|
||||||
|
if not qianwen_plus_res:
|
||||||
result_queue.put((ans_index, None)) # 如果为空字符串,直接返回 None
|
result_queue.put((ans_index, None)) # 如果为空字符串,直接返回 None
|
||||||
else:
|
else:
|
||||||
result_queue.put((ans_index, (question, doubao_res)))
|
result_queue.put((ans_index, (question, qianwen_plus_res)))
|
||||||
else :
|
else :
|
||||||
assistant = pure_assistant()
|
assistant = pure_assistant()
|
||||||
ans = send_message(assistant, message=question)
|
ans = send_message(assistant, message=question)
|
||||||
|
@ -119,14 +119,14 @@ if __name__ == "__main__":
|
|||||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
|
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
||||||
# pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
# pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||||
pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
|
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\“平安城市”三期暨雪亮工程一体化 建设项目.pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||||
output_folder = r"C:\Users\Administrator\Desktop\招标文件\output33"
|
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)"
|
||||||
# selections = [1, 4] # 仅处理 selection 4、1
|
# selections = [1, 4] # 仅处理 selection 4、1
|
||||||
# selections = [1, 2, 3, 5]
|
# selections = [1, 2, 3, 5]
|
||||||
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering
|
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering
|
||||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
|
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
|
||||||
print(files)
|
print(files)
|
||||||
# print(files[-1])
|
# print(files[-1])
|
||||||
# print(files[-2])
|
# print(files[-2])
|
||||||
|
@ -118,11 +118,9 @@ def convert_to_pdf(file_path):
|
|||||||
def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
||||||
"""
|
"""
|
||||||
从指定的PDF文件中提取无效部分并保存到输出文件夹中。
|
从指定的PDF文件中提取无效部分并保存到输出文件夹中。
|
||||||
begin_pattern从前往后匹配 end_pattern从后往前
|
begin_pattern匹配开头 end_pattern匹配结尾
|
||||||
页数小于100不进行begin_pattern,默认start_page=0
|
页数小于100不进行begin_pattern,默认start_page=0
|
||||||
页数大于200时end_pattern从前往后匹配
|
页数大于200时end_pattern从前往后匹配
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path (str): 输入的PDF文件路径。
|
file_path (str): 输入的PDF文件路径。
|
||||||
output_folder (str): 提取后文件的输出文件夹路径。
|
output_folder (str): 提取后文件的输出文件夹路径。
|
||||||
@ -134,29 +132,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
# 定义开始模式
|
# 定义开始模式
|
||||||
begin_patterns = [
|
begin_patterns = [
|
||||||
regex.compile(
|
regex.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
|
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
),
|
),
|
||||||
regex.compile(
|
regex.compile(
|
||||||
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
# 定义结束模式
|
# 定义结束模式
|
||||||
end_patterns = [
|
end_patterns = [
|
||||||
regex.compile(
|
regex.compile(
|
||||||
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|'
|
||||||
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
|
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
),
|
),
|
||||||
regex.compile(
|
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*',
|
|
||||||
regex.MULTILINE
|
|
||||||
),
|
|
||||||
regex.compile(
|
regex.compile(
|
||||||
r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
|
r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
# 定义排除模式
|
# 定义排除模式
|
||||||
@ -195,10 +189,14 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
# 定义查找结束页的函数
|
# 定义查找结束页的函数
|
||||||
def find_end_page():
|
def find_end_page():
|
||||||
if total_pages > 200:
|
if total_pages > 200:
|
||||||
# print("总页数大于200页,结束页从前往后查找,跳过前30页。") #防止匹配到目录这块
|
# print("总页数大于200页,结束页从前往后查找,跳过前30页。") #且先匹配合同
|
||||||
for pattern in end_patterns:
|
for pattern in end_patterns:
|
||||||
for i in range(30, total_pages):
|
for i in range(30, total_pages):
|
||||||
text = page_texts[i]
|
text = page_texts[i]
|
||||||
|
exclusion_pattern = regex.compile(
|
||||||
|
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
|
||||||
|
if regex.search(exclusion_pattern,text):
|
||||||
|
continue
|
||||||
if regex.search(pattern, text):
|
if regex.search(pattern, text):
|
||||||
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
||||||
return i
|
return i
|
||||||
@ -232,9 +230,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
future_end = executor.submit(find_end_page)
|
future_end = executor.submit(find_end_page)
|
||||||
start_page = future_start.result()
|
start_page = future_start.result()
|
||||||
end_page = future_end.result()
|
end_page = future_end.result()
|
||||||
|
|
||||||
# print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
|
# print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
|
||||||
|
|
||||||
# 验证页码范围
|
# 验证页码范围
|
||||||
if start_page > end_page:
|
if start_page > end_page:
|
||||||
print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
|
print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
|
||||||
@ -356,6 +352,7 @@ def generate_end_pattern(extraction_stage, chapter_type=None):
|
|||||||
r'|' + common_patterns
|
r'|' + common_patterns
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
end_pattern_template_third = (
|
end_pattern_template_third = (
|
||||||
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()()/]+\s*$'
|
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()()/]+\s*$'
|
||||||
r'|' + common_patterns
|
r'|' + common_patterns
|
||||||
@ -477,10 +474,17 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
|||||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||||
|
|
||||||
print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}")
|
print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}")
|
||||||
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
|
start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
|
||||||
|
|
||||||
|
if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
|
||||||
|
if end_page1==start_page2:
|
||||||
|
mid_page=end_page1
|
||||||
|
return perform_extraction_and_save(start_page1, mid_page, end_page2)
|
||||||
|
else:
|
||||||
|
path1=save_extracted_pages(pdf_path,output_folder,start_page1,end_page1,"tobidders_notice_part1",common_header)
|
||||||
|
path2=save_extracted_pages(pdf_path,output_folder,start_page2,end_page2,"tobidders_notice_part2",common_header)
|
||||||
|
return path1,path2
|
||||||
|
|
||||||
if start_page is not None and mid_page is not None and end_page is not None:
|
|
||||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
|
||||||
print("第二次提取tobidders_notice失败,尝试第三次提取!")
|
print("第二次提取tobidders_notice失败,尝试第三次提取!")
|
||||||
new_begin_pattern = regex.compile(
|
new_begin_pattern = regex.compile(
|
||||||
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
|
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
|
||||||
@ -503,9 +507,10 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
|||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
end_pattern = regex.compile(
|
second_begin_pattern = regex.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
|
||||||
)
|
)
|
||||||
|
end_pattern = second_begin_pattern
|
||||||
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
|
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
|
||||||
|
|
||||||
exclusion_pattern = regex.compile(
|
exclusion_pattern = regex.compile(
|
||||||
@ -515,10 +520,9 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
|||||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
|
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
|
||||||
exclusion_pattern, output_suffix)
|
exclusion_pattern, output_suffix)
|
||||||
if start_page1 is None or end_page1 is None:
|
if start_page1 is None or end_page1 is None:
|
||||||
return None, None, None
|
return None, None, None,None
|
||||||
|
|
||||||
# 提取第二部分
|
# 提取第二部分
|
||||||
start_page2 = end_page1
|
|
||||||
|
|
||||||
# 检查end_page1页面的内容
|
# 检查end_page1页面的内容
|
||||||
text = pdf_document.pages[end_page1].extract_text() or ""
|
text = pdf_document.pages[end_page1].extract_text() or ""
|
||||||
@ -531,19 +535,20 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
|||||||
# 检查是否包含排除关键词
|
# 检查是否包含排除关键词
|
||||||
if any(word in chapter_title for word in exclusion_words):
|
if any(word in chapter_title for word in exclusion_words):
|
||||||
# 如果包含排除关键词,直接返回相同的路径
|
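# If the chapter title contains an exclusion keyword, do not return early: the second part does not start on this page, so re-locate its start with begin_pattern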
|
||||||
return start_page1, end_page1, end_page1
|
second_begin_pattern=begin_pattern #投标人须知前附表和投标人须知正文不是紧挨着的!eg:(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商
|
||||||
|
|
||||||
# 如果不包含排除关键词,继续提取第二部分
|
# 如果不包含排除关键词,继续提取第二部分
|
||||||
output_suffix='tobidders_notice2'
|
output_suffix='tobidders_notice2'
|
||||||
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
begin_page=end_page1-1 # pass end_page1-1 because non-notice extraction requires start_page > begin_page
|
||||||
|
start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header,
|
||||||
exclusion_pattern, output_suffix)
|
exclusion_pattern, output_suffix)
|
||||||
|
|
||||||
if end_page2 is None:
|
if start_page2 is None or end_page2 is None:
|
||||||
return start_page1, end_page1, None
|
return start_page1, end_page1, end_page1,end_page1
|
||||||
|
|
||||||
return start_page1, end_page1, end_page2
|
|
||||||
|
|
||||||
|
return start_page1, end_page1, start_page2,end_page2
|
||||||
|
|
||||||
|
#return start_page1, end_page1, end_page1
|
||||||
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
|
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
||||||
|
@ -651,13 +651,13 @@ if __name__ == '__main__':
|
|||||||
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
|
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
|
||||||
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
|
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
|
||||||
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
|
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
|
||||||
pdf_path = r'C:\Users\Administrator\Desktop\货物\test5\磋商采购文件-恩施市森林火灾风险普查样品检测服务.pdf'
|
pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件(N510113202400010820250102001).pdf'
|
||||||
|
|
||||||
output_dir = r"C:\Users\Administrator\Desktop\货物\test5"
|
output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
|
||||||
# invalid_added = insert_mark(pdf_path)
|
# invalid_added = insert_mark(pdf_path)
|
||||||
# invalid_added_docx = pdf2docx(invalid_added)
|
# # invalid_added_docx = pdf2docx(invalid_added)
|
||||||
invalid_added_docx=r'C:\Users\Administrator\Downloads\invalid_added.docx'
|
invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
|
||||||
results = combine_find_invalid(invalid_added_docx, output_dir)
|
results = combine_find_invalid(invalid_added_docx, output_dir)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
||||||
# print("Elapsed time:", str(end_time - start_time))
|
# # print("Elapsed time:", str(end_time - start_time))
|
@ -240,7 +240,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
|||||||
elif selection == 5:
|
elif selection == 5:
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求)).*|'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|(招标|项目)(?:内容|要求|需求)).*|'
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
|
||||||
)
|
)
|
||||||
end_pattern = regex.compile(
|
end_pattern = regex.compile(
|
||||||
@ -315,12 +315,11 @@ if __name__ == "__main__":
|
|||||||
logger = get_global_logger("123")
|
logger = get_global_logger("123")
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||||
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
|
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||||
output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output4444"
|
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
|
||||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||||
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||||
print(generated_files)
|
print(generated_files)
|
||||||
|
|
@ -10,7 +10,7 @@ from flask_app.general.model_continue_query import continue_answer, process_cont
|
|||||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||||
from flask_app.general.format_change import pdf2docx
|
from flask_app.general.format_change import pdf2docx
|
||||||
from flask_app.general.多线程提问 import multi_threading
|
from flask_app.general.多线程提问 import multi_threading
|
||||||
from flask_app.general.通义千问long import qianwen_long, upload_file
|
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
|
||||||
from flask_app.general.json_utils import clean_json_string, combine_json_results
|
from flask_app.general.json_utils import clean_json_string, combine_json_results
|
||||||
from flask_app.general.doubao import doubao_model, generate_full_user_query, pdf2txt, read_txt_to_string
|
from flask_app.general.doubao import doubao_model, generate_full_user_query, pdf2txt, read_txt_to_string
|
||||||
from flask_app.货物标.技术参数要求提取后处理函数 import all_postprocess
|
from flask_app.货物标.技术参数要求提取后处理函数 import all_postprocess
|
||||||
@ -488,7 +488,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
|||||||
{}
|
{}
|
||||||
"""
|
"""
|
||||||
judge_query = first_query_template.format(f"文件内容:{full_text}")
|
judge_query = first_query_template.format(f"文件内容:{full_text}")
|
||||||
judge_res = doubao_model(judge_query)
|
# judge_res = doubao_model(judge_query)
|
||||||
|
judge_res=qianwen_plus(judge_query)
|
||||||
if '否' in judge_res or model_type == 2:
|
if '否' in judge_res or model_type == 2:
|
||||||
model_type = 2 # 使用qianwen-long+invalid_path
|
model_type = 2 # 使用qianwen-long+invalid_path
|
||||||
print("processed_filepath中无采购需求!调用invalid_path")
|
print("processed_filepath中无采购需求!调用invalid_path")
|
||||||
@ -500,7 +501,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
|||||||
print(model_res)
|
print(model_res)
|
||||||
else:
|
else:
|
||||||
user_query = generate_prompt(judge_res, full_text)
|
user_query = generate_prompt(judge_res, full_text)
|
||||||
model_res = doubao_model(user_query)
|
# model_res = doubao_model(user_query)
|
||||||
|
model_res=qianwen_plus(user_query)
|
||||||
print(model_res)
|
print(model_res)
|
||||||
|
|
||||||
cleaned_res = clean_json_string(model_res) #转字典
|
cleaned_res = clean_json_string(model_res) #转字典
|
||||||
@ -611,7 +613,7 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
|||||||
if model_type==1:
|
if model_type==1:
|
||||||
results = multi_threading(queries, "", "", 3,True) # 豆包
|
results = multi_threading(queries, "", "", 3,True) # 豆包
|
||||||
else:
|
else:
|
||||||
results = multi_threading(queries, "", file_id, 2,True) # 豆包
|
results = multi_threading(queries, "", file_id, 2,True) # qianwen-long
|
||||||
temp_final={}
|
temp_final={}
|
||||||
if not results:
|
if not results:
|
||||||
print("errror!未获得大模型的回答!")
|
print("errror!未获得大模型的回答!")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user