1.17 小bug
This commit is contained in:
parent
7e8f368324
commit
94b945e6b1
@ -48,7 +48,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)
|
||||
|
||||
# 根据模型类型选择调用的模型
|
||||
if model_type == 1: # 使用 doubao_model
|
||||
model_res = doubao_model(continue_query)
|
||||
# model_res = doubao_model(continue_query)
|
||||
model_res=qianwen_plus(continue_query)
|
||||
elif model_type == 2: # 使用 qianwen_long
|
||||
if file_id is None:
|
||||
raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!")
|
||||
|
@ -13,7 +13,8 @@ from dashscope import Assistants, Messages, Runs, Threads
|
||||
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
|
||||
|
||||
from flask_app.general.doubao import doubao_model
|
||||
from flask_app.general.通义千问long import qianwen_long, upload_file
|
||||
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
|
||||
|
||||
prompt = """
|
||||
# 角色
|
||||
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
||||
@ -165,11 +166,12 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
|
||||
result_queue.put((ans_index, (question, qianwen_res)))
|
||||
elif llm_type==3:
|
||||
# print(f"doubao! question:{question}")
|
||||
doubao_res=doubao_model(question,need_extra)
|
||||
if not doubao_res:
|
||||
# doubao_res=doubao_model(question,need_extra)
|
||||
qianwen_plus_res=qianwen_plus(question,need_extra)
|
||||
if not qianwen_plus_res:
|
||||
result_queue.put((ans_index, None)) # 如果为空字符串,直接返回 None
|
||||
else:
|
||||
result_queue.put((ans_index, (question, doubao_res)))
|
||||
result_queue.put((ans_index, (question, qianwen_plus_res)))
|
||||
else :
|
||||
assistant = pure_assistant()
|
||||
ans = send_message(assistant, message=question)
|
||||
|
@ -119,14 +119,14 @@ if __name__ == "__main__":
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||
pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\“平安城市”三期暨雪亮工程一体化 建设项目.pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\招标文件\output33"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)"
|
||||
# selections = [1, 4] # 仅处理 selection 4、1
|
||||
# selections = [1, 2, 3, 5]
|
||||
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
|
||||
print(files)
|
||||
# print(files[-1])
|
||||
# print(files[-2])
|
||||
|
@ -118,11 +118,9 @@ def convert_to_pdf(file_path):
|
||||
def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
||||
"""
|
||||
从指定的PDF文件中提取无效部分并保存到输出文件夹中。
|
||||
begin_pattern从前往后匹配 end_pattern从后往前
|
||||
begin_pattern匹配开头 end_pattern匹配结尾
|
||||
页数小于100不进行begin_pattern,默认start_page=0
|
||||
页数大于200时end_pattern从前往后匹配
|
||||
|
||||
|
||||
Args:
|
||||
file_path (str): 输入的PDF文件路径。
|
||||
output_folder (str): 提取后文件的输出文件夹路径。
|
||||
@ -134,29 +132,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
||||
# 定义开始模式
|
||||
begin_patterns = [
|
||||
regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
|
||||
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
||||
regex.MULTILINE
|
||||
),
|
||||
regex.compile(
|
||||
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
|
||||
regex.MULTILINE
|
||||
)
|
||||
]
|
||||
|
||||
# 定义结束模式
|
||||
end_patterns = [
|
||||
regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|'
|
||||
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
|
||||
regex.MULTILINE
|
||||
),
|
||||
regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*',
|
||||
regex.MULTILINE
|
||||
),
|
||||
regex.compile(
|
||||
r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
|
||||
regex.MULTILINE
|
||||
)
|
||||
),
|
||||
]
|
||||
|
||||
# 定义排除模式
|
||||
@ -195,10 +189,14 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
||||
# 定义查找结束页的函数
|
||||
def find_end_page():
|
||||
if total_pages > 200:
|
||||
# print("总页数大于200页,结束页从前往后查找,跳过前30页。") #防止匹配到目录这块
|
||||
# print("总页数大于200页,结束页从前往后查找,跳过前30页。") #且先匹配合同
|
||||
for pattern in end_patterns:
|
||||
for i in range(30, total_pages):
|
||||
text = page_texts[i]
|
||||
exclusion_pattern = regex.compile(
|
||||
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
|
||||
if regex.search(exclusion_pattern,text):
|
||||
continue
|
||||
if regex.search(pattern, text):
|
||||
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
||||
return i
|
||||
@ -232,9 +230,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
||||
future_end = executor.submit(find_end_page)
|
||||
start_page = future_start.result()
|
||||
end_page = future_end.result()
|
||||
|
||||
# print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
|
||||
|
||||
# 验证页码范围
|
||||
if start_page > end_page:
|
||||
print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
|
||||
@ -356,6 +352,7 @@ def generate_end_pattern(extraction_stage, chapter_type=None):
|
||||
r'|' + common_patterns
|
||||
)
|
||||
|
||||
|
||||
end_pattern_template_third = (
|
||||
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()()/]+\s*$'
|
||||
r'|' + common_patterns
|
||||
@ -477,10 +474,17 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||
|
||||
print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}")
|
||||
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
|
||||
start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
|
||||
|
||||
if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
|
||||
if end_page1==start_page2:
|
||||
mid_page=end_page1
|
||||
return perform_extraction_and_save(start_page1, mid_page, end_page2)
|
||||
else:
|
||||
path1=save_extracted_pages(pdf_path,output_folder,start_page1,end_page1,"tobidders_notice_part1",common_header)
|
||||
path2=save_extracted_pages(pdf_path,output_folder,start_page2,end_page2,"tobidders_notice_part2",common_header)
|
||||
return path1,path2
|
||||
|
||||
if start_page is not None and mid_page is not None and end_page is not None:
|
||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||
print("第二次提取tobidders_notice失败,尝试第三次提取!")
|
||||
new_begin_pattern = regex.compile(
|
||||
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
|
||||
@ -503,9 +507,10 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
|
||||
regex.MULTILINE
|
||||
)
|
||||
end_pattern = regex.compile(
|
||||
second_begin_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
|
||||
)
|
||||
end_pattern = second_begin_pattern
|
||||
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
|
||||
|
||||
exclusion_pattern = regex.compile(
|
||||
@ -515,10 +520,9 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
|
||||
exclusion_pattern, output_suffix)
|
||||
if start_page1 is None or end_page1 is None:
|
||||
return None, None, None
|
||||
return None, None, None,None
|
||||
|
||||
# 提取第二部分
|
||||
start_page2 = end_page1
|
||||
|
||||
# 检查end_page1页面的内容
|
||||
text = pdf_document.pages[end_page1].extract_text() or ""
|
||||
@ -531,19 +535,20 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
||||
# 检查是否包含排除关键词
|
||||
if any(word in chapter_title for word in exclusion_words):
|
||||
# 如果包含排除关键词,直接返回相同的路径
|
||||
return start_page1, end_page1, end_page1
|
||||
second_begin_pattern=begin_pattern #投标人须知前附表和投标人须知正文不是紧挨着的!eg:(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商
|
||||
|
||||
# 如果不包含排除关键词,继续提取第二部分
|
||||
output_suffix='tobidders_notice2'
|
||||
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
||||
begin_page=end_page1-1 #传入start_page2-1是因为非notice截取->start_page>begin_page
|
||||
start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header,
|
||||
exclusion_pattern, output_suffix)
|
||||
|
||||
if end_page2 is None:
|
||||
return start_page1, end_page1, None
|
||||
|
||||
return start_page1, end_page1, end_page2
|
||||
if start_page2 is None or end_page2 is None:
|
||||
return start_page1, end_page1, end_page1,end_page1
|
||||
|
||||
return start_page1, end_page1, start_page2,end_page2
|
||||
|
||||
#return start_page1, end_page1, end_page1
|
||||
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
|
||||
begin_pattern = regex.compile(
|
||||
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
||||
|
@ -651,13 +651,13 @@ if __name__ == '__main__':
|
||||
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
|
||||
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
|
||||
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
|
||||
pdf_path = r'C:\Users\Administrator\Desktop\货物\test5\磋商采购文件-恩施市森林火灾风险普查样品检测服务.pdf'
|
||||
pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件(N510113202400010820250102001).pdf'
|
||||
|
||||
output_dir = r"C:\Users\Administrator\Desktop\货物\test5"
|
||||
output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
|
||||
# invalid_added = insert_mark(pdf_path)
|
||||
# invalid_added_docx = pdf2docx(invalid_added)
|
||||
invalid_added_docx=r'C:\Users\Administrator\Downloads\invalid_added.docx'
|
||||
# # invalid_added_docx = pdf2docx(invalid_added)
|
||||
invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
|
||||
results = combine_find_invalid(invalid_added_docx, output_dir)
|
||||
end_time = time.time()
|
||||
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
||||
# print("Elapsed time:", str(end_time - start_time))
|
||||
# # print("Elapsed time:", str(end_time - start_time))
|
@ -240,7 +240,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
||||
elif selection == 5:
|
||||
begin_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求)).*|'
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|(招标|项目)(?:内容|要求|需求)).*|'
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
|
||||
)
|
||||
end_pattern = regex.compile(
|
||||
@ -315,12 +315,11 @@ if __name__ == "__main__":
|
||||
logger = get_global_logger("123")
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
|
||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output4444"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
|
||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||
print(generated_files)
|
||||
|
@ -10,7 +10,7 @@ from flask_app.general.model_continue_query import continue_answer, process_cont
|
||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.通义千问long import qianwen_long, upload_file
|
||||
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
|
||||
from flask_app.general.json_utils import clean_json_string, combine_json_results
|
||||
from flask_app.general.doubao import doubao_model, generate_full_user_query, pdf2txt, read_txt_to_string
|
||||
from flask_app.货物标.技术参数要求提取后处理函数 import all_postprocess
|
||||
@ -488,7 +488,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
{}
|
||||
"""
|
||||
judge_query = first_query_template.format(f"文件内容:{full_text}")
|
||||
judge_res = doubao_model(judge_query)
|
||||
# judge_res = doubao_model(judge_query)
|
||||
judge_res=qianwen_plus(judge_query)
|
||||
if '否' in judge_res or model_type == 2:
|
||||
model_type = 2 # 使用qianwen-long+invalid_path
|
||||
print("processed_filepath中无采购需求!调用invalid_path")
|
||||
@ -500,7 +501,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
print(model_res)
|
||||
else:
|
||||
user_query = generate_prompt(judge_res, full_text)
|
||||
model_res = doubao_model(user_query)
|
||||
# model_res = doubao_model(user_query)
|
||||
model_res=qianwen_plus(user_query)
|
||||
print(model_res)
|
||||
|
||||
cleaned_res = clean_json_string(model_res) #转字典
|
||||
@ -611,7 +613,7 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
if model_type==1:
|
||||
results = multi_threading(queries, "", "", 3,True) # 豆包
|
||||
else:
|
||||
results = multi_threading(queries, "", file_id, 2,True) # 豆包
|
||||
results = multi_threading(queries, "", file_id, 2,True) # qianwen-long
|
||||
temp_final={}
|
||||
if not results:
|
||||
print("errror!未获得大模型的回答!")
|
||||
|
Loading…
x
Reference in New Issue
Block a user