1.17 小bug

This commit is contained in:
zy123 2025-01-17 15:45:53 +08:00
parent 7e8f368324
commit 94b945e6b1
7 changed files with 56 additions and 47 deletions

View File

@@ -48,7 +48,8 @@ def continue_answer(original_query, original_answer, model_type=1, file_id=None)
# 根据模型类型选择调用的模型
if model_type == 1: # 使用 doubao_model
model_res = doubao_model(continue_query)
# model_res = doubao_model(continue_query)
model_res=qianwen_plus(continue_query)
elif model_type == 2: # 使用 qianwen_long
if file_id is None:
raise ValueError("file_id 必须在使用 qianwen_long 模型时提供!")

View File

@@ -13,7 +13,8 @@ from dashscope import Assistants, Messages, Runs, Threads
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
from flask_app.general.doubao import doubao_model
from flask_app.general.通义千问long import qianwen_long, upload_file
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
@@ -165,11 +166,12 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
result_queue.put((ans_index, (question, qianwen_res)))
elif llm_type==3:
# print(f"doubao! question:{question}")
doubao_res=doubao_model(question,need_extra)
if not doubao_res:
# doubao_res=doubao_model(question,need_extra)
qianwen_plus_res=qianwen_plus(question,need_extra)
if not qianwen_plus_res:
result_queue.put((ans_index, None)) # 如果为空字符串,直接返回 None
else:
result_queue.put((ans_index, (question, doubao_res)))
result_queue.put((ans_index, (question, qianwen_plus_res)))
else :
assistant = pure_assistant()
ans = send_message(assistant, message=question)

View File

@@ -119,14 +119,14 @@ if __name__ == "__main__":
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
# pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\“平安城市”三期暨雪亮工程一体化 建设项目.pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
output_folder = r"C:\Users\Administrator\Desktop\招标文件\output33"
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)"
# selections = [1, 4] # 仅处理 selection 4、1
# selections = [1, 2, 3, 5]
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections) #engineering
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
print(files)
# print(files[-1])
# print(files[-2])

View File

@@ -118,11 +118,9 @@ def convert_to_pdf(file_path):
def get_invalid_file(file_path, output_folder, common_header, begin_page):
"""
从指定的PDF文件中提取无效部分并保存到输出文件夹中
begin_pattern从前往后匹配 end_pattern从后往前
begin_pattern匹配开头 end_pattern匹配结尾
页数小于100不进行begin_pattern默认start_page=0
页数大于200时end_pattern从前往后匹配
Args:
file_path (str): 输入的PDF文件路径
output_folder (str): 提取后文件的输出文件夹路径
@@ -134,29 +132,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
# 定义开始模式
begin_patterns = [
regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',
regex.MULTILINE
),
regex.compile(
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
regex.MULTILINE
)
]
# 定义结束模式
end_patterns = [
regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|'
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[:]清标报告',
regex.MULTILINE
),
regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*',
regex.MULTILINE
),
regex.compile(
r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
regex.MULTILINE
)
),
]
# 定义排除模式
@@ -195,10 +189,14 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
# 定义查找结束页的函数
def find_end_page():
if total_pages > 200:
# print("总页数大于200页结束页从前往后查找跳过前30页。") #防止匹配到目录这块
# print("总页数大于200页结束页从前往后查找跳过前30页。") #且先匹配合同
for pattern in end_patterns:
for i in range(30, total_pages):
text = page_texts[i]
exclusion_pattern = regex.compile(
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
if regex.search(exclusion_pattern,text):
continue
if regex.search(pattern, text):
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
return i
@@ -232,9 +230,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
future_end = executor.submit(find_end_page)
start_page = future_start.result()
end_page = future_end.result()
# print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
# 验证页码范围
if start_page > end_page:
print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
@@ -356,6 +352,7 @@ def generate_end_pattern(extraction_stage, chapter_type=None):
r'|' + common_patterns
)
end_pattern_template_third = (
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()/]+\s*$'
r'|' + common_patterns
@@ -477,10 +474,17 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
return perform_extraction_and_save(start_page, mid_page, end_page)
print(f"第一次提取tobidders_notice失败尝试第二次提取: {pdf_path}")
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
if end_page1==start_page2:
mid_page=end_page1
return perform_extraction_and_save(start_page1, mid_page, end_page2)
else:
path1=save_extracted_pages(pdf_path,output_folder,start_page1,end_page1,"tobidders_notice_part1",common_header)
path2=save_extracted_pages(pdf_path,output_folder,start_page2,end_page2,"tobidders_notice_part2",common_header)
return path1,path2
if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
print("第二次提取tobidders_notice失败尝试第三次提取!")
new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
@@ -503,9 +507,10 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
regex.MULTILINE
)
end_pattern = regex.compile(
second_begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
)
end_pattern = second_begin_pattern
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
exclusion_pattern = regex.compile(
@@ -515,10 +520,9 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
exclusion_pattern, output_suffix)
if start_page1 is None or end_page1 is None:
return None, None, None
return None, None, None,None
# 提取第二部分
start_page2 = end_page1
# 检查end_page1页面的内容
text = pdf_document.pages[end_page1].extract_text() or ""
@@ -531,19 +535,20 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
# 检查是否包含排除关键词
if any(word in chapter_title for word in exclusion_words):
# 如果包含排除关键词,直接返回相同的路径
return start_page1, end_page1, end_page1
second_begin_pattern=begin_pattern #投标人须知前附表和投标人须知正文不是紧挨着的eg:(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商
# 如果不包含排除关键词,继续提取第二部分
output_suffix='tobidders_notice2'
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
begin_page=end_page1-1 #传入start_page2-1是因为非notice截取->start_page>begin_page
start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header,
exclusion_pattern, output_suffix)
if end_page2 is None:
return start_page1, end_page1, None
return start_page1, end_page1, end_page2
if start_page2 is None or end_page2 is None:
return start_page1, end_page1, end_page1,end_page1
return start_page1, end_page1, start_page2,end_page2
#return start_page1, end_page1, end_page1
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
begin_pattern = regex.compile(
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',

View File

@@ -651,13 +651,13 @@ if __name__ == '__main__':
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
pdf_path = r'C:\Users\Administrator\Desktop\货物\test5\磋商采购文件-恩施市森林火灾风险普查样品检测服务.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件N510113202400010820250102001.pdf'
output_dir = r"C:\Users\Administrator\Desktop\货物\test5"
output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
# invalid_added = insert_mark(pdf_path)
# invalid_added_docx = pdf2docx(invalid_added)
invalid_added_docx=r'C:\Users\Administrator\Downloads\invalid_added.docx'
# # invalid_added_docx = pdf2docx(invalid_added)
invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
results = combine_find_invalid(invalid_added_docx, output_dir)
end_time = time.time()
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
# print("Elapsed time:", str(end_time - start_time))
# # print("Elapsed time:", str(end_time - start_time))

View File

@@ -240,7 +240,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
elif selection == 5:
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求)).*|'
r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|(招标|项目)(?:内容|要求|需求)).*|'
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
)
end_pattern = regex.compile(
@@ -315,12 +315,11 @@ if __name__ == "__main__":
logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output4444"
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
print(generated_files)

View File

@@ -10,7 +10,7 @@ from flask_app.general.model_continue_query import continue_answer, process_cont
from flask_app.general.file2markdown import convert_file_to_markdown
from flask_app.general.format_change import pdf2docx
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import qianwen_long, upload_file
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
from flask_app.general.json_utils import clean_json_string, combine_json_results
from flask_app.general.doubao import doubao_model, generate_full_user_query, pdf2txt, read_txt_to_string
from flask_app.货物标.技术参数要求提取后处理函数 import all_postprocess
@@ -488,7 +488,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
{}
"""
judge_query = first_query_template.format(f"文件内容:{full_text}")
judge_res = doubao_model(judge_query)
# judge_res = doubao_model(judge_query)
judge_res=qianwen_plus(judge_query)
if '' in judge_res or model_type == 2:
model_type = 2 # 使用qianwen-long+invalid_path
print("processed_filepath中无采购需求!调用invalid_path")
@@ -500,7 +501,8 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
print(model_res)
else:
user_query = generate_prompt(judge_res, full_text)
model_res = doubao_model(user_query)
# model_res = doubao_model(user_query)
model_res=qianwen_plus(user_query)
print(model_res)
cleaned_res = clean_json_string(model_res) #转字典
@@ -611,7 +613,7 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
if model_type==1:
results = multi_threading(queries, "", "", 3,True) # 豆包
else:
results = multi_threading(queries, "", file_id, 2,True) # 豆包
results = multi_threading(queries, "", file_id, 2,True) # qianwen-long
temp_final={}
if not results:
print("errror!未获得大模型的回答!")