1.17 minor bug fixes
parent 20e297c3ad
commit 28ddae379a
@@ -344,8 +344,10 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
                     continue  # skip blank lines and move on to the next iteration
                 if not found_next_number:
                     # updated regex: also supports the '数字 、' (number + 、) format
-                    next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)',
-                                                   next_text)
+                    number_pattern = (r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)'        # A1  1.1  abc.123
+                                      r'|([((]\s*[一二三四五六七八九十\d]+\s*[))])'   # (1)  (2)
+                                      r'|([一二三四五六七八九十\d]+\s*、)')           # 1、  2、
+                    next_section_number = re.match(number_pattern, next_text)
                     if next_section_number:
                         found_next_number = True
                         if next_section_number.group(1):
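A quick way to sanity-check the widened pattern; this is a minimal standalone sketch, and the sample headings are invented, not taken from the project:

import re

# the widened pattern from this commit: alphanumeric headings, bracketed numbers
# (Arabic or Chinese numerals), and the '1、' / '一、' style
number_pattern = (r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)'
                  r'|([((]\s*[一二三四五六七八九十\d]+\s*[))])'
                  r'|([一二三四五六七八九十\d]+\s*、)')

samples = ['3.2 投标文件的组成', '(1)资格证明文件', '(三) 评审程序', '四、废标条款', '前言']
for text in samples:
    m = re.match(number_pattern, text)
    # report which alternative matched; None means the line is not a numbered heading
    label = None if m is None else ('group1' if m.group(1) else 'group2' if m.group(2) else 'group3')
    print(f'{text!r:30} -> {label}')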
@@ -353,9 +355,9 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
                             dynamic_pattern = r'^' + r'[..]'.join(
                                 [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                         elif next_section_number.group(2):
-                            dynamic_pattern = r'^[\(\(]\s*\d+\s*[\)\)]'
+                            dynamic_pattern = r'^[\(\(]\s*[一二三四五六七八九十\d]+\s*[\)\)]'
                         elif next_section_number.group(3):
-                            dynamic_pattern = r'^\d+\s*、'
+                            dynamic_pattern = r'^[一二三四五六七八九十\d]+\s*、'
                         current_section_pattern = re.compile(dynamic_pattern)
                         if current_section_pattern and re.match(current_section_pattern, next_text):
                             extracted_paragraphs[active_key].append(next_text)
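The per-style follow-up patterns are widened the same way; a minimal check of the two changed branches (sample headings invented):

import re

# group(2) style: bracketed numbering, now accepting Chinese numerals as well
new_bracket = r'^[\(\(]\s*[一二三四五六七八九十\d]+\s*[\)\)]'
# group(3) style: '、' numbering, now accepting Chinese numerals as well
new_dun = r'^[一二三四五六七八九十\d]+\s*、'

print(bool(re.match(new_bracket, '(二)合同条款')))   # True; the old r'^[\(\(]\s*\d+\s*[\)\)]' misses this
print(bool(re.match(new_dun, '五、评分标准')))         # True; the old r'^\d+\s*、' misses this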
@@ -488,17 +490,17 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
     processed_paragraphs = preprocess_paragraphs(doc_contents)
     extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
     all_texts1,all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
+    # print(all_texts2)

     # 1. build the ordered all_text1_items
     all_text1_items = sorted(all_texts1.items(), key=lambda x: x[0])
     # 2. keep only the contents, in that order
     all_texts1_list = [content for (_, content) in all_text1_items]
-    # print(all_texts)
     # Proceed only if there is content to write
     selected_contents = {}
     final_list=[f"未解析到'{result_key}'!"]
     seen_contents = set()  # track already-added content in a set for de-duplication
-    if all_texts1_list:
+    if all_texts1_list or all_texts2:
         with open(output_file, 'w', encoding='utf-8') as file:
             counter = 1
             for content in all_texts1_list:
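The widened guard means the output file is now produced when either collection has content; a tiny illustration of the condition, with invented values:

all_texts1_list = []                       # primary keyword hits: none
all_texts2 = ['否决投标条款见附件']          # follow-up keyword hits: one (invented)

print(bool(all_texts1_list))                  # False: the old `if all_texts1_list:` skipped this case
print(bool(all_texts1_list or all_texts2))    # True:  the new guard still writes the output file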
@@ -511,6 +513,9 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
                     counter += 1

     # generate the user query
+    if not all_texts1_list:
+        num_list = []
+    else:
         user_query = generate_full_user_query(output_file, user_query)
         model_ans = qianwen_plus(user_query)  # model response
         # file_id = upload_file(output_file)
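A minimal sketch of the new control flow: skip the model round-trip when nothing was extracted. generate_full_user_query and qianwen_plus are the project's own helpers; they are stubbed here only so the sketch runs standalone:

def generate_full_user_query(output_file, user_query):    # stub for the project helper
    return user_query

def qianwen_plus(query):                                   # stub for the project helper
    return f'model answer for: {query}'

def answer_or_skip(all_texts1_list, output_file, user_query):
    if not all_texts1_list:
        return []                                          # mirrors `num_list = []`: no LLM call at all
    return qianwen_plus(generate_full_user_query(output_file, user_query))

print(answer_or_skip([], 'out.txt', 'query'))              # [] -> model is never called
print(answer_or_skip(['第3.1条'], 'out.txt', 'query'))      # 'model answer for: query'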
@@ -656,7 +661,7 @@ if __name__ == '__main__':
     output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
     # invalid_added = insert_mark(pdf_path)
     # # invalid_added_docx = pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
@@ -90,7 +90,7 @@ def read_docx_by_paragraphs(file_path):
         return []

 if __name__ == "__main__":
-    file_path = r'D:\flask_project\flask_app\static\output\output1\2179c978-b638-4332-8bd2-2007ad6f7c9b\ztbfile.docx'
+    file_path = r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
     read_docx(file_path)  # read line by line

     # paragraphs = read_docx_by_paragraphs(file_path)  # read paragraph by paragraph
|
@@ -315,11 +315,11 @@ if __name__ == "__main__":
     logger = get_global_logger("123")
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
     # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\251WA0201146未来大厦LED屏幕采购及安装-自行组织货物类招标文件V1.2.pdf"
     # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
+    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp"
     # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 4  # e.g.: 1 - announcement, 2 - evaluation method, 3 - qualification review (suffix qualification1 or qualification2, consistent with the evaluation method), 4 - instructions to bidders (front table part1, main text part2), 5 - procurement requirements, 6 - invalid_path
+    selection = 5  # e.g.: 1 - announcement, 2 - evaluation method, 3 - qualification review (suffix qualification1 or qualification2, consistent with the evaluation method), 4 - instructions to bidders (front table part1, main text part2), 5 - procurement requirements, 6 - invalid_path
     generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
     print(generated_files)
|