1.17 小bug

This commit is contained in:
zy123 2025-01-17 17:36:46 +08:00
parent 20e297c3ad
commit 28ddae379a
3 changed files with 21 additions and 16 deletions

View File

@ -344,8 +344,10 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
continue # 跳过空白行,进入下一个循环
if not found_next_number:
# 修改后的正则,支持 '数字 、' 格式
next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|([(]\s*\d+\s*[)])|(\d+\s*、)',
next_text)
number_pattern=(r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)' #A1 1.1 abc.123
r'|([((]\s*[一二三四五六七八九十\d]+\s*[))])' #(1) (2)
r'|([一二三四五六七八九十\d]+\s*、)') #1、 2、
next_section_number = re.match(number_pattern,next_text)
if next_section_number:
found_next_number = True
if next_section_number.group(1):
@ -353,9 +355,9 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
dynamic_pattern = r'^' + r'[.]'.join(
[r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
elif next_section_number.group(2):
dynamic_pattern = r'^[\(\(]\s*\d+\s*[\)\)]'
dynamic_pattern = r'^[\(\(]\s*[一二三四五六七八九十\d]+\s*[\)\)]'
elif next_section_number.group(3):
dynamic_pattern = r'^\d+\s*、'
dynamic_pattern = r'^[一二三四五六七八九十\d]+\s*、'
current_section_pattern = re.compile(dynamic_pattern)
if current_section_pattern and re.match(current_section_pattern, next_text):
extracted_paragraphs[active_key].append(next_text)
@ -488,17 +490,17 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
processed_paragraphs = preprocess_paragraphs(doc_contents)
extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
all_texts1,all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
# print(all_texts2)
# 1. 得到有序的 all_text1_items
all_text1_items = sorted(all_texts1.items(), key=lambda x: x[0])
# 2. 得到纯内容列表
all_texts1_list = [content for (_, content) in all_text1_items]
# print(all_texts)
# Proceed only if there is content to write
selected_contents = {}
final_list=[f"未解析到'{result_key}'"]
seen_contents = set() # 使用集合跟踪已添加的内容以去重
if all_texts1_list:
if all_texts1_list or all_texts2:
with open(output_file, 'w', encoding='utf-8') as file:
counter = 1
for content in all_texts1_list:
@ -511,11 +513,14 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
counter += 1
# 生成用户查询
user_query = generate_full_user_query(output_file, user_query)
model_ans = qianwen_plus(user_query) # 豆包模型返回结果
# file_id = upload_file(output_file)
# model_ans = qianwen_long(file_id, user_query)
num_list = process_string_list(model_ans) # 处理模型返回的序号
if not all_texts1_list:
num_list=[]
else:
user_query = generate_full_user_query(output_file, user_query)
model_ans = qianwen_plus(user_query) # 豆包模型返回结果
# file_id = upload_file(output_file)
# model_ans = qianwen_long(file_id, user_query)
num_list = process_string_list(model_ans) # 处理模型返回的序号
print(result_key + "选中的序号:" + str(num_list))
for index in num_list:
@ -656,7 +661,7 @@ if __name__ == '__main__':
output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
# invalid_added = insert_mark(pdf_path)
# # invalid_added_docx = pdf2docx(invalid_added)
invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
invalid_added_docx=r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
results = combine_find_invalid(invalid_added_docx, output_dir)
end_time = time.time()
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))

View File

@ -90,7 +90,7 @@ def read_docx_by_paragraphs(file_path):
return []
# Manual entry point: reads one hard-coded .docx file via read_docx.
if __name__ == "__main__":
# NOTE(review): file_path is assigned twice in this view; only the second
# value reaches read_docx. The first assignment looks like the pre-change
# line of the diff -- confirm it is gone from the actual file.
file_path = r'D:\flask_project\flask_app\static\output\output1\2179c978-b638-4332-8bd2-2007ad6f7c9b\ztbfile.docx'
file_path = r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
read_docx(file_path)  # read line by line
# paragraphs = read_docx_by_paragraphs(file_path)  # read paragraph by paragraph

View File

@ -315,11 +315,11 @@ if __name__ == "__main__":
logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\251WA0201146未来大厦LED屏幕采购及安装-自行组织货物类招标文件V1.2.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
selection = 5 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
print(generated_files)