From a2e3e9c0abf8df0c5e57879208a0da388da2f83f Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 9 Oct 2024 13:50:28 +0800 Subject: [PATCH] 10.9 --- flask_app/main/test.py | 322 +++++++++++++----- flask_app/main/多线程提问.py | 3 +- flask_app/main/截取pdf.py | 4 +- flask_app/main/投标人须知正文提取指定内容.py | 2 +- .../main/投标人须知正文条款提取成json文件.py | 4 +- .../文档理解大模型版/调用文件解析状态查询.py | 2 +- .../文档理解大模型版/调用文档解析结果获取.py | 2 +- flask_app/main/读取文件/按页读取pdf.py | 78 ++++- .../投标人须知正文提取指定内容货物标版.py | 8 +- .../投标人须知正文条款提取成json文件货物标版.py | 200 +++++++---- flask_app/货物标/货物标截取pdf.py | 37 +- 11 files changed, 501 insertions(+), 161 deletions(-) diff --git a/flask_app/main/test.py b/flask_app/main/test.py index 3507244..488f7b8 100644 --- a/flask_app/main/test.py +++ b/flask_app/main/test.py @@ -1,95 +1,261 @@ -# -*- encoding:utf-8 -*- +import json import re -def find_keys_by_value(target_value, json_data): - # 找到值等于目标值或以目标值开头的键 - matched_keys = [k for k, v in json_data.items() if v == target_value] - if not matched_keys: - matched_keys = [k for k, v in json_data.items() if isinstance(v, str) and v.startswith(target_value)] - return matched_keys -def find_keys_with_prefix(prefix, json_data): - # 只提取直接子项,比如 prefix 为 '7.2' 时只提取 '7.2.1', '7.2.2' 但不会提取 '7.3' - return [k for k in json_data.keys() if k.startswith(prefix) and k[len(prefix):].lstrip('.').isdigit()] +def compare_headings(current, new): + current_nums = [int(num) for num in current.split('.') if num.isdigit()] + new_nums = [int(num) for num in new.split('.') if num.isdigit()] -def extract_json(data, target_values): - results = {} + for c, n in zip(current_nums, new_nums): + if n > c: + return True + elif n < c: + return False - # 遍历所有目标值 - for target_value in target_values: - # 找到所有与目标值匹配的键 - matched_keys = find_keys_by_value(target_value, data) + return len(new_nums) > len(current_nums) - for key in matched_keys: - # 查找所有以该键为前缀的子键,限制只提取直接子项 - key_and_subheadings = find_keys_with_prefix(key, data) - for subkey in key_and_subheadings: - # 如果子键有多级结构(比如 '7.2.1'),并且是直接子项 - if "." in subkey: - parent_key = subkey.rsplit('.', 1)[0] - top_level_key = parent_key.split('.')[0] + '.' +def should_add_newline(content, keywords, max_length=20): + content_str = ''.join(content).strip() + return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length - # 确保顶级键不会重复添加 - if top_level_key not in results: - results[top_level_key] = data[top_level_key] - # 添加或更新父级键 - if parent_key not in results: - if parent_key in data: - results[parent_key] = data[parent_key] +def handle_content_append(current_content, line_content, append_newline, keywords): + if append_newline: + if should_add_newline(current_content, keywords): + current_content.append('\n') + append_newline = False + current_content.append(line_content) + return append_newline - # 添加当前子键和它的值 - if subkey in data: - results[subkey] = data[subkey] - return results +# def parse_text_by_heading(text): +# keywords = ['包含', '以下'] +# data = {} +# current_key = None +# current_content = [] +# append_newline = False +# skip_subheadings = False +# +# lines = text.split('\n') +# for i, line in enumerate(lines): +# line_stripped = line.strip().replace('.', '.') +# # print(line_stripped) +# # 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号 +# match = re.match(r'^(?