9.25
This commit is contained in:
parent
bdfc17bb13
commit
06a5cd03e1
@ -80,6 +80,8 @@ def parse_text_by_heading(text):
|
||||
current_key = None
|
||||
current_content = []
|
||||
append_newline = False
|
||||
sub_key_count = 0 # 用于生成二级标题的计数器
|
||||
third_key_count = 0 # 用于生成三级标题的计数器
|
||||
|
||||
lines = text.split('\n')
|
||||
for i, line in enumerate(lines):
|
||||
@ -96,26 +98,48 @@ def parse_text_by_heading(text):
|
||||
if current_key is None or (compare_headings(current_key, new_key) and (
|
||||
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
||||
if current_key is not None:
|
||||
# 将之前的内容保存到data中,保留第一个换行符,后续的换行符转为空字符
|
||||
# 将之前的内容保存到 data 中
|
||||
content_string = ''.join(current_content).strip()
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
current_key = new_key
|
||||
current_content = [line_content]
|
||||
# 只有当标题级别为两级(如 1.1)时,才设置 append_newline 为 True
|
||||
# 初始化二级和三级标题的计数器
|
||||
sub_key_count = 0
|
||||
third_key_count = 0
|
||||
append_newline = len(new_key.split('.')) == 2
|
||||
else:
|
||||
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
|
||||
else:
|
||||
# 匹配子序号 '1. 2.' 生成三级标题
|
||||
sub_match = re.match(r'^(\d+)[\.\、]\s*(.*)$', line_stripped)
|
||||
if sub_match:
|
||||
sub_number, sub_content = sub_match.groups()
|
||||
# 生成三级标题 key,如 5.0.1, 5.0.2 等
|
||||
if current_key:
|
||||
if sub_key_count == 0:
|
||||
# 生成二级标题,如 5.0
|
||||
sub_key_count += 1
|
||||
sub_key = f"{current_key}.0"
|
||||
data[sub_key] = ''.join(current_content).strip()
|
||||
current_content = [] # 重置内容以生成新的内容
|
||||
third_key_count += 1
|
||||
sub_key_with_third = f"{current_key}.0.{third_key_count}"
|
||||
data[sub_key_with_third] = sub_content.strip()
|
||||
continue
|
||||
else:
|
||||
# 处理一般行内容
|
||||
if line_stripped:
|
||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
|
||||
|
||||
if current_key is not None:
|
||||
# 保存最后一部分内容
|
||||
content_string = ''.join(current_content).strip()
|
||||
if content_string:
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def convert_to_json(file_path, start_word, end_phrases):
|
||||
if file_path.endswith('.docx'):
|
||||
text = extract_text_from_docx(file_path)
|
||||
|
@ -240,8 +240,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
|
||||
|
||||
# 每提交一个任务暂停1秒,确保任务逐个提交
|
||||
for task_name, future in futures.items():
|
||||
futures[task_name] = executor.submit(future)
|
||||
time.sleep(1) # 任务提交后暂停1秒
|
||||
time.sleep(1) # 可以保留这个暂停
|
||||
|
||||
# 按照任务完成的顺序返回结果
|
||||
for future in concurrent.futures.as_completed(futures.values()):
|
||||
|
Loading…
x
Reference in New Issue
Block a user