This commit is contained in:
zy123 2024-09-25 10:45:32 +08:00
parent bdfc17bb13
commit 06a5cd03e1
2 changed files with 30 additions and 7 deletions

View File

@@ -80,6 +80,8 @@ def parse_text_by_heading(text):
current_key = None
current_content = []
append_newline = False
sub_key_count = 0 # 用于生成二级标题的计数器
third_key_count = 0 # 用于生成三级标题的计数器
lines = text.split('\n')
for i, line in enumerate(lines):
@@ -96,26 +98,48 @@ def parse_text_by_heading(text):
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')):
if current_key is not None:
# 将之前的内容保存到data中,保留第一个换行符,后续的换行符转为空字符
# 将之前的内容保存到 data
content_string = ''.join(current_content).strip()
data[current_key] = content_string.replace(' ', '')
current_key = new_key
current_content = [line_content]
# 只有当标题级别为两级(如 1.1)时,才设置 append_newline 为 True
# 初始化二级和三级标题的计数器
sub_key_count = 0
third_key_count = 0
append_newline = len(new_key.split('.')) == 2
else:
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
else:
# 匹配子序号 '1. 2.' 生成三级标题
sub_match = re.match(r'^(\d+)[\.\、]\s*(.*)$', line_stripped)
if sub_match:
sub_number, sub_content = sub_match.groups()
# 生成三级标题 key如 5.0.1, 5.0.2 等
if current_key:
if sub_key_count == 0:
# 生成二级标题,如 5.0
sub_key_count += 1
sub_key = f"{current_key}.0"
data[sub_key] = ''.join(current_content).strip()
current_content = [] # 重置内容以生成新的内容
third_key_count += 1
sub_key_with_third = f"{current_key}.0.{third_key_count}"
data[sub_key_with_third] = sub_content.strip()
continue
else:
# 处理一般行内容
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
if current_key is not None:
# 保存最后一部分内容
content_string = ''.join(current_content).strip()
if content_string:
data[current_key] = content_string.replace(' ', '')
return data
def convert_to_json(file_path, start_word, end_phrases):
if file_path.endswith('.docx'):
text = extract_text_from_docx(file_path)

View File

@@ -240,8 +240,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
# 每提交一个任务暂停1秒确保任务逐个提交
for task_name, future in futures.items():
futures[task_name] = executor.submit(future)
time.sleep(1) # 任务提交后暂停1秒
time.sleep(1) # 可以保留这个暂停
# 按照任务完成的顺序返回结果
for future in concurrent.futures.as_completed(futures.values()):