9.25
This commit is contained in:
parent
bdfc17bb13
commit
06a5cd03e1
@ -80,6 +80,8 @@ def parse_text_by_heading(text):
|
|||||||
current_key = None
|
current_key = None
|
||||||
current_content = []
|
current_content = []
|
||||||
append_newline = False
|
append_newline = False
|
||||||
|
sub_key_count = 0 # 用于生成二级标题的计数器
|
||||||
|
third_key_count = 0 # 用于生成三级标题的计数器
|
||||||
|
|
||||||
lines = text.split('\n')
|
lines = text.split('\n')
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
@ -96,26 +98,48 @@ def parse_text_by_heading(text):
|
|||||||
if current_key is None or (compare_headings(current_key, new_key) and (
|
if current_key is None or (compare_headings(current_key, new_key) and (
|
||||||
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
# 将之前的内容保存到data中,保留第一个换行符,后续的换行符转为空字符
|
# 将之前的内容保存到 data 中
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = content_string.replace(' ', '')
|
data[current_key] = content_string.replace(' ', '')
|
||||||
current_key = new_key
|
current_key = new_key
|
||||||
current_content = [line_content]
|
current_content = [line_content]
|
||||||
# 只有当标题级别为两级(如 1.1)时,才设置 append_newline 为 True
|
# 初始化二级和三级标题的计数器
|
||||||
|
sub_key_count = 0
|
||||||
|
third_key_count = 0
|
||||||
append_newline = len(new_key.split('.')) == 2
|
append_newline = len(new_key.split('.')) == 2
|
||||||
else:
|
else:
|
||||||
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
|
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
|
||||||
else:
|
else:
|
||||||
if line_stripped:
|
# 匹配子序号 '1. 2.' 生成三级标题
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
|
sub_match = re.match(r'^(\d+)[\.\、]\s*(.*)$', line_stripped)
|
||||||
|
if sub_match:
|
||||||
|
sub_number, sub_content = sub_match.groups()
|
||||||
|
# 生成三级标题 key,如 5.0.1, 5.0.2 等
|
||||||
|
if current_key:
|
||||||
|
if sub_key_count == 0:
|
||||||
|
# 生成二级标题,如 5.0
|
||||||
|
sub_key_count += 1
|
||||||
|
sub_key = f"{current_key}.0"
|
||||||
|
data[sub_key] = ''.join(current_content).strip()
|
||||||
|
current_content = [] # 重置内容以生成新的内容
|
||||||
|
third_key_count += 1
|
||||||
|
sub_key_with_third = f"{current_key}.0.{third_key_count}"
|
||||||
|
data[sub_key_with_third] = sub_content.strip()
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
# 处理一般行内容
|
||||||
|
if line_stripped:
|
||||||
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
|
||||||
|
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
# 保存最后一部分内容
|
# 保存最后一部分内容
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = content_string.replace(' ', '')
|
if content_string:
|
||||||
|
data[current_key] = content_string.replace(' ', '')
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def convert_to_json(file_path, start_word, end_phrases):
|
def convert_to_json(file_path, start_word, end_phrases):
|
||||||
if file_path.endswith('.docx'):
|
if file_path.endswith('.docx'):
|
||||||
text = extract_text_from_docx(file_path)
|
text = extract_text_from_docx(file_path)
|
||||||
|
@ -240,8 +240,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
|
|||||||
|
|
||||||
# 每提交一个任务暂停1秒,确保任务逐个提交
|
# 每提交一个任务暂停1秒,确保任务逐个提交
|
||||||
for task_name, future in futures.items():
|
for task_name, future in futures.items():
|
||||||
futures[task_name] = executor.submit(future)
|
time.sleep(1) # 可以保留这个暂停
|
||||||
time.sleep(1) # 任务提交后暂停1秒
|
|
||||||
|
|
||||||
# 按照任务完成的顺序返回结果
|
# 按照任务完成的顺序返回结果
|
||||||
for future in concurrent.futures.as_completed(futures.values()):
|
for future in concurrent.futures.as_completed(futures.values()):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user