1.24 后处理提取带星要求添加了脚本提取的逻辑
This commit is contained in:
parent
29c17c375d
commit
9117c4117b
@ -272,95 +272,132 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
|
|||||||
found_next_number = False
|
found_next_number = False
|
||||||
current_section_pattern = None
|
current_section_pattern = None
|
||||||
|
|
||||||
while current_index < len(processed_paragraphs) - 1:
|
try:
|
||||||
current_index += 1
|
while current_index < len(processed_paragraphs) - 1:
|
||||||
next_text = processed_paragraphs[current_index].strip()
|
current_index += 1
|
||||||
|
try:
|
||||||
|
next_text = processed_paragraphs[current_index].strip()
|
||||||
|
except IndexError:
|
||||||
|
print(f"IndexError: current_index={current_index} is out of range.")
|
||||||
|
break # Exit the loop if index is out of range
|
||||||
|
|
||||||
if not next_text:
|
if not next_text:
|
||||||
continue # Skip empty lines
|
continue # Skip empty lines
|
||||||
|
|
||||||
if not found_next_number:
|
if not found_next_number:
|
||||||
# Modify the regular expression to support the required format
|
# Modify the regular expression to support the required format
|
||||||
number_pattern = (
|
number_pattern = (
|
||||||
r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|'
|
r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|'
|
||||||
r'([((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))])|'
|
r'([((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))])|'
|
||||||
r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'
|
r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'
|
||||||
)
|
)
|
||||||
next_section_number = re.match(number_pattern, next_text)
|
next_section_number = re.match(number_pattern, next_text)
|
||||||
|
|
||||||
if next_section_number:
|
if next_section_number:
|
||||||
found_next_number = True
|
found_next_number = True
|
||||||
# Determine dynamic pattern based on section number format
|
# Determine dynamic pattern based on section number format
|
||||||
if next_section_number.group(1):
|
if next_section_number.group(1):
|
||||||
section_parts = next_section_number.group(1).split('.')
|
section_parts = next_section_number.group(1).split('.')
|
||||||
dynamic_pattern = r'^' + r'[..]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
|
dynamic_pattern = r'^' + r'[..]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
|
||||||
elif next_section_number.group(2):
|
elif next_section_number.group(2):
|
||||||
dynamic_pattern = r'^[\(\(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\)]'
|
dynamic_pattern = r'^[\(\(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\)]'
|
||||||
elif next_section_number.group(3):
|
elif next_section_number.group(3):
|
||||||
dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
|
dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
|
||||||
current_section_pattern = re.compile(dynamic_pattern)
|
current_section_pattern = re.compile(dynamic_pattern)
|
||||||
|
|
||||||
if current_section_pattern and re.match(current_section_pattern, next_text):
|
if current_section_pattern and re.match(current_section_pattern, next_text):
|
||||||
extracted_paragraphs[active_key].append(next_text)
|
extracted_paragraphs[active_key].append(next_text)
|
||||||
else:
|
else:
|
||||||
continue_collecting = False
|
continue_collecting = False
|
||||||
active_key = None
|
active_key = None
|
||||||
break
|
break
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An unexpected error occurred in process_matching_section: {e}")
|
||||||
|
# Optionally, you can re-raise the exception or handle it as needed
|
||||||
|
# raise e
|
||||||
|
|
||||||
if continue_collecting:
|
if continue_collecting:
|
||||||
if text == '[$$table_start$$]':
|
if text == '[$$table_start$$]':
|
||||||
current_index += 1
|
try:
|
||||||
while processed_paragraphs[current_index] != '[$$table_over$$]':
|
|
||||||
extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
|
|
||||||
current_index += 1
|
current_index += 1
|
||||||
|
while processed_paragraphs[current_index] != '[$$table_over$$]':
|
||||||
|
extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
|
||||||
|
current_index += 1
|
||||||
|
except IndexError:
|
||||||
|
print(f"IndexError: current_index={current_index} during table processing.")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An unexpected error occurred while processing table: {e}")
|
||||||
return current_index
|
return current_index
|
||||||
|
|
||||||
if current_section_pattern and re.match(current_section_pattern, text):
|
if current_section_pattern:
|
||||||
continue_collecting = False
|
try:
|
||||||
active_key = None
|
if re.match(current_section_pattern, text):
|
||||||
|
continue_collecting = False
|
||||||
|
active_key = None
|
||||||
|
except re.error as re_err:
|
||||||
|
print(f"Regex error: {re_err} with pattern {current_section_pattern} and text '{text}'")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An unexpected error occurred while matching section pattern: {e}")
|
||||||
else:
|
else:
|
||||||
if active_key is not None:
|
if active_key is not None:
|
||||||
extracted_paragraphs[active_key].append(text)
|
try:
|
||||||
|
extracted_paragraphs[active_key].append(text)
|
||||||
|
except KeyError:
|
||||||
|
print(f"KeyError: active_key='{active_key}' not found in extracted_paragraphs.")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"An unexpected error occurred while appending text: {e}")
|
||||||
return current_index
|
return current_index
|
||||||
|
|
||||||
if match_keywords(text, keywords):
|
try:
|
||||||
active_key = text
|
if match_keywords(text, keywords):
|
||||||
extracted_paragraphs[active_key] = [text]
|
active_key = text
|
||||||
|
extracted_paragraphs[active_key] = [text]
|
||||||
|
|
||||||
if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10:
|
if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10:
|
||||||
continue_collecting = True
|
continue_collecting = True
|
||||||
process_matching_section()
|
process_matching_section()
|
||||||
elif match_keywords(text, follow_up_keywords):
|
elif match_keywords(text, follow_up_keywords):
|
||||||
continue_collecting = True
|
continue_collecting = True
|
||||||
section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text)
|
section_number = re.match(r'^(\d+([..]\d+)*)\s*[..、]?', text)
|
||||||
|
|
||||||
if section_number:
|
if section_number:
|
||||||
current_section_number = section_number.group(1)
|
current_section_number = section_number.group(1)
|
||||||
level_count = current_section_number.count('.')
|
level_count = current_section_number.count('.')
|
||||||
parts = current_section_number.split('.')
|
parts = current_section_number.split('.')
|
||||||
|
|
||||||
matched_patterns = []
|
matched_patterns = []
|
||||||
for i in range(1, 6):
|
if len(parts) > 1:
|
||||||
parent_section_parts = parts[:-1].copy()
|
for i in range(1, 6):
|
||||||
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
|
parent_section_parts = parts[:-1].copy()
|
||||||
parent_pattern = r'^' + r'\s*[..]\s*'.join(parent_section_parts) + r'(?!\s*[..]\s*\d+)'
|
try:
|
||||||
matched_patterns.append(parent_pattern)
|
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
|
||||||
|
except ValueError:
|
||||||
|
print(f"ValueError: Unable to convert '{parent_section_parts[-1]}' to int.")
|
||||||
|
continue
|
||||||
|
parent_pattern = r'^' + r'\s*[..、]\s*'.join(parent_section_parts) + r'(?!\s*[..、]\s*\d+)'
|
||||||
|
matched_patterns.append(parent_pattern)
|
||||||
|
|
||||||
digit_comma_pattern = r'^\d+\s*、'
|
try:
|
||||||
matched_patterns.append(digit_comma_pattern)
|
current_top_level_num = int(current_section_number.split('.')[0])
|
||||||
|
for i in range(1, 6):
|
||||||
|
next_top_level_num = current_top_level_num + i
|
||||||
|
next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..、]'
|
||||||
|
if next_top_level_pattern not in matched_patterns:
|
||||||
|
matched_patterns.append(next_top_level_pattern)
|
||||||
|
except ValueError:
|
||||||
|
print(f"ValueError: Unable to convert '{current_section_number.split('.')[0]}' to int.")
|
||||||
|
|
||||||
current_top_level_num = int(current_section_number.split('.')[0])
|
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
|
||||||
for i in range(1, 6):
|
try:
|
||||||
next_top_level_num = current_top_level_num + i
|
current_section_pattern = re.compile(combined_pattern)
|
||||||
next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..]'
|
except re.error as re_err:
|
||||||
if next_top_level_pattern not in matched_patterns:
|
print(f"Regex compilation error: {re_err} with pattern '{combined_pattern}'")
|
||||||
matched_patterns.append(next_top_level_pattern)
|
|
||||||
|
|
||||||
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
|
else:
|
||||||
current_section_pattern = re.compile(combined_pattern)
|
process_matching_section() # Reusing the common function for matching sections
|
||||||
|
|
||||||
else:
|
except Exception as e:
|
||||||
process_matching_section() # Reusing the common function for matching sections
|
print(f"An unexpected error occurred in extract_from_text: {e} 当前index:{current_index}")
|
||||||
|
|
||||||
return current_index
|
return current_index
|
||||||
|
|
||||||
@ -676,7 +713,7 @@ if __name__ == '__main__':
|
|||||||
output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
|
output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
|
||||||
# invalid_added = insert_mark(pdf_path)
|
# invalid_added = insert_mark(pdf_path)
|
||||||
# invalid_added_docx = pdf2docx(invalid_added)
|
# invalid_added_docx = pdf2docx(invalid_added)
|
||||||
invalid_added_docx=r'C:\Users\Administrator\Desktop\货物标\zbfiles\output6\invalid_added.docx'
|
invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\a3b259fd-5ade-4c66-9117-eb44796682ad\invalid_added.docx'
|
||||||
results = combine_find_invalid(invalid_added_docx, output_dir)
|
results = combine_find_invalid(invalid_added_docx, output_dir)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user