1.24 后处理提取带星要求添加了脚本提取的逻辑

This commit is contained in:
zy123 2025-01-24 14:51:46 +08:00
parent 29c17c375d
commit 9117c4117b

View File

@@ -272,95 +272,132 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
found_next_number = False found_next_number = False
current_section_pattern = None current_section_pattern = None
while current_index < len(processed_paragraphs) - 1: try:
current_index += 1 while current_index < len(processed_paragraphs) - 1:
next_text = processed_paragraphs[current_index].strip() current_index += 1
try:
next_text = processed_paragraphs[current_index].strip()
except IndexError:
print(f"IndexError: current_index={current_index} is out of range.")
break # Exit the loop if index is out of range
if not next_text: if not next_text:
continue # Skip empty lines continue # Skip empty lines
if not found_next_number: if not found_next_number:
# Modify the regular expression to support the required format # Modify the regular expression to support the required format
number_pattern = ( number_pattern = (
r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|' r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|'
r'([(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[)])|' r'([(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[)])|'
r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)' r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'
) )
next_section_number = re.match(number_pattern, next_text) next_section_number = re.match(number_pattern, next_text)
if next_section_number: if next_section_number:
found_next_number = True found_next_number = True
# Determine dynamic pattern based on section number format # Determine dynamic pattern based on section number format
if next_section_number.group(1): if next_section_number.group(1):
section_parts = next_section_number.group(1).split('.') section_parts = next_section_number.group(1).split('.')
dynamic_pattern = r'^' + r'[.]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' dynamic_pattern = r'^' + r'[.]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
elif next_section_number.group(2): elif next_section_number.group(2):
dynamic_pattern = r'^[\(\]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\]' dynamic_pattern = r'^[\(\]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\]'
elif next_section_number.group(3): elif next_section_number.group(3):
dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、' dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
current_section_pattern = re.compile(dynamic_pattern) current_section_pattern = re.compile(dynamic_pattern)
if current_section_pattern and re.match(current_section_pattern, next_text): if current_section_pattern and re.match(current_section_pattern, next_text):
extracted_paragraphs[active_key].append(next_text) extracted_paragraphs[active_key].append(next_text)
else: else:
continue_collecting = False continue_collecting = False
active_key = None active_key = None
break break
except Exception as e:
print(f"An unexpected error occurred in process_matching_section: {e}")
# Optionally, you can re-raise the exception or handle it as needed
# raise e
if continue_collecting: if continue_collecting:
if text == '[$$table_start$$]': if text == '[$$table_start$$]':
current_index += 1 try:
while processed_paragraphs[current_index] != '[$$table_over$$]':
extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
current_index += 1 current_index += 1
while processed_paragraphs[current_index] != '[$$table_over$$]':
extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
current_index += 1
except IndexError:
print(f"IndexError: current_index={current_index} during table processing.")
except Exception as e:
print(f"An unexpected error occurred while processing table: {e}")
return current_index return current_index
if current_section_pattern and re.match(current_section_pattern, text): if current_section_pattern:
continue_collecting = False try:
active_key = None if re.match(current_section_pattern, text):
continue_collecting = False
active_key = None
except re.error as re_err:
print(f"Regex error: {re_err} with pattern {current_section_pattern} and text '{text}'")
except Exception as e:
print(f"An unexpected error occurred while matching section pattern: {e}")
else: else:
if active_key is not None: if active_key is not None:
extracted_paragraphs[active_key].append(text) try:
extracted_paragraphs[active_key].append(text)
except KeyError:
print(f"KeyError: active_key='{active_key}' not found in extracted_paragraphs.")
except Exception as e:
print(f"An unexpected error occurred while appending text: {e}")
return current_index return current_index
if match_keywords(text, keywords): try:
active_key = text if match_keywords(text, keywords):
extracted_paragraphs[active_key] = [text] active_key = text
extracted_paragraphs[active_key] = [text]
if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10: if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10:
continue_collecting = True continue_collecting = True
process_matching_section() process_matching_section()
elif match_keywords(text, follow_up_keywords): elif match_keywords(text, follow_up_keywords):
continue_collecting = True continue_collecting = True
section_number = re.match(r'^(\d+([.]\d+)*)\s*[.]?', text) section_number = re.match(r'^(\d+([.]\d+)*)\s*[.]?', text)
if section_number: if section_number:
current_section_number = section_number.group(1) current_section_number = section_number.group(1)
level_count = current_section_number.count('.') level_count = current_section_number.count('.')
parts = current_section_number.split('.') parts = current_section_number.split('.')
matched_patterns = [] matched_patterns = []
for i in range(1, 6): if len(parts) > 1:
parent_section_parts = parts[:-1].copy() for i in range(1, 6):
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i) parent_section_parts = parts[:-1].copy()
parent_pattern = r'^' + r'\s*[.]\s*'.join(parent_section_parts) + r'(?!\s*[.]\s*\d+)' try:
matched_patterns.append(parent_pattern) parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
except ValueError:
print(f"ValueError: Unable to convert '{parent_section_parts[-1]}' to int.")
continue
parent_pattern = r'^' + r'\s*[..、]\s*'.join(parent_section_parts) + r'(?!\s*[..、]\s*\d+)'
matched_patterns.append(parent_pattern)
digit_comma_pattern = r'^\d+\s*、' try:
matched_patterns.append(digit_comma_pattern) current_top_level_num = int(current_section_number.split('.')[0])
for i in range(1, 6):
next_top_level_num = current_top_level_num + i
next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..、]'
if next_top_level_pattern not in matched_patterns:
matched_patterns.append(next_top_level_pattern)
except ValueError:
print(f"ValueError: Unable to convert '{current_section_number.split('.')[0]}' to int.")
current_top_level_num = int(current_section_number.split('.')[0]) combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
for i in range(1, 6): try:
next_top_level_num = current_top_level_num + i current_section_pattern = re.compile(combined_pattern)
next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[.]' except re.error as re_err:
if next_top_level_pattern not in matched_patterns: print(f"Regex compilation error: {re_err} with pattern '{combined_pattern}'")
matched_patterns.append(next_top_level_pattern)
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' else:
current_section_pattern = re.compile(combined_pattern) process_matching_section() # Reusing the common function for matching sections
else: except Exception as e:
process_matching_section() # Reusing the common function for matching sections print(f"An unexpected error occurred in extract_from_text: {e} 当前index:{current_index}")
return current_index return current_index
@@ -676,7 +713,7 @@ if __name__ == '__main__':
output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6" output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
# invalid_added = insert_mark(pdf_path) # invalid_added = insert_mark(pdf_path)
# invalid_added_docx = pdf2docx(invalid_added) # invalid_added_docx = pdf2docx(invalid_added)
invalid_added_docx=r'C:\Users\Administrator\Desktop\货物标\zbfiles\output6\invalid_added.docx' invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\a3b259fd-5ade-4c66-9117-eb44796682ad\invalid_added.docx'
results = combine_find_invalid(invalid_added_docx, output_dir) results = combine_find_invalid(invalid_added_docx, output_dir)
end_time = time.time() end_time = time.time()
print("Results:", json.dumps(results, ensure_ascii=False, indent=4)) print("Results:", json.dumps(results, ensure_ascii=False, indent=4))