1.24 后处理提取带星要求添加了脚本提取的逻辑

2025-01-24 14:51:46 +08:00 · 2025-01-24 14:51:46 +08:00 · 9117c4117b
commit 9117c4117b
parent 29c17c375d
1 changed files with 106 additions and 69 deletions
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -272,95 +272,132 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
            found_next_number = False
            current_section_pattern = None

-            while current_index < len(processed_paragraphs) - 1:
-                current_index += 1
-                next_text = processed_paragraphs[current_index].strip()
+            try:
+                while current_index < len(processed_paragraphs) - 1:
+                    current_index += 1
+                    try:
+                        next_text = processed_paragraphs[current_index].strip()
+                    except IndexError:
+                        print(f"IndexError: current_index={current_index} is out of range.")
+                        break  # Exit the loop if index is out of range

-                if not next_text:
-                    continue  # Skip empty lines
+                    if not next_text:
+                        continue  # Skip empty lines

-                if not found_next_number:
-                    # Modify the regular expression to support the required format
-                    number_pattern = (
-                        r'^([A-Za-z0-9]+(?:[.．][A-Za-z0-9]+)*)|'
-                        r'([（(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[）)])|'
-                        r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'
-                    )
-                    next_section_number = re.match(number_pattern, next_text)
+                    if not found_next_number:
+                        # Modify the regular expression to support the required format
+                        number_pattern = (
+                            r'^([A-Za-z0-9]+(?:[.．][A-Za-z0-9]+)*)|'
+                            r'([（(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[）)])|'
+                            r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'
+                        )
+                        next_section_number = re.match(number_pattern, next_text)

-                    if next_section_number:
-                        found_next_number = True
-                        # Determine dynamic pattern based on section number format
-                        if next_section_number.group(1):
-                            section_parts = next_section_number.group(1).split('.')
-                            dynamic_pattern = r'^' + r'[.．]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
-                        elif next_section_number.group(2):
-                            dynamic_pattern = r'^[\(\（]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\）]'
-                        elif next_section_number.group(3):
-                            dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
-                        current_section_pattern = re.compile(dynamic_pattern)
+                        if next_section_number:
+                            found_next_number = True
+                            # Determine dynamic pattern based on section number format
+                            if next_section_number.group(1):
+                                section_parts = next_section_number.group(1).split('.')
+                                dynamic_pattern = r'^' + r'[.．]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
+                            elif next_section_number.group(2):
+                                dynamic_pattern = r'^[\(\（]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\）]'
+                            elif next_section_number.group(3):
+                                dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
+                            current_section_pattern = re.compile(dynamic_pattern)

-                if current_section_pattern and re.match(current_section_pattern, next_text):
-                    extracted_paragraphs[active_key].append(next_text)
-                else:
-                    continue_collecting = False
-                    active_key = None
-                    break
+                    if current_section_pattern and re.match(current_section_pattern, next_text):
+                        extracted_paragraphs[active_key].append(next_text)
+                    else:
+                        continue_collecting = False
+                        active_key = None
+                        break
+            except Exception as e:
+                print(f"An unexpected error occurred in process_matching_section: {e}")
+                # Optionally, you can re-raise the exception or handle it as needed
+                # raise e

        if continue_collecting:
            if text == '[$$table_start$$]':
-                current_index += 1
-                while processed_paragraphs[current_index] != '[$$table_over$$]':
-                    extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
+                try:
                    current_index += 1
+                    while processed_paragraphs[current_index] != '[$$table_over$$]':
+                        extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
+                        current_index += 1
+                except IndexError:
+                    print(f"IndexError: current_index={current_index} during table processing.")
+                except Exception as e:
+                    print(f"An unexpected error occurred while processing table: {e}")
                return current_index

-            if current_section_pattern and re.match(current_section_pattern, text):
-                continue_collecting = False
-                active_key = None
+            if current_section_pattern:
+                try:
+                    if re.match(current_section_pattern, text):
+                        continue_collecting = False
+                        active_key = None
+                except re.error as re_err:
+                    print(f"Regex error: {re_err} with pattern {current_section_pattern} and text '{text}'")
+                except Exception as e:
+                    print(f"An unexpected error occurred while matching section pattern: {e}")
            else:
                if active_key is not None:
-                    extracted_paragraphs[active_key].append(text)
+                    try:
+                        extracted_paragraphs[active_key].append(text)
+                    except KeyError:
+                        print(f"KeyError: active_key='{active_key}' not found in extracted_paragraphs.")
+                    except Exception as e:
+                        print(f"An unexpected error occurred while appending text: {e}")
                return current_index

-        if match_keywords(text, keywords):
-            active_key = text
-            extracted_paragraphs[active_key] = [text]
+        try:
+            if match_keywords(text, keywords):
+                active_key = text
+                extracted_paragraphs[active_key] = [text]

-            if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10:
-                continue_collecting = True
-                process_matching_section()
-            elif match_keywords(text, follow_up_keywords):
-                continue_collecting = True
-                section_number = re.match(r'^(\d+([.．]\d+)*)\s*[.．]?', text)
+                if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10:
+                    continue_collecting = True
+                    process_matching_section()
+                elif match_keywords(text, follow_up_keywords):
+                    continue_collecting = True
+                    section_number = re.match(r'^(\d+([.．]\d+)*)\s*[.．、]?', text)

-                if section_number:
-                    current_section_number = section_number.group(1)
-                    level_count = current_section_number.count('.')
-                    parts = current_section_number.split('.')
+                    if section_number:
+                        current_section_number = section_number.group(1)
+                        level_count = current_section_number.count('.')
+                        parts = current_section_number.split('.')

-                    matched_patterns = []
-                    for i in range(1, 6):
-                        parent_section_parts = parts[:-1].copy()
-                        parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
-                        parent_pattern = r'^' + r'\s*[.．]\s*'.join(parent_section_parts) + r'(?!\s*[.．]\s*\d+)'
-                        matched_patterns.append(parent_pattern)
+                        matched_patterns = []
+                        if len(parts) > 1:
+                            for i in range(1, 6):
+                                parent_section_parts = parts[:-1].copy()
+                                try:
+                                    parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
+                                except ValueError:
+                                    print(f"ValueError: Unable to convert '{parent_section_parts[-1]}' to int.")
+                                    continue
+                                parent_pattern = r'^' + r'\s*[.．、]\s*'.join(parent_section_parts) + r'(?!\s*[.．、]\s*\d+)'
+                                matched_patterns.append(parent_pattern)

-                    digit_comma_pattern = r'^\d+\s*、'
-                    matched_patterns.append(digit_comma_pattern)
+                        try:
+                            current_top_level_num = int(current_section_number.split('.')[0])
+                            for i in range(1, 6):
+                                next_top_level_num = current_top_level_num + i
+                                next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[.．、]'
+                                if next_top_level_pattern not in matched_patterns:
+                                    matched_patterns.append(next_top_level_pattern)
+                        except ValueError:
+                            print(f"ValueError: Unable to convert '{current_section_number.split('.')[0]}' to int.")

-                    current_top_level_num = int(current_section_number.split('.')[0])
-                    for i in range(1, 6):
-                        next_top_level_num = current_top_level_num + i
-                        next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[.．]'
-                        if next_top_level_pattern not in matched_patterns:
-                            matched_patterns.append(next_top_level_pattern)
+                        combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
+                        try:
+                            current_section_pattern = re.compile(combined_pattern)
+                        except re.error as re_err:
+                            print(f"Regex compilation error: {re_err} with pattern '{combined_pattern}'")

-                    combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
-                    current_section_pattern = re.compile(combined_pattern)
+                    else:
+                        process_matching_section()  # Reusing the common function for matching sections

-                else:
-                    process_matching_section()  # Reusing the common function for matching sections
+        except Exception as e:
+            print(f"An unexpected error occurred in extract_from_text: {e} 当前index:{current_index}")

        return current_index

@@ -676,7 +713,7 @@ if __name__ == '__main__':
    output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
    # invalid_added = insert_mark(pdf_path)
    # invalid_added_docx = pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Desktop\货物标\zbfiles\output6\invalid_added.docx'
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\a3b259fd-5ade-4c66-9117-eb44796682ad\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))