From 9117c4117b9a8c3cfddc9a9ac4207feeccd0d7ab Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Fri, 24 Jan 2025 14:51:46 +0800 Subject: [PATCH] =?UTF-8?q?1.24=20=E5=90=8E=E5=A4=84=E7=90=86=E6=8F=90?= =?UTF-8?q?=E5=8F=96=E5=B8=A6=E6=98=9F=E8=A6=81=E6=B1=82=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E4=BA=86=E8=84=9A=E6=9C=AC=E6=8F=90=E5=8F=96=E7=9A=84=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/无效标和废标公共代码.py | 175 +++++++++++++--------- 1 file changed, 106 insertions(+), 69 deletions(-) diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index de2e9dc..0f2d857 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -272,95 +272,132 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword found_next_number = False current_section_pattern = None - while current_index < len(processed_paragraphs) - 1: - current_index += 1 - next_text = processed_paragraphs[current_index].strip() + try: + while current_index < len(processed_paragraphs) - 1: + current_index += 1 + try: + next_text = processed_paragraphs[current_index].strip() + except IndexError: + print(f"IndexError: current_index={current_index} is out of range.") + break # Exit the loop if index is out of range - if not next_text: - continue # Skip empty lines + if not next_text: + continue # Skip empty lines - if not found_next_number: - # Modify the regular expression to support the required format - number_pattern = ( - r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|' - r'([((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))])|' - r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)' - ) - next_section_number = re.match(number_pattern, next_text) + if not found_next_number: + # Modify the regular expression to support the required format + number_pattern = ( + r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|' + r'([((]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[))])|' + r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)' + ) + next_section_number = re.match(number_pattern, next_text) - if next_section_number: - found_next_number = True - # Determine dynamic pattern based on section number format - if next_section_number.group(1): - section_parts = next_section_number.group(1).split('.') - dynamic_pattern = r'^' + r'[..]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' - elif next_section_number.group(2): - dynamic_pattern = r'^[\(\(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\)]' - elif next_section_number.group(3): - dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、' - current_section_pattern = re.compile(dynamic_pattern) + if next_section_number: + found_next_number = True + # Determine dynamic pattern based on section number format + if next_section_number.group(1): + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' + r'[..]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + elif next_section_number.group(2): + dynamic_pattern = r'^[\(\(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\)]' + elif next_section_number.group(3): + dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、' + current_section_pattern = re.compile(dynamic_pattern) - if current_section_pattern and re.match(current_section_pattern, next_text): - extracted_paragraphs[active_key].append(next_text) - else: - continue_collecting = False - active_key = None - break + if current_section_pattern and re.match(current_section_pattern, next_text): + extracted_paragraphs[active_key].append(next_text) + else: + continue_collecting = False + active_key = None + break + except Exception as e: + print(f"An unexpected error occurred in process_matching_section: {e}") + # Optionally, you can re-raise the exception or handle it as needed + # raise e if continue_collecting: if text == '[$$table_start$$]': - current_index += 1 - while processed_paragraphs[current_index] != '[$$table_over$$]': - extracted_paragraphs[active_key].append(processed_paragraphs[current_index]) + try: current_index += 1 + while processed_paragraphs[current_index] != '[$$table_over$$]': + extracted_paragraphs[active_key].append(processed_paragraphs[current_index]) + current_index += 1 + except IndexError: + print(f"IndexError: current_index={current_index} during table processing.") + except Exception as e: + print(f"An unexpected error occurred while processing table: {e}") return current_index - if current_section_pattern and re.match(current_section_pattern, text): - continue_collecting = False - active_key = None + if current_section_pattern: + try: + if re.match(current_section_pattern, text): + continue_collecting = False + active_key = None + except re.error as re_err: + print(f"Regex error: {re_err} with pattern {current_section_pattern} and text '{text}'") + except Exception as e: + print(f"An unexpected error occurred while matching section pattern: {e}") else: if active_key is not None: - extracted_paragraphs[active_key].append(text) + try: + extracted_paragraphs[active_key].append(text) + except KeyError: + print(f"KeyError: active_key='{active_key}' not found in extracted_paragraphs.") + except Exception as e: + print(f"An unexpected error occurred while appending text: {e}") return current_index - if match_keywords(text, keywords): - active_key = text - extracted_paragraphs[active_key] = [text] + try: + if match_keywords(text, keywords): + active_key = text + extracted_paragraphs[active_key] = [text] - if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10: - continue_collecting = True - process_matching_section() - elif match_keywords(text, follow_up_keywords): - continue_collecting = True - section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text) + if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10: + continue_collecting = True + process_matching_section() + elif match_keywords(text, follow_up_keywords): + continue_collecting = True + section_number = re.match(r'^(\d+([..]\d+)*)\s*[..、]?', text) - if section_number: - current_section_number = section_number.group(1) - level_count = current_section_number.count('.') - parts = current_section_number.split('.') + if section_number: + current_section_number = section_number.group(1) + level_count = current_section_number.count('.') + parts = current_section_number.split('.') - matched_patterns = [] - for i in range(1, 6): - parent_section_parts = parts[:-1].copy() - parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i) - parent_pattern = r'^' + r'\s*[..]\s*'.join(parent_section_parts) + r'(?!\s*[..]\s*\d+)' - matched_patterns.append(parent_pattern) + matched_patterns = [] + if len(parts) > 1: + for i in range(1, 6): + parent_section_parts = parts[:-1].copy() + try: + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i) + except ValueError: + print(f"ValueError: Unable to convert '{parent_section_parts[-1]}' to int.") + continue + parent_pattern = r'^' + r'\s*[..、]\s*'.join(parent_section_parts) + r'(?!\s*[..、]\s*\d+)' + matched_patterns.append(parent_pattern) - digit_comma_pattern = r'^\d+\s*、' - matched_patterns.append(digit_comma_pattern) + try: + current_top_level_num = int(current_section_number.split('.')[0]) + for i in range(1, 6): + next_top_level_num = current_top_level_num + i + next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..、]' + if next_top_level_pattern not in matched_patterns: + matched_patterns.append(next_top_level_pattern) + except ValueError: + print(f"ValueError: Unable to convert '{current_section_number.split('.')[0]}' to int.") - current_top_level_num = int(current_section_number.split('.')[0]) - for i in range(1, 6): - next_top_level_num = current_top_level_num + i - next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..]' - if next_top_level_pattern not in matched_patterns: - matched_patterns.append(next_top_level_pattern) + combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' + try: + current_section_pattern = re.compile(combined_pattern) + except re.error as re_err: + print(f"Regex compilation error: {re_err} with pattern '{combined_pattern}'") - combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' - current_section_pattern = re.compile(combined_pattern) + else: + process_matching_section() # Reusing the common function for matching sections - else: - process_matching_section() # Reusing the common function for matching sections + except Exception as e: + print(f"An unexpected error occurred in extract_from_text: {e} 当前index:{current_index}") return current_index @@ -676,7 +713,7 @@ if __name__ == '__main__': output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6" # invalid_added = insert_mark(pdf_path) # invalid_added_docx = pdf2docx(invalid_added) - invalid_added_docx=r'C:\Users\Administrator\Desktop\货物标\zbfiles\output6\invalid_added.docx' + invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\a3b259fd-5ade-4c66-9117-eb44796682ad\invalid_added.docx' results = combine_find_invalid(invalid_added_docx, output_dir) end_time = time.time() print("Results:", json.dumps(results, ensure_ascii=False, indent=4))