diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index 0f2d857..9378299 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -318,36 +318,18 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword if continue_collecting: if text == '[$$table_start$$]': - try: + current_index += 1 + while (processed_paragraphs[current_index] != '[$$table_over$$]'): + extracted_paragraphs[active_key].append(processed_paragraphs[current_index]) current_index += 1 - while processed_paragraphs[current_index] != '[$$table_over$$]': - extracted_paragraphs[active_key].append(processed_paragraphs[current_index]) - current_index += 1 - except IndexError: - print(f"IndexError: current_index={current_index} during table processing.") - except Exception as e: - print(f"An unexpected error occurred while processing table: {e}") return current_index - - if current_section_pattern: - try: - if re.match(current_section_pattern, text): - continue_collecting = False - active_key = None - except re.error as re_err: - print(f"Regex error: {re_err} with pattern {current_section_pattern} and text '{text}'") - except Exception as e: - print(f"An unexpected error occurred while matching section pattern: {e}") + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + active_key = None else: if active_key is not None: - try: - extracted_paragraphs[active_key].append(text) - except KeyError: - print(f"KeyError: active_key='{active_key}' not found in extracted_paragraphs.") - except Exception as e: - print(f"An unexpected error occurred while appending text: {e}") - return current_index - + extracted_paragraphs[active_key].append(text) + return current_index try: if match_keywords(text, keywords): active_key = text @@ -713,7 +695,7 @@ if __name__ == '__main__': output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6" # invalid_added = insert_mark(pdf_path) # invalid_added_docx = pdf2docx(invalid_added) - invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\a3b259fd-5ade-4c66-9117-eb44796682ad\invalid_added.docx' + invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\355fbb20-e439-48dd-abdd-7645d424af36\invalid_added.docx' results = combine_find_invalid(invalid_added_docx, output_dir) end_time = time.time() print("Results:", json.dumps(results, ensure_ascii=False, indent=4))