1.24 废标项
This commit is contained in:
parent
9117c4117b
commit
dbd217a83f
@ -318,36 +318,18 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
|
||||
|
||||
if continue_collecting:
|
||||
if text == '[$$table_start$$]':
|
||||
try:
|
||||
current_index += 1
|
||||
while processed_paragraphs[current_index] != '[$$table_over$$]':
|
||||
while (processed_paragraphs[current_index] != '[$$table_over$$]'):
|
||||
extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
|
||||
current_index += 1
|
||||
except IndexError:
|
||||
print(f"IndexError: current_index={current_index} during table processing.")
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred while processing table: {e}")
|
||||
return current_index
|
||||
|
||||
if current_section_pattern:
|
||||
try:
|
||||
if re.match(current_section_pattern, text):
|
||||
if current_section_pattern and re.match(current_section_pattern, text):
|
||||
continue_collecting = False
|
||||
active_key = None
|
||||
except re.error as re_err:
|
||||
print(f"Regex error: {re_err} with pattern {current_section_pattern} and text '{text}'")
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred while matching section pattern: {e}")
|
||||
else:
|
||||
if active_key is not None:
|
||||
try:
|
||||
extracted_paragraphs[active_key].append(text)
|
||||
except KeyError:
|
||||
print(f"KeyError: active_key='{active_key}' not found in extracted_paragraphs.")
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred while appending text: {e}")
|
||||
return current_index
|
||||
|
||||
try:
|
||||
if match_keywords(text, keywords):
|
||||
active_key = text
|
||||
@ -713,7 +695,7 @@ if __name__ == '__main__':
|
||||
output_dir = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
|
||||
# invalid_added = insert_mark(pdf_path)
|
||||
# invalid_added_docx = pdf2docx(invalid_added)
|
||||
invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\a3b259fd-5ade-4c66-9117-eb44796682ad\invalid_added.docx'
|
||||
invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\355fbb20-e439-48dd-abdd-7645d424af36\invalid_added.docx'
|
||||
results = combine_find_invalid(invalid_added_docx, output_dir)
|
||||
end_time = time.time()
|
||||
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
||||
|
Loading…
x
Reference in New Issue
Block a user