100 lines
4.5 KiB
Python
100 lines
4.5 KiB
Python
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    """Collect the paragraphs that follow keyword-matching headings in a .docx.

    Every non-empty paragraph (after ``strip()``) is tested against
    ``keywords``.  A matching paragraph becomes a key of the result and the
    first element of its own value list.  If that paragraph also matches
    ``follow_up_keywords``, the paragraphs after it are collected too:

    * Heading starts with a dotted section number (``3.4 ...``): every
      following non-empty paragraph is collected until one starts with a
      number of the same depth, the next sibling number, or the next parent
      number.
    * Heading has no leading number: the paragraph right after it defines an
      item-numbering style (``1.2``-like or ``(1)``-like) and the consecutive
      paragraphs matching that style are collected.

    Args:
        doc_path: Path (or anything ``docx.Document`` accepts) of the document,
            or an already loaded document object exposing ``.paragraphs``
            whose items have a ``.text`` attribute.
        keywords: A regex string or an iterable of regex strings (matched with
            ``re.search`` and ``re.IGNORECASE``) that select headings.
        follow_up_keywords: A regex string or an iterable of regex strings; a
            heading that also matches one of them has its body collected.
            ``None`` means "no follow-up keywords".

    Returns:
        ``OrderedDict`` mapping the heading text to the list of collected
        paragraph texts (heading first), in document order.  A heading text
        that occurs twice overwrites the earlier entry.
    """
    from collections import OrderedDict
    import re

    def as_pattern_list(value):
        # A bare string would otherwise be iterated character by character and
        # every character used as its own regex.
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    def match_keywords(text, patterns):
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def build_section_end_pattern(section_number):
        """Regex recognising the paragraph that ends a numbered section."""
        level_count = section_number.count('.')
        # Any number with the same depth as the heading, e.g. 3.4.5.
        # NOTE(review): re.match only anchors the start, so a deeper number
        # such as 3.4.1 also matches and ends the section - confirm intended.
        alternatives = [r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+']

        # Next section on the same level, e.g. 3.4 -> 3.5.
        parts = section_number.split('.')
        parts[-1] = str(int(parts[-1]) + 1)
        alternatives.append(r'^' + r'\s*\.\s*'.join(parts))

        # Next section on the parent level, e.g. 3.4 -> 4.
        if len(parts) > 1:
            parent_parts = parts[:-1]
            parent_parts[-1] = str(int(parent_parts[-1]) + 1)
            alternatives.append(r'^' + r'\s*\.\s*'.join(parent_parts))

        return re.compile(r'(' + r')|('.join(alternatives) + r')')

    def build_item_pattern(first_item_text):
        """Regex for the numbering style used by *first_item_text*, or None."""
        numbering = re.match(
            r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', first_item_text)
        if not numbering:
            return None
        if numbering.group(1):
            depth = len(numbering.group(1).split('.'))
            dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+'] * depth) + r'\b'
        else:
            dynamic_pattern = r'^[\(\(]\d+[\)\)]'
        return re.compile(dynamic_pattern)

    keywords = as_pattern_list(keywords)
    follow_up_keywords = as_pattern_list(follow_up_keywords)

    if hasattr(doc_path, "paragraphs"):
        # Already-loaded document: avoid parsing the file a second time.
        doc = doc_path
    else:
        from docx import Document
        doc = Document(doc_path)

    # Read the paragraph list exactly once; python-docx builds a fresh list on
    # each access to ``.paragraphs``, which made the original scan quadratic.
    texts = [paragraph.text.strip() for paragraph in doc.paragraphs]
    total = len(texts)

    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None

    index = 0
    while index < total:
        text = texts[index]
        index += 1
        if text == "":
            continue

        if continue_collecting:
            if current_section_pattern and current_section_pattern.match(text):
                # This paragraph opens the next section; stop collecting and
                # fall through so it can be tested as a heading itself.
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
                continue

        if not match_keywords(text, keywords):
            continue

        active_key = text
        extracted_paragraphs[active_key] = [text]
        if not match_keywords(text, follow_up_keywords):
            continue

        continue_collecting = True
        section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
        if section_number:
            current_section_pattern = build_section_end_pattern(
                section_number.group(1))
            continue

        # Un-numbered heading: the paragraph right after it decides which
        # numbering style the collected items must share.
        current_section_pattern = (
            build_item_pattern(texts[index]) if index < total else None)
        while index < total:
            candidate = texts[index]
            if current_section_pattern and current_section_pattern.match(candidate):
                extracted_paragraphs[active_key].append(candidate)
                index += 1
            else:
                # Leave ``index`` on the paragraph that ended the run so the
                # outer loop still tests it as a heading (it used to be
                # skipped silently).
                continue_collecting = False
                active_key = None
                break

    return extracted_paragraphs
|
||
|
||
# NOTE(review): hard-coded, machine-specific absolute Windows path to a .docx
# file; presumably the sample input for extract_text_with_keywords - confirm
# against the code that consumes doc_path.
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).docx'
|