def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    """Collect keyword paragraphs (and the items that follow them) from a document.

    Every non-empty paragraph that matches one of ``keywords`` becomes a key of
    the result; its value is a list that starts with the paragraph itself.  If
    the paragraph also matches one of ``follow_up_keywords``, the paragraphs
    that follow it are appended to that list:

    * When the paragraph starts with a section number (``3``, ``3.4``,
      ``3 . 4 . 5`` ...), following paragraphs are collected until one starts
      with a number of the same depth, the next number at the same level, or
      the next number at the parent level.
    * Otherwise the first following paragraph decides the item numbering style
      (``1.2``/``a``/``(1)`` ...) and consecutive paragraphs in that style are
      collected; collection stops at the first paragraph that does not fit.

    The paragraph that ends a collection is itself examined for keywords, so a
    heading that closes one section can open the next one.

    Args:
        doc_path: Path (or anything else ``docx.Document`` accepts) of the
            ``.docx`` file to read.  A list or tuple of paragraph strings may
            be passed instead, in which case python-docx is not imported.
        keywords: Regular expression, or iterable of regular expressions,
            matched case-insensitively with ``re.search``.
        follow_up_keywords: Regular expression, or iterable of regular
            expressions, that mark a keyword paragraph as introducing a list
            of following items.

    Returns:
        ``collections.OrderedDict`` mapping each matching paragraph's stripped
        text to the list of collected paragraph texts, in document order.  If
        two matching paragraphs have identical text, the later one replaces
        the earlier entry.
    """
    from collections import OrderedDict
    import re

    # A bare string would otherwise be iterated character by character.
    if isinstance(keywords, str):
        keywords = [keywords]
    if isinstance(follow_up_keywords, str):
        follow_up_keywords = [follow_up_keywords]

    # Read and strip every paragraph exactly once: ``Document.paragraphs``
    # builds a fresh list on each access, so indexing it inside the loops
    # below would make the scan quadratic.
    if isinstance(doc_path, (list, tuple)):
        texts = [text.strip() for text in doc_path]
    else:
        from docx import Document
        texts = [paragraph.text.strip() for paragraph in Document(doc_path).paragraphs]

    keyword_res = [re.compile(pattern, re.IGNORECASE) for pattern in keywords]
    follow_up_res = [re.compile(pattern, re.IGNORECASE) for pattern in follow_up_keywords]

    def matches_any(text, regexes):
        """Return True when any compiled regex is found somewhere in *text*."""
        return any(regex.search(text) for regex in regexes)

    def build_stop_pattern(section_number):
        """Build the regex recognising the heading that ends a numbered section."""
        depth = section_number.count('.')
        # Any number with the same depth as the current one, e.g. 3.4.5 -> d.d.d
        same_depth = r'^' + (r'\d+\s*\.\s*') * depth + r'\d+'
        # Strip the parts so "3 . 4" yields the same patterns as "3.4".
        parts = [part.strip() for part in section_number.split('.')]
        # Next section at the same level.
        parts[-1] = str(int(parts[-1]) + 1)
        alternatives = [same_depth, r'^' + r'\s*\.\s*'.join(parts)]
        # Next section at the parent level (if there is a parent).
        if len(parts) > 1:
            parent_parts = parts[:-1]
            parent_parts[-1] = str(int(parent_parts[-1]) + 1)
            alternatives.append(r'^' + r'\s*\.\s*'.join(parent_parts))
        return re.compile(r'(' + r')|('.join(alternatives) + r')')

    def build_item_pattern(first_item):
        """Derive the item-numbering regex from the first look-ahead paragraph.

        Returns None when the paragraph does not start with a recognised number.
        """
        numbering = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', first_item)
        if not numbering:
            return None
        if numbering.group(1):
            segments = numbering.group(1).split('.')
            return re.compile(r'^' + r'\.'.join(r'[A-Za-z0-9]+' for _ in segments) + r'\b')
        # NOTE(review): each class lists the same bracket twice; the second may
        # have been a full-width bracket originally - confirm with the author.
        return re.compile(r'^[\(\(]\d+[\)\)]')

    extracted_paragraphs = OrderedDict()
    active_key = None
    stop_pattern = None
    collecting = False

    index = 0
    total = len(texts)
    while index < total:
        text = texts[index]
        index += 1
        if not text:
            continue

        if collecting:
            if stop_pattern is None or not stop_pattern.match(text):
                extracted_paragraphs[active_key].append(text)
                continue
            # The next section heading ends the collection.  Fall through so
            # the heading itself is still checked against the keywords.
            collecting = False
            active_key = None

        if not matches_any(text, keyword_res):
            continue

        active_key = text
        extracted_paragraphs[active_key] = [text]
        if not matches_any(text, follow_up_res):
            continue

        heading_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
        if heading_number:
            # Numbered heading: keep collecting until the next heading shows up.
            stop_pattern = build_stop_pattern(heading_number.group(1))
            collecting = True
            continue

        # Un-numbered heading: look ahead and take the run of paragraphs that
        # share the numbering style of the first one.
        stop_pattern = None
        item_pattern = None
        while index < total:
            candidate = texts[index]
            if item_pattern is None:
                item_pattern = build_item_pattern(candidate)
            if item_pattern is None or not item_pattern.match(candidate):
                # Leave ``index`` on this paragraph so the main loop processes
                # it normally instead of silently skipping it.
                break
            extracted_paragraphs[active_key].append(candidate)
            index += 1
        active_key = None

    return extracted_paragraphs


doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).docx'