import json
import os
import re

import docx
import fitz


def extract_text_from_docx(file_path: str) -> str:
    """Return the text of every paragraph in a .docx file.

    Args:
        file_path: Path of the Word document to read.

    Returns:
        The text of ``doc.paragraphs`` joined with newline characters.
    """
    doc = docx.Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)


def clean_page_numbers(text: str) -> str:
    """Strip page-number artefacts from the text of a single page.

    Three substitutions are applied in order:

    1. digits (plus surrounding whitespace) at the very start of ``text``;
    2. whitespace followed by digits at the very end of ``text``;
    3. every fragment shaped like ``/123`` or ``123/``.

    NOTE: rule 3 is not anchored, so it also removes slash-number fragments
    that appear inside ordinary content (for example a date written as
    ``2020/01/02``).

    Args:
        text: Raw text of one page.

    Returns:
        The text with the matched fragments removed.
    """
    # Page number at the start of the page; it must be followed by at least
    # one whitespace character.
    cleaned_text = re.sub(r'^\s*\d+\s+', '', text)
    # Page number at the end of the page; it must be preceded by at least
    # one whitespace character.
    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
    # Page numbers written as "/123" or "123/".
    cleaned_text = re.sub(r'\s*/\s*\d+\s*|\s*\d+\s*/\s*', '', cleaned_text)
    return cleaned_text


def extract_text_from_pdf(file_path: str) -> str:
    """Return the concatenated, page-number-cleaned text of a PDF.

    Each page's text is passed through ``clean_page_numbers`` before the
    pages are concatenated (no separator is inserted between pages).

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The cleaned text of all pages, in page order.
    """
    # The context manager closes the document even if a page fails to
    # extract; the previous version never closed it.
    with fitz.open(file_path) as doc:
        return ''.join(clean_page_numbers(page.get_text()) for page in doc)


def extract_section(text: str, start_pattern: str, end_phrases) -> str:
    """Cut out the part of ``text`` between a start pattern and an end phrase.

    The section starts at the first match of ``start_pattern``. The end
    phrases are tried in the order given; the first phrase that matches
    anywhere at or after the section start ends the section (later phrases
    are not consulted, even if they would match earlier in the text). If no
    phrase matches, the section runs to the end of ``text``.

    Args:
        text: Text to search.
        start_pattern: Regular expression marking the start of the section.
        end_phrases: Iterable of regular expressions marking its end.

    Returns:
        The extracted section, or ``""`` if ``start_pattern`` does not match.
    """
    start_match = re.search(start_pattern, text)
    if not start_match:
        return ""  # no start pattern found

    start_index = start_match.start()
    end_index = len(text)
    for phrase in end_phrases:
        # In a multi-line string such as "Hello, world!\nWelcome to OpenAI.",
        # re.MULTILINE makes ^ match at the start of every line; without it
        # ^ would only match at the start of the whole string.
        match = re.search(phrase, text[start_index:], re.MULTILINE)
        if match:
            # match.start() is relative to the slice, so shift it back.
            end_index = start_index + match.start()
            break
    return text[start_index:end_index]


def compare_headings(current: str, new: str) -> bool:
    """Tell whether heading number ``new`` comes after ``current``.

    Both arguments are dotted numbers such as ``"2.3.1"``. Parts that are
    empty or not purely digits are ignored.

    Args:
        current: Number of the heading currently being processed.
        new: Number of the candidate heading.

    Returns:
        ``True`` if the first differing component of ``new`` is larger, or if
        all shared components are equal and ``new`` has more components (a
        sub-section); ``False`` otherwise, including for equal numbers.
    """
    # Keep only the non-empty, purely numeric components.
    current_nums = [int(num) for num in current.split('.') if num.isdigit()]
    new_nums = [int(num) for num in new.split('.') if num.isdigit()]

    # The first component that differs decides the ordering.
    for c, n in zip(current_nums, new_nums):
        if n > c:
            return True
        if n < c:
            return False

    # Shared prefix is identical: a deeper number is a new sub-section.
    return len(new_nums) > len(current_nums)


def should_add_newline(content, keywords, max_length: int = 20) -> bool:
    """Decide whether a newline should follow the content collected so far.

    Args:
        content: Iterable of string pieces collected so far.
        keywords: Substrings that force a newline when present.
        max_length: Content of at most this many characters gets a newline.

    Returns:
        ``True`` if the joined, stripped content contains any keyword or is
        no longer than ``max_length`` characters.
    """
    content_str = ''.join(content).strip()
    return (
        any(keyword in content_str for keyword in keywords)
        or len(content_str) <= max_length
    )


def handle_content_append(current_content: list, line_content: str,
                          append_newline: bool, keywords) -> bool:
    """Append ``line_content`` to ``current_content``, optionally after a newline.

    ``current_content`` is modified in place. When ``append_newline`` is set,
    a ``'\\n'`` piece is inserted first if ``should_add_newline`` approves,
    and the flag is cleared either way.

    Args:
        current_content: List of string pieces being accumulated.
        line_content: Piece to append.
        append_newline: Whether a newline is pending before this piece.
        keywords: Keywords forwarded to ``should_add_newline``.

    Returns:
        The updated ``append_newline`` flag (``False`` once it was consumed).
    """
    if append_newline:
        if should_add_newline(current_content, keywords):
            current_content.append('\n')  # keep the line break
        append_newline = False
    current_content.append(line_content)
    return append_newline


# Extra handling for second-level headings such as x.x: if the content being
# processed contains one of the keywords, the newline must be kept; if the
# content is longer than 20 characters, the newline is not kept.
#
# NOTE(review): the definition of parse_text_by_heading is cut off in the
# available source, in the middle of the heading regex. The visible part is
# kept below verbatim (commented out so the module still parses); restore the
# complete function from the original file before calling it.
#
# def parse_text_by_heading(text):
#     keywords = ['包含', '以下']
#     data = {}
#     current_key = None
#     current_content = []
#     append_newline = False
#     lines = text.split('\n')
#     for i, line in enumerate(lines):
#         line_stripped = line.strip()
#         # Match headings such as '1.1' or '2.2.3' (at least one dot), making
#         # sure they are not preceded or followed by a letter or a bracket.
#         match = re.match(r'^(?