import re from PyPDF2 import PdfReader from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header def compare_headings(current, new): # 使用过滤来确保只处理非空且为数字的部分 current_nums = [int(num) for num in current.split('.') if num.isdigit()] new_nums = [int(num) for num in new.split('.') if num.isdigit()] # 比较数字序列以确定标题的层次关系 for c, n in zip(current_nums, new_nums): if n > c: return True elif n < c: return False # 如果新标题有更多层次,认为是新的子章节 return len(new_nums) > len(current_nums) def should_add_newline(content, keywords, max_length=20): content_str = ''.join(content).strip() return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length def handle_content_append(current_content, line_content, append_newline, keywords,in_special_section): if append_newline: if should_add_newline(current_content, keywords): current_content.append('\n') # 添加换行符 append_newline = False current_content.append(line_content) if in_special_section: current_content.append('\n') return append_newline def parse_text_by_heading(text): keywords = ['包含', '以下'] data = {} current_key = None current_key_chinese = None current_value_chinese = None current_content = [] append_newline = False skip_subheadings = False last_main_number = None temp_title = None # 临时存储以点号开头但不带数字的标题 pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') initial_heading_pattern = None special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:'] # 定义特殊章节关键词 in_special_section = False # 标志是否在特殊章节中 lines = text.split('\n') def check_year_pattern(line): # 检查是否为年份模式,如 '2014年' line_no_spaces = line.replace(' ', '') return re.match(r'^\d{4}\s*年', line_no_spaces) is not None def is_heading(line): # 检查是否匹配任何标题模式 patterns = [ r'^(? current_key_num and new_key_num - current_key_num <= 5)): # 处理临时标题与新主标题的关联 if temp_title: main_number = new_key_candidate.split('.')[0] data[f"{main_number}."] = temp_title temp_title = None # 重置临时标题 # 开始新的标题 if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') current_key = new_key_candidate current_content = [line_content] append_newline = True last_main_number = new_key_candidate.rstrip('.') else: # 将当前行视为当前标题的内容 append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section) else: if not skip_subheadings and not in_special_section: # 增加 in_special_section 判断 numbered_match = pattern_numbered.match(line_stripped) parenthesis_match = pattern_parentheses.match(line_stripped) letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped) arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped) # 判断当前行是否匹配了任何标题模式 if numbered_match or parenthesis_match or letter_match or arabic_match: # 如果初始标题模式尚未设置,则记录当前匹配的标题模式 if initial_heading_pattern is None: if numbered_match: initial_heading_pattern = 'numbered' elif parenthesis_match: initial_heading_pattern = 'parentheses' elif letter_match: initial_heading_pattern = 'letter' elif arabic_match: initial_heading_pattern = 'arabic' # 确定当前匹配的标题模式 if numbered_match: current_heading_pattern = 'numbered' elif parenthesis_match: current_heading_pattern = 'parentheses' elif letter_match: current_heading_pattern = 'letter' elif arabic_match: current_heading_pattern = 'arabic' # 如果当前标题模式与初始标题模式一致,创建新的键值对 if current_heading_pattern == initial_heading_pattern: # 保存之前的 key 的内容 if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') if current_key_chinese is not None: data[current_key_chinese] = current_value_chinese current_key_chinese = None # 处理匹配到的标题 if current_heading_pattern == 'numbered': current_key_chinese = numbered_match.group(1) current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '') elif current_heading_pattern == 'parentheses': current_key_chinese = parenthesis_match.group(1) current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('..、,').replace(' ', '') elif current_heading_pattern == 'letter': # 字母标题处理 letter_key, letter_value = letter_match.groups() letter_to_chinese = { 'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五', 'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十', 'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五' } current_key_chinese = letter_to_chinese.get(letter_key, letter_key) current_value_chinese = letter_value.lstrip('..、,').replace(' ', '') elif current_heading_pattern == 'arabic': arabic_key, arabic_value = arabic_match.groups() current_key = arabic_key.replace('、', '.') current_content = [arabic_value] append_newline = True last_main_number = current_key.rstrip('.') continue else: # 当前标题模式与初始模式不一致,将该行视为内容 if line_stripped: append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section) else: # 未匹配到任何标题模式,将该行视为内容 if line_stripped: append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section) else: # 在特殊章节中,所有内容都作为当前标题的内容 if line_stripped: append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section) # 最后保存最后一个 key 对应的内容 if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') if current_key_chinese is not None: data[current_key_chinese] = current_value_chinese if temp_title: # 处理任何未关联的临时标题 data["未分类标题"] = temp_title # 您可以根据需要选择合适的键名 return data def extract_text_from_pdf(file_path, start_word, end_pattern): # 从PDF文件中提取文本 common_header = extract_common_header(file_path) pdf_document = PdfReader(file_path) exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') #仅匹配第一页和最后一页,不需要exclusion_pattern all_pages_text = [] start_index = None # 处理所有页面 for i, page in enumerate(pdf_document.pages): page_text = page.extract_text() if page.extract_text() else "" cleaned_text = clean_page_content(page_text, common_header) # print(cleaned_text) # 在第一页查找开始位置 if i == 0 and start_index is None: start_match = re.search(start_word, cleaned_text, re.MULTILINE) if start_match: start_index = start_match.start() cleaned_text = cleaned_text[start_index:] # 在最后一页查找结束位置 if i == len(pdf_document.pages) - 1: matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE)) if matches: end_index = matches[-1].start() cleaned_text = cleaned_text[:end_index] all_pages_text.append(cleaned_text) # 合并所有页面的文本 full_text = "\n".join(all_pages_text) return full_text