import re

import fitz  # PyMuPDF

# Leading dotted numeric section number at the start of a block, e.g. "3.2.1".
_NUMERIC_SECTION_RE = re.compile(r'(\d+(\.\d+)*)')
# Leading dotted alphanumeric label at the start of a block, e.g. "A.1" or "12".
_GENERIC_SECTION_RE = re.compile(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)')


def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords):
    """Extract the text blocks of a document that contain given keywords.

    Every non-empty text block returned by ``page.get_text("blocks")`` is
    examined. A block containing any of ``keywords`` is extracted. If such a
    block also contains one of ``follow_up_keywords``, the blocks that follow
    it are extracted as well until a block matching the computed "stop"
    section pattern is reached. Collection carries over page boundaries.

    The stop pattern is derived as follows:

    * If the triggering block starts with a dotted number (e.g. ``3.2``), the
      pattern matches the same prefix followed by another number
      (``3.<digits>``).
    * Otherwise the following blocks on the same page are scanned: the label
      of the last block in the first consecutive run of labelled blocks is
      used. If no labelled block is found, the previous pattern (if any) is
      kept.

    :param pdf_path: Path of the document to open with PyMuPDF.
    :param keywords: Keywords; a block containing any of them is extracted.
    :param follow_up_keywords: Keywords that, when present in an extracted
        block, trigger extraction of the following blocks.
    :return: List of extracted block texts, in document order. Each block is
        emitted at most once per occurrence in the document.
    """
    extracted_paragraphs = []
    continue_collecting = False
    current_section_pattern = None

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text_blocks = page.get_text("blocks")
            for index, block in enumerate(text_blocks):
                text = block[4].strip()  # Element 4 holds the block's text.
                if not text:
                    continue

                # Tracks whether this block was already emitted by the
                # follow-up collection, so a keyword hit does not add it twice.
                already_added = False
                if continue_collecting:
                    if current_section_pattern and current_section_pattern.match(text):
                        continue_collecting = False
                    else:
                        extracted_paragraphs.append(text)
                        already_added = True

                if not any(keyword in text for keyword in keywords):
                    continue
                if not already_added:
                    extracted_paragraphs.append(text)

                if not any(follow_up in text for follow_up in follow_up_keywords):
                    continue

                # This block announces a list of cases: keep collecting the
                # blocks after it until the stop pattern matches.
                continue_collecting = True
                section_number = _NUMERIC_SECTION_RE.match(text)
                if section_number:
                    # Drop the last component ("3.2" -> "3"); a block starting
                    # with "<base>.<digits>" ends the collection.
                    base_section_number = section_number.group(1).rsplit('.', 1)[0]
                    current_section_pattern = re.compile(
                        rf'^{re.escape(base_section_number)}\.\d+\b'
                    )
                    continue

                # No number on the trigger block: look ahead on this page for
                # the first run of labelled blocks and use its last label.
                found_next_number = False
                for next_block in text_blocks[index + 1:]:
                    next_section_number = _GENERIC_SECTION_RE.match(next_block[4].strip())
                    if next_section_number:
                        found_next_number = True
                        # Escape the label so "." is matched literally.
                        current_section_pattern = re.compile(
                            rf'^{re.escape(next_section_number.group(1))}\b'
                        )
                    elif found_next_number:
                        break
    finally:
        # Release the document even if reading a page fails.
        doc.close()

    return extracted_paragraphs


# Example usage
# NOTE(review): the path has a .docx extension although the function is
# documented for PDF input -- confirm PyMuPDF can open this file.
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx'
keywords = ['否决', '无效投标', '被拒绝', '予以拒绝']
follow_up_keywords = ['情形之一']
extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords)

# Write each extracted paragraph on its own line.
output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    for content in extracted_contents:
        file.write(content + '\n')

# Disabled follow-up step (upload the result file and query a model), kept
# verbatim for reference:
# file_id = upload_file(output_file)
# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。"
# res=qianwen_long(file_id,user_query)
# print("Query Result:", res)