# zbparse/flask_app/main/废标项.py

import fitz  # PyMuPDF
import re

def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords):
    """
    Extract the text blocks of a PDF that contain any of the given keywords.

    Every page is read with ``page.get_text("blocks")``. A block is collected
    when it contains at least one entry of ``keywords``. If such a block also
    contains one of ``follow_up_keywords``, the blocks that follow it are
    collected as well ("continuation mode") until a block matches the
    terminating section-number pattern derived from the trigger block:

    * trigger block starts with a dotted number (e.g. ``3.1``): collection
      stops at the next block starting with ``<parent>.<digits>``;
    * otherwise the remaining blocks of the *same page* are scanned for the
      first consecutive run of blocks that start with an alphanumeric
      section label, and the last label of that run becomes the terminator.
      If no such block exists, the previously active pattern (if any) is
      left in place.

    Each block is appended at most once, even when it is both inside a
    continuation run and contains a keyword itself.

    :param pdf_path: Path of the PDF file to read.
    :param keywords: List of keywords; a block containing any of them is collected.
    :param follow_up_keywords: List of keywords that switch on continuation mode.
    :return: List of collected block texts, in document order.
    """
    extracted_paragraphs = []
    continue_collecting = False
    current_section_pattern = None

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text_blocks = page.get_text("blocks")
            for index, block in enumerate(text_blocks):
                text = block[4].strip()  # Element 4 of a block tuple is its text
                if not text:  # Skip empty blocks
                    continue

                already_collected = False
                if continue_collecting:
                    if (current_section_pattern is not None
                            and current_section_pattern.match(text)):
                        # Reached the next sibling section: stop the run.
                        continue_collecting = False
                    else:
                        extracted_paragraphs.append(text)
                        already_collected = True

                if not any(keyword in text for keyword in keywords):
                    continue
                # Guard against appending the same block a second time when
                # it was already picked up by the continuation run above.
                if not already_collected:
                    extracted_paragraphs.append(text)

                if not any(follow_up in text for follow_up in follow_up_keywords):
                    continue

                continue_collecting = True
                section_number = re.match(r'(\d+(\.\d+)*)', text)
                if section_number:
                    # "3.1.2" -> parent "3.1"; terminate on "3.1.<digits>".
                    base_section_number = section_number.group(1).rsplit('.', 1)[0]
                    current_section_pattern = re.compile(
                        rf'^{re.escape(base_section_number)}\.\d+\b'
                    )
                else:
                    following_pattern = _find_following_section_pattern(
                        text_blocks, index + 1
                    )
                    if following_pattern is not None:
                        current_section_pattern = following_pattern
    finally:
        # Release the document even if block processing raises.
        doc.close()
    return extracted_paragraphs


def _find_following_section_pattern(text_blocks, start):
    """Return a terminator pattern built from the numbered blocks after ``start``.

    Scans ``text_blocks[start:]`` for the first consecutive run of blocks whose
    text begins with an alphanumeric, optionally dotted, label and returns a
    compiled pattern matching the last label of that run, or ``None`` when no
    block carries such a label.
    """
    pattern = None
    for next_block in text_blocks[start:]:
        next_text = next_block[4].strip()
        label = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
        if label:
            # Escape the label so that "." only matches a literal dot.
            pattern = re.compile(rf'^{re.escape(label.group(1))}\b')
        elif pattern is not None:
            break
    return pattern

# Example usage: run the extractor on a sample tender document.
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx'
keywords = ['否决', '无效投标', '被拒绝', '予以拒绝']
follow_up_keywords = ['情形之一']
extracted_contents = extract_text_with_keywords(
    pdf_path=doc_path,
    keywords=keywords,
    follow_up_keywords=follow_up_keywords,
)

# Dump every extracted paragraph on its own line (no de-duplication is
# performed at this point).
output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt"
with open(output_file, 'w', encoding='utf-8') as out_fh:
    out_fh.writelines(f"{content}\n" for content in extracted_contents)

# file_id = upload_file(output_file)
# user_query="根据文本中的内容，请你回答，否决投标和拒绝投标的情况有哪些？由于文本中存在冗余且无关的信息，且文本中的序号是混乱的，请你重新编排答案返回给我，以json格式返回，你的回答仅限于原文内容但删去原有序号，请不要自己组织语言。"
# res=qianwen_long(file_id,user_query)
# print("Query Result:", res)