zbparse/flask_app/main/废标项.py
2024-08-29 16:37:09 +08:00

66 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fitz # PyMuPDF
import re
def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords):
    """
    Extract paragraphs from a PDF that contain any of the given keywords.

    If a keyword-matched paragraph also contains one of the follow-up
    keywords, collection continues through the subsequent text blocks
    until a block whose leading section number matches the next sibling
    numbering pattern is reached (e.g. inside section 3.2 collection
    stops at the next "3.x" heading).

    :param pdf_path: path to the PDF file.
    :param keywords: substrings that mark a paragraph for extraction.
    :param follow_up_keywords: substrings that trigger continued extraction.
    :return: list of extracted paragraph texts, in document order.
    """
    extracted_paragraphs = []
    continue_collecting = False
    current_section_pattern = None
    # Context manager guarantees the document is closed even if an
    # exception is raised mid-extraction (the original called
    # doc.close() with no try/finally and could leak the handle).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text_blocks = page.get_text("blocks")
            for index, block in enumerate(text_blocks):
                text = block[4].strip()  # block[4] is the text payload of a fitz block
                if not text:  # skip empty blocks
                    continue
                if continue_collecting:
                    if current_section_pattern and re.match(current_section_pattern, text):
                        # Reached the next sibling section: stop collecting,
                        # but still allow this block to be keyword-matched below.
                        continue_collecting = False
                    else:
                        extracted_paragraphs.append(text)
                        # Skip the keyword check for this block so it is not
                        # appended a second time (the original fell through
                        # and could produce duplicates).
                        continue
                if any(keyword in text for keyword in keywords):
                    extracted_paragraphs.append(text)
                    if any(follow_up in text for follow_up in follow_up_keywords):
                        continue_collecting = True
                        section_number = re.match(r'(\d+(\.\d+)*)', text)
                        if section_number:
                            # Stop pattern = next sibling of the current
                            # numbered section (same base, any final index).
                            current_section_number = section_number.group(1)
                            base_section_number = current_section_number.rsplit('.', 1)[0]
                            current_section_pattern = re.compile(
                                rf'^{re.escape(base_section_number)}\.\d+\b')
                        else:
                            # This block carries no number: look ahead for the
                            # next numbered block and stop at its numbering.
                            found_next_number = False
                            for next_index in range(index + 1, len(text_blocks)):
                                next_text = text_blocks[next_index][4].strip()
                                next_section_number = re.match(
                                    r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                                if next_section_number:
                                    found_next_number = True
                                    # re.escape: section numbers contain '.',
                                    # which would otherwise match any character.
                                    current_section_pattern = re.compile(
                                        rf'^{re.escape(next_section_number.group(1))}\b')
                                elif found_next_number:
                                    break
    return extracted_paragraphs
# Example usage: scan a tender document for bid-rejection clauses.
# NOTE(review): the path names a .docx file, but extract_text_with_keywords
# opens it with PyMuPDF as a PDF — confirm the file is actually a PDF
# (or convert it first), otherwise fitz.open will fail or misparse.
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx'
keywords = ['否决', '无效投标', '被拒绝', '予以拒绝']
follow_up_keywords = ['情形之一']
extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords)
# Drop duplicate paragraphs while preserving document order — the original
# comment promised duplicate handling but never performed it.
unique_contents = list(dict.fromkeys(extracted_contents))
output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    for content in unique_contents:
        file.write(content + '\n')
# Commented-out follow-up step kept verbatim so it can be re-enabled as-is:
# file_id = upload_file(output_file)
# user_query="根据文本中的内容请你回答否决投标和拒绝投标的情况有哪些由于文本中存在冗余且无关的信息且文本中的序号是混乱的请你重新编排答案返回给我以json格式返回你的回答仅限于原文内容但删去原有序号请不要自己组织语言。"
# res=qianwen_long(file_id,user_query)
# print("Query Result:", res)