# Python source file, 66 lines, 3.4 KiB (file-viewer metadata).
import fitz # PyMuPDF
|
||
import re
|
||
|
||
# Dotted numeric section number at the start of a block, e.g. "3", "3.1", "3.1.2".
_NUMERIC_SECTION_RE = re.compile(r'(\d+(\.\d+)*)')
# Dotted alphanumeric label at the start of a block, e.g. "1", "a", "A.2".
_ALNUM_SECTION_RE = re.compile(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)')
# End-of-label guard used instead of ``\b``. In str patterns ``\b`` is
# Unicode-aware, so it does not fire between a digit and a CJK character and a
# heading such as "3.2投标" would never be recognised. This guard only refuses
# to end a label in front of an ASCII word character.
_LABEL_END = r'(?![0-9A-Za-z_])'


def _sibling_section_pattern(text):
    """Build the stop pattern from the section number that starts *text*.

    The last dotted component of the number is dropped and the pattern matches
    ``<base>.<digits>`` at the start of a block ("3.1" -> "3.<n>"; an undotted
    "3" also yields "3.<n>"). Returns None when *text* does not start with a
    digit sequence.
    """
    numbered = _NUMERIC_SECTION_RE.match(text)
    if numbered is None:
        return None
    base_section_number = numbered.group(1).rsplit('.', 1)[0]
    return re.compile(rf'^{re.escape(base_section_number)}\.\d+{_LABEL_END}')


def _lookahead_section_pattern(text_blocks, start):
    """Derive the stop pattern from the blocks of the page at index *start* onwards.

    Blocks are scanned until the first run of consecutive blocks that begin
    with an alphanumeric label ends; the label of the LAST block of that run
    becomes the pattern. Returns None when no such block exists on the page.
    """
    pattern = None
    for candidate in text_blocks[start:]:
        label = _ALNUM_SECTION_RE.match(candidate[4].strip())
        if label:
            # re.escape keeps the dots of the label literal.
            pattern = re.compile(rf'^{re.escape(label.group(1))}{_LABEL_END}')
        elif pattern is not None:
            break
    return pattern


def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords):
    """
    Extract the text blocks of a document that contain any of the given keywords.

    Each page is read with ``page.get_text("blocks")`` and every non-empty
    block is treated as one paragraph. A block containing one of *keywords* is
    extracted. If that block also contains one of *follow_up_keywords*, the
    blocks after it are extracted as well until a block matches the current
    section pattern (see the two helpers for how that pattern is derived).
    The collecting state carries over from one page to the next; the
    look-ahead for a section label only inspects the current page.

    :param pdf_path: Path of the document passed to ``fitz.open``.
    :param keywords: Substrings; a block containing any of them is extracted.
    :param follow_up_keywords: Substrings that switch on continuous collection
        when they occur in an extracted block.
    :return: List of the extracted (stripped) block texts in reading order.
        A block is added at most once per visit, even when it qualifies both
        through continuous collection and through a keyword.
    """
    extracted_paragraphs = []
    continue_collecting = False
    current_section_pattern = None

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text_blocks = page.get_text("blocks")
            for index, block in enumerate(text_blocks):
                text = block[4].strip()  # element 4 holds the block's text
                if not text:
                    continue

                already_added = False
                if continue_collecting:
                    if current_section_pattern and current_section_pattern.match(text):
                        # Reached the next section heading: stop collecting.
                        continue_collecting = False
                    else:
                        extracted_paragraphs.append(text)
                        already_added = True

                if not any(keyword in text for keyword in keywords):
                    continue
                if not already_added:
                    extracted_paragraphs.append(text)
                if not any(follow_up in text for follow_up in follow_up_keywords):
                    continue

                continue_collecting = True
                pattern = _sibling_section_pattern(text)
                if pattern is None:
                    pattern = _lookahead_section_pattern(text_blocks, index + 1)
                if pattern is not None:
                    # When nothing could be derived the previous pattern is kept.
                    current_section_pattern = pattern
    finally:
        # Release the document even if extraction fails part-way.
        doc.close()
    return extracted_paragraphs
|
||
|
||
# Example usage
# NOTE(review): this path ends in ".docx" although the function opens it with
# PyMuPDF as a PDF -- confirm that the installed PyMuPDF can open this format.
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx'
# Keywords, roughly: "veto/reject", "invalid bid", "be rejected", "shall be rejected".
keywords = ['否决', '无效投标', '被拒绝', '予以拒绝']
# Roughly: "one of the following circumstances".
follow_up_keywords = ['情形之一']
extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords)

# Write the paragraphs to a file, one per line. Repeated paragraphs are
# dropped here; dict.fromkeys keeps the first occurrence and the original order.
output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt"
unique_contents = list(dict.fromkeys(extracted_contents))
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(content + '\n' for content in unique_contents)

# Disabled follow-up step: upload the file and query a long-context model.
# The prompt (kept verbatim) asks for the circumstances in which a bid is
# vetoed or rejected, re-numbered, returned as JSON, using only the original
# wording of the text.
# file_id = upload_file(output_file)
# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。"
# res=qianwen_long(file_id,user_query)
# print("Query Result:", res)