zbparse/flask_app/main/投标人须知正文条款提取成json文件.py
2024-08-29 16:37:09 +08:00

153 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import docx
import fitz
import re
import os
def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a .docx file, one paragraph per line.

    Args:
        file_path: Path handed to ``docx.Document``.

    Returns:
        The paragraph texts joined with ``'\\n'``.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)
def clean_page_numbers(text):
    """Strip page-number artefacts from the text of a single page.

    Three substitutions are applied in order:

    1. a number at the very start of the text, followed by whitespace;
    2. a number at the very end of the text, preceded by whitespace;
    3. any ``/123`` or ``123/`` fragment, wherever it occurs.

    Args:
        text: Raw text of one page.

    Returns:
        The text with the matched fragments removed.
    """
    removal_patterns = (
        r'^\s*\d+\s+',                     # leading page number
        r'\s+\d+\s*$',                     # trailing page number
        r'\s*/\s*\d+\s*|\s*\d+\s*/\s*',    # "/123" or "123/" style markers
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF, with page numbers cleaned page by page.

    Args:
        file_path: Path handed to ``fitz.open``.

    Returns:
        The cleaned text of all pages, joined with ``'\\n'``.
    """
    # Use the document as a context manager so the file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        pages = [clean_page_numbers(page.get_text()) for page in doc]
    # Join with a newline: clean_page_numbers() strips a trailing page number
    # together with the whitespace around it, so plain concatenation would fuse
    # the last line of one page with the first line of the next and could hide
    # a heading that starts the next page.
    return '\n'.join(pages)
def extract_section(text, start_keyword, end_phrases):
    """Return the part of ``text`` from ``start_keyword`` up to an end phrase.

    Args:
        text: Full document text.
        start_keyword: Literal string marking the start of the section; its
            first occurrence is used.
        end_phrases: Regular-expression patterns marking the end of the
            section. They are tried in list order and the first pattern that
            matches anywhere after the start decides the cut-off.

    Returns:
        The section including ``start_keyword`` and excluding the end phrase,
        the text from ``start_keyword`` to the end when no pattern matches, or
        ``""`` when ``start_keyword`` is not found.
    """
    start_index = text.find(start_keyword)
    if start_index == -1:
        return ""

    tail = text[start_index:]
    for phrase in end_phrases:
        # re.MULTILINE lets a leading '^' in a pattern match at the start of
        # any line. Without it '^' only matches at the very start of `tail`,
        # which always begins with start_keyword, so anchored end phrases
        # could never match.
        match = re.search(phrase, tail, flags=re.MULTILINE)
        if match:
            return tail[:match.start()]
    return tail
def compare_headings(current, new):
    """Tell whether heading number ``new`` comes after ``current``.

    Both arguments are dotted numbers such as ``"1.2"`` or ``"3."``. Only the
    purely numeric parts between dots are compared; anything else (including
    the empty part after a trailing dot) is ignored.

    Returns:
        ``True`` if, at the first level where the two differ, ``new`` is
        larger, or if all shared levels are equal and ``new`` has more levels.
        ``False`` otherwise, including when both are identical.
    """
    def _levels(heading):
        return [int(piece) for piece in heading.split('.') if piece.isdigit()]

    existing = _levels(current)
    candidate = _levels(new)

    # The first differing level decides the ordering.
    for old_level, new_level in zip(existing, candidate):
        if new_level != old_level:
            return new_level > old_level

    # All shared levels are equal: a deeper heading counts as a new sub-section.
    return len(candidate) > len(existing)
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should be kept after the content so far.

    Args:
        content: Iterable of string chunks; they are concatenated and stripped
            before being examined.
        keywords: Substrings that force a line break when present.
        max_length: Content of at most this many characters also gets a break.

    Returns:
        ``True`` if the joined content is no longer than ``max_length`` or
        contains any of ``keywords``; ``False`` otherwise.
    """
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    for keyword in keywords:
        if keyword in joined:
            return True
    return False
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append ``line_content`` to ``current_content``, optionally after a line break.

    Args:
        current_content: List of string chunks collected so far; mutated in place.
        line_content: Text to append.
        append_newline: Whether a line break is still pending for this entry.
        keywords: Passed through to ``should_add_newline``.

    Returns:
        The new value of the pending-line-break flag: ``False`` once a pending
        break has been considered, otherwise ``append_newline`` unchanged.
    """
    if not append_newline:
        current_content.append(line_content)
        return append_newline

    # A break is pending: emit it only if the content so far qualifies, then
    # clear the flag either way so it is considered at most once per entry.
    if should_add_newline(current_content, keywords):
        current_content.append('\n')
    current_content.append(line_content)
    return False
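# Second-level headings (x.x) get extra handling: if the content collected so far contains one of
# `keywords`, the line break must be kept; if that content is longer than 20 characters, it is not kept.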
def parse_text_by_heading(text):
    """Split clause text into a ``{heading number: content}`` dictionary.

    The text is processed line by line. A line is a heading candidate when,
    after stripping, it starts with a multi-level number (``1.1``, ``2.2.3``)
    or with a single number and a dot followed by some text (``1. xxx``).
    A candidate opens a new entry when no entry is open yet or when
    ``compare_headings`` reports that it comes after the current heading;
    otherwise its text is appended to the current entry. Every other
    non-empty line is appended to the current entry as well.

    Args:
        text: The section text to parse.

    Returns:
        A dict mapping each heading number (as matched, e.g. ``"1.1"`` or
        ``"1."``) to its content with surrounding whitespace stripped and all
        ASCII spaces removed. A repeated heading number overwrites the
        earlier entry.
    """
    keywords = ['包含', '以下']
    data = {}
    current_key = None
    current_content = []
    append_newline = False

    def _flush():
        # Store the collected chunks for the open entry, without spaces.
        data[current_key] = ''.join(current_content).strip().replace(' ', '')

    for line in text.split('\n'):
        line_stripped = line.strip()
        # Multi-level heading such as '1.1' or '2.2.3' (at least one dot).
        # re.match anchors at position 0, where nothing precedes the number,
        # so the negative look-behind cannot reject anything here.
        match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            # Fallback: single-level heading such as '1. xxx'.
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('.')
            # The original code also tested `current_content[-1][-1] != ''`.
            # That is always true for a non-empty string and raised IndexError
            # when the last chunk was '' (e.g. a bare '1.1' line directly
            # followed by another heading), so the test has been dropped.
            if current_key is None or compare_headings(current_key, new_key):
                if current_key is not None:
                    _flush()
                current_key = new_key
                current_content = [line_content]
                # A line break is pending only for keys that split into
                # exactly two parts on '.', i.e. 'x.y' and also 'x.'.
                append_newline = len(new_key.split('.')) == 2
            else:
                # Numbered line that does not advance the heading: treat its
                # text as a continuation of the current entry.
                append_newline = handle_content_append(
                    current_content, line_content, append_newline, keywords)
        elif line_stripped:
            append_newline = handle_content_append(
                current_content, line_stripped, append_newline, keywords)

    if current_key is not None:
        # Save the final entry.
        _flush()
    return data
def convert_to_json(file_path, start_word, end_phrases):
    """Read a .docx or .pdf file and parse one section of it by heading.

    Args:
        file_path: Path of the document; the extension selects the extractor.
        start_word: Keyword marking the start of the section to parse.
        end_phrases: Patterns marking the end of the section.

    Returns:
        The dictionary produced by ``parse_text_by_heading`` for the section.

    Raises:
        ValueError: If ``file_path`` ends with neither ``.docx`` nor ``.pdf``.
    """
    if file_path.endswith('.docx'):
        extractor = extract_text_from_docx
    elif file_path.endswith('.pdf'):
        extractor = extract_text_from_pdf
    else:
        raise ValueError("Unsupported file format")

    # Keep only the text from start_word up to the first end phrase, then parse it.
    section = extract_section(extractor(file_path), start_word, end_phrases)
    return parse_text_by_heading(section)
def convert_clause_to_json(input_path,output_folder):
    """Parse the clause section of a document and save it as ``clause.json``.

    The section starts at the keyword ``投标人须知正文`` and the end patterns
    below are passed to ``convert_to_json`` as its end phrases.

    Args:
        input_path: Path of the source .docx or .pdf file.
        output_folder: Directory in which ``clause.json`` is written.

    Returns:
        The path of the written JSON file.
    """
    section_end_patterns = [
        r'^第[一二三四五六七八九十]+章\s+评标办法',
        r'^评标办法前附表',
        # Variants with a full-width colon.
        r'^附录:',
        r'^附录一:',
        r'^附件:',
        r'^附件一:',
        r'^附表:',
        r'^附表一:',
        # Variants with an ASCII colon.
        r'^附录:',
        r'^附录一:',
        r'^附件:',
        r'^附件一:',
        r'^附表:',
        r'^附表一:',
    ]
    clauses = convert_to_json(input_path, "投标人须知正文", section_end_patterns)
    destination = os.path.join(output_folder, "clause.json")
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(clauses, handle, indent=4, ensure_ascii=False)
    return destination
if __name__ == "__main__":
    # Manual run against a local sample file; adjust both paths before use.
    # The start keyword and end phrases are defined inside
    # convert_clause_to_json, so they are not repeated here.
    file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\48e650ff-70eb-48df-874c-66c8abbcd89d\\ztbfile_tobidders_notice.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        # Raised by the conversion when the file extension is not supported.
        print("Error:", e)