zbparse/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
import json
import docx
import re
import os
import fitz
from PyPDF2 import PdfReader
from flask_app.货物标.货物标截取pdf import clean_page_content, extract_common_header


def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return '\n'.join([para.text for para in doc.paragraphs])


# PyPDF2 version
def extract_text_from_pdf(file_path, start_word, end_pattern):
    # Extract the relevant text span from the PDF file
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    all_pages_text = []
    start_index = None
    # Process every page
    for i, page in enumerate(pdf_document.pages):
        page_text = page.extract_text() if page.extract_text() else ""
        cleaned_text = clean_page_content(page_text, common_header)
        # Look for the start position on the first page
        if i == 0 and start_index is None:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                start_index = start_match.start()
                cleaned_text = cleaned_text[start_index:]
        # Look for the end position on the last page
        if i == len(pdf_document.pages) - 1:
            matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
            if matches:
                end_index = matches[-1].start()
                cleaned_text = cleaned_text[:end_index]
        all_pages_text.append(cleaned_text)
    # Join the text of all pages
    full_text = "\n".join(all_pages_text)
    return full_text
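

# Minimal usage sketch (illustrative; the file name and patterns below are
# hypothetical -- the real patterns are built in convert_clause_to_json):
#   text = extract_text_from_pdf(
#       'sample_tobidders_notice_part2.pdf',
#       start_word=r'^\s*一\s*、\s*总\s*则',
#       end_pattern=r'^第[一二三四五六七八九十百千]+(?:章|部分)',
#   )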


# fitz version
# def extract_text_from_pdf(file_path, start_word, end_pattern):
#     # Extract the relevant text span from the PDF file
#     common_header = extract_common_header(file_path)
#     doc = fitz.open(file_path)
#     all_pages_text = []
#     start_index = None
#
#     # Process every page
#     for i in range(len(doc)):
#         page = doc[i]
#         page_text = page.get_text()
#         cleaned_text = clean_page_content(page_text, common_header)
#         print(cleaned_text)
#         print("yes")
#         # Look for the start position on the first page
#         if i == 0 and start_index is None:
#             start_match = re.search(start_word, cleaned_text, re.MULTILINE)
#             if start_match:
#                 start_index = start_match.start()
#                 cleaned_text = cleaned_text[start_index:]
#
#         # Look for the end position on the last page (here end_pattern is a list)
#         if i == len(doc) - 1:
#             for pattern in end_pattern:
#                 matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
#                 if matches:
#                     end_index = matches[-1].start()
#                     cleaned_text = cleaned_text[:end_index]
#                     break
#
#         all_pages_text.append(cleaned_text)
#
#     # Join the text of all pages
#     full_text = "\n".join(all_pages_text)
#     # Close the document
#     doc.close()
#
#     return full_text


def compare_headings(current, new):
    # Keep only the parts that are non-empty and purely numeric
    current_nums = [int(num) for num in current.split('.') if num.isdigit()]
    new_nums = [int(num) for num in new.split('.') if num.isdigit()]
    # Compare the numeric sequences to determine the heading order
    for c, n in zip(current_nums, new_nums):
        if n > c:
            return True
        elif n < c:
            return False
    # If the new heading has more levels, treat it as a new sub-section
    return len(new_nums) > len(current_nums)
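

# Examples (illustrative):
#   compare_headings('3.1', '3.2')   -> True   (next sibling heading)
#   compare_headings('3.2', '3.1')   -> False  (goes backwards, likely body text)
#   compare_headings('3.1', '3.1.1') -> True   (deeper level => new sub-section)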


def should_add_newline(content, keywords, max_length=20):
    content_str = ''.join(content).strip()
    return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length


def handle_content_append(current_content, line_content, append_newline, keywords):
    if append_newline:
        if should_add_newline(current_content, keywords):
            current_content.append('\n')  # insert the newline
        append_newline = False
    current_content.append(line_content)
    return append_newline
"""
保存换行符的具体逻辑:
对于二级标题(如 1.1),如果其后的内容包含关键词或内容较短(<=20字符会在内容前添加一个换行符。
这个换行符会被保留在 current_content 列表中。
当处理下一个标题时,之前的内容(包括可能存在的换行符)会被合并并保存到 data 字典中。
解决了''''这类标题出现在正文中的情况。但是目前的逻辑是如果''已有了,就加入正文,否则''作为新的标题。
"""


# Main JSON-extraction routine
def parse_text_by_heading(text):
    keywords = ['包含', '以下']
    data = {}
    current_key = None
    current_key_chinese = None
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None

    lines = text.split('\n')
    for i, line in enumerate(lines):
        line_stripped = line.strip().replace('.', '.')  # normalize full-width dots to '.'
        match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
        # Also handle lines that start with a bare dot, e.g. ".2 xxx"
        dot_match = re.match(r'^\.(\d+)\s*(.+)$', line_stripped)
        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('..、,')
            if any(keyword in line_content for keyword in keywords):
                skip_subheadings = True
            else:
                skip_subheadings = False
            # Flush any pending Chinese-heading content
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
                current_key_chinese = None
            # Flush the content of the previous Arabic-numbered heading
            if current_key is None or (compare_headings(current_key, new_key) and (
                    len(current_content) == 0 or current_content[-1][-1] != '第')):
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                # Only headings up to two levels deep (e.g. 1. or 1.1) may get a newline
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            # if current_key is None or (current_key != new_key and (
            #         len(current_content) == 0 or current_content[-1][-1] != '第')):
            #     if current_key is not None:
            #         content_string = ''.join(current_content).strip()
            #         data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
            #     current_key = new_key
            #     current_content = [line_content]
            #     append_newline = len(new_key.rstrip('.').split('.')) <= 2
            #     last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        elif dot_match:
            # Line starts with a dot: rebuild the key from the last main number
            sub_number, line_content = dot_match.groups()
            if last_main_number:
                new_key = f"{last_main_number}.{sub_number}"
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        else:
            if not skip_subheadings:
                # Combined matching for Chinese, letter, and Arabic-numbered headings
                pattern_title = re.compile(
                    r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
                chinese_match = pattern_title.match(line_stripped)
                letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
                arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
                if chinese_match or letter_match or arabic_match:
                    # Flush the previous key's content (Chinese or Arabic-numbered)
                    if current_key is not None:
                        content_string = ''.join(current_content).strip()
                        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                    if current_key_chinese is not None:
                        data[current_key_chinese] = current_value_chinese
                        current_key_chinese = None
                    # Chinese heading, e.g. "(一)" or "一、"
                    if chinese_match:
                        current_key_chinese = chinese_match.group(1) or chinese_match.group(2)
                        current_value_chinese = line_stripped[chinese_match.end():].lstrip('..、,').replace(' ', '')
                        if current_key_chinese in data:
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
                    # Letter heading, e.g. "A. xxx", mapped onto a Chinese numeral key
                    elif letter_match:
                        letter_key, letter_value = letter_match.groups()
                        letter_to_chinese = {
                            'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
                            'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
                            'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                        }
                        current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                        current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
                        if current_key_chinese in data:
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
                    # Arabic heading with a Chinese enumeration comma, e.g. "1、xxx"
                    elif arabic_match:
                        arabic_key, arabic_value = arabic_match.groups()
                        current_key = arabic_key.replace('、', '.')
                        current_content = [arabic_value]
                        append_newline = True
                        last_main_number = current_key.rstrip('.')
                    continue
            # No heading matched: append the line to the current body content
            if line_stripped:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)

    # Finally, flush the content of the last key
    if current_key is not None:
        content_string = ''.join(current_content).strip()
        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese
    return data
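

# Illustrative input/output sketch (assumed sample text, hypothetical values):
#   text = "1.总则\n1.1 说明\n本须知适用于本项目。"
#   parse_text_by_heading(text)
#   -> {'1.': '总则', '1.1': '说明\n本须知适用于本项目。'}
# The short heading text '说明' (<= 20 chars) triggers the preserved newline before
# the body line that follows it.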


# For type=2, this logic extracts Chapter One (the tender announcement) of a goods tender
def parse_text_to_dict(text):
    """
    Parse the text into a dict keyed by its top-level headings.

    Args:
        text (str): the text to parse.

    Returns:
        dict: top-level headings as keys, their body text as values.
    """
    # Regex: a line starting with Chinese numerals (一 to 十), followed by 、 and any text
    pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE)
    # Find the position of every top-level heading
    matches = list(pattern.finditer(text))
    result = {}
    for i, match in enumerate(matches):
        title = match.group(1).strip()  # the heading text
        start = match.end()  # where the body starts
        if i + 1 < len(matches):
            end = matches[i + 1].start()  # up to the next heading
        else:
            end = len(text)  # the last heading's body runs to the end of the text
        content = text[start:end].strip()  # body text without surrounding whitespace
        # Normalize line endings and strip per-line leading/trailing whitespace
        content = content.replace('\r\n', '\n')  # unify newlines
        content = re.sub(r'[ \t]+\n', '\n', content)  # trailing whitespace
        content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE)  # leading/trailing whitespace
        content = clean_content(content)  # handle newlines inside the body
        result[title] = content
    return result
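

# Illustrative (hypothetical announcement text): a chapter such as
#   "一、项目名称\n某某采购项目\n二、采购需求\n..."
# becomes {'一、项目名称': '某某采购项目', '二、采购需求': '...'}.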


def clean_content(content):
    """
    Handle newlines inside body text:
    - Keep a newline that precedes a sub-item number.
    - Keep the first newline before a colon ':' or full-width colon ':'.
    - Remove newlines everywhere else, without leaving extra spaces.

    Args:
        content (str): the content string to process.

    Returns:
        str: the processed content string.
    """
    # Sub-item numbering patterns, covering:
    # - digits joined by dots, e.g. 1.1 or 1.1
    # - digits followed by 、, e.g. 2、
    # - a dot followed by digits, e.g. .3 or .3
    # - digits followed by a right parenthesis, e.g. 1) or 1)
    # - digits wrapped in parentheses, e.g. (5)
    # - digits wrapped in full-width parentheses, e.g. (5)
    # - digits followed by a dot, e.g. 1. or 1.
    numbering_pattern = r'(?:\d+[..]\d+(?:[..]\d+)*|\d+、|[..]\d+|\d+[))]|\(\d+\)|(\d+)|\d+[..])'
    # Keep a newline when:
    # 1. it is followed by a sub-item number, or
    # 2. it is followed by non-whitespace text ending in ':' or ':'
    pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^::\n\r\f\v]+[::]))'
    # Placeholder used to protect the newlines we want to keep
    placeholder = "___PLACEHOLDER___"
    # Step 1: replace the newlines to keep with the placeholder
    content_with_placeholder = re.sub(pattern_keep, placeholder, content)
    # Step 2: remove every remaining newline
    content_no_newlines = content_with_placeholder.replace('\n', '')
    # Step 3: restore the protected newlines
    cleaned_content = content_no_newlines.replace(placeholder, '\n')
    return cleaned_content
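

# Example (illustrative, hypothetical input):
#   raw = "报价要求:\n1、含税\n运输费用\n2、质保两年"
#   clean_content(raw) -> "报价要求:\n1、含税运输费用\n2、质保两年"
# The newline inside item 1 is removed; the newlines before "1、" and "2、" survive.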


def convert_clause_to_json(file_path, output_folder, type=1, suffix_counter="1.json"):
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
        start_word = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、..]*\s*(说\s*明|总\s*则)'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_by_heading(text)
    else:
        start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_to_dict(text)
    # result = convert_to_json(input_path, start_word, end_pattern)
    # Create the output folder if it does not exist yet
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    # file_name = f"clause{suffix_counter}.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
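

# Illustrative matches for the patterns above (assumed document layout):
#   type=1 start_word matches lines like "(一)总则" or "1. 说 明";
#   type=1 end_pattern matches the next chapter heading, e.g. "第三章 评标办法";
#   type=2 cuts from the announcement chapter to the bidder-instructions chapter.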


def process_folder(input_folder, output_folder):
    # List the files in the input folder, keeping only those ending in 'part2.pdf'
    files = [f for f in os.listdir(input_folder)
             if os.path.isfile(os.path.join(input_folder, f)) and f.endswith('part2.pdf')]
    # Process each file in turn
    for file_name in files:
        file_path = os.path.join(input_folder, file_name)
        # Drop the file extension
        file_name_without_extension = os.path.splitext(file_name)[0]
        try:
            # Call convert_clause_to_json with the file path, output folder, and per-file suffix
            output_path = convert_clause_to_json(file_path, output_folder, 1, file_name_without_extension)
            print(f"Processed file: {file_name}, JSON saved to: {output_path}")
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")


# TODO: PyPDF2 misses some text when reading 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf'
# TODO: folder 911904c3-4d6e-47e0-acd8-b05e582209cb fails to run; the Bailian call errored
# TODO: in the bidder-instructions body the numbering may be out of order; consider dropping the
#       heading-order comparison and treating any leading number as a new key, e.g. the
#       "2-招标文件" document. For now this case is handled as a special case.
if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='D:\\flask_project\\flask_app\\static\\output\\87f48f9c-e6ee-4dc1-a981-5a10085c4635\\ztbfile_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
output_folder = 'D:\\flask_project\\flask_app\\static\\output\\87f48f9c-e6ee-4dc1-a981-5a10085c4635\\tmp'
try:
output_path = convert_clause_to_json(file_path,output_folder,1)
print(f"Final JSON result saved to: {output_path}")
except ValueError as e:
print("Error:", e)
# input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
#
# # 调用 process_folder 来处理整个文件夹中的所有文件
# process_folder(input_folder, output_folder)