zbparse/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py

314 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import docx
import re
import os
import fitz
from PyPDF2 import PdfReader
from flask_app.货物标.货物标截取pdf import clean_page_content,extract_common_header
def extract_text_from_docx(file_path):
doc = docx.Document(file_path)
return '\n'.join([para.text for para in doc.paragraphs])
#PYPDF2版本
def extract_text_from_pdf(file_path, start_word, end_pattern):
# 从PDF文件中提取文本
common_header = extract_common_header(file_path)
pdf_document = PdfReader(file_path)
all_pages_text = []
start_index = None
# 处理所有页面
for i, page in enumerate(pdf_document.pages):
page_text = page.extract_text() if page.extract_text() else ""
cleaned_text = clean_page_content(page_text, common_header)
# 在第一页查找开始位置
if i == 0 and start_index is None:
start_match = re.search(start_word, cleaned_text, re.MULTILINE)
if start_match:
start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
# 在最后一页查找结束位置
if i == len(pdf_document.pages) - 1:
for pattern in end_pattern:
matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
break
all_pages_text.append(cleaned_text)
# 合并所有页面的文本
full_text = "\n".join(all_pages_text)
# print(full_text)
return full_text
#fitz库版本
# def extract_text_from_pdf(file_path, start_word, end_pattern):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# doc = fitz.open(file_path)
# all_pages_text = []
# start_index = None
#
# # 处理所有页面
# for i in range(len(doc)):
# page = doc[i]
# page_text = page.get_text()
# cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# print("yes")
# # 在第一页查找开始位置
# if i == 0 and start_index is None:
# start_match = re.search(start_word, cleaned_text, re.MULTILINE)
# if start_match:
# start_index = start_match.start()
# cleaned_text = cleaned_text[start_index:]
#
# # 在最后一页查找结束位置
# if i == len(doc) - 1:
# for pattern in end_pattern:
# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
# if matches:
# end_index = matches[-1].start()
# cleaned_text = cleaned_text[:end_index]
# break
#
# all_pages_text.append(cleaned_text)
#
# # 合并所有页面的文本
# full_text = "\n".join(all_pages_text)
# # 关闭文档
# doc.close()
#
# return full_text
def chinese_to_arabic(chinese_num):
cn_num = {
'': 1, '': 2, '': 3, '': 4, '': 5,
'': 6, '': 7, '': 8, '': 9, '': 10,
'十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
}
return cn_num.get(chinese_num, 0)
def compare_headings(current, new):
# 使用过滤来确保只处理非空且为数字的部分
current_nums = [int(num) for num in current.split('.') if num.isdigit()]
new_nums = [int(num) for num in new.split('.') if num.isdigit()]
# 比较数字序列以确定标题的层次关系
for c, n in zip(current_nums, new_nums):
if n > c:
return True
elif n < c:
return False
# 如果新标题有更多层次,认为是新的子章节
return len(new_nums) > len(current_nums)
def should_add_newline(content, keywords, max_length=20):
content_str = ''.join(content).strip()
return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords):
if append_newline:
if should_add_newline(current_content, keywords):
current_content.append('\n') # 添加换行符
append_newline = False
current_content.append(line_content)
return append_newline
"""
保存换行符的具体逻辑:
对于二级标题(如 1.1),如果其后的内容包含关键词或内容较短(<=20字符会在内容前添加一个换行符。
这个换行符会被保留在 current_content 列表中。
当处理下一个标题时,之前的内容(包括可能存在的换行符)会被合并并保存到 data 字典中。
解决了''''这类标题出现在正文中的情况。但是目前的逻辑是如果''已有了,就加入正文,否则''作为新的标题。
"""
#提取json主函数
def parse_text_by_heading(text):
keywords = ['包含', '以下']
data = {}
current_key = None
current_key_chinese = None
current_value_chinese = None
current_content = []
append_newline = False
skip_subheadings = False
lines = text.split('\n')
for i, line in enumerate(lines):
line_stripped = line.strip().replace('', '.')
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match:
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
if match:
new_key, line_content = match.groups()
line_content = line_content.lstrip('..、,')
if any(keyword in line_content for keyword in keywords):
skip_subheadings = True
else:
skip_subheadings = False
# 保存中文标题的内容
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
current_key_chinese = None
# 保存阿拉伯数字的标题内容
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')):
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
current_key = new_key
current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2
else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
else:
if not skip_subheadings:
# 合并了中文标题、字母标题、阿拉伯数字标题的匹配逻辑
pattern_title = re.compile(r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
chinese_match = pattern_title.match(line_stripped)
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
# 优化了处理逻辑,减少冗余
if chinese_match or letter_match or arabic_match:
# 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题)
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
current_key_chinese = None
# 处理中文标题
if chinese_match:
current_key_chinese = chinese_match.group(1) or chinese_match.group(2)
current_value_chinese = line_stripped[chinese_match.end():].lstrip('..、,')
if current_key_chinese in data:
handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
current_key_chinese = None
# 处理字母标题
elif letter_match:
letter_key, letter_value = letter_match.groups()
letter_to_chinese = {
'A': '', 'B': '', 'C': '', 'D': '', 'E': '',
'F': '', 'G': '', 'H': '', 'I': '', 'J': '',
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
}
current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
current_value_chinese = letter_value.lstrip('..、,')
if current_key_chinese in data:
handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
current_key_chinese = None
# 处理阿拉伯数字标题
elif arabic_match:
arabic_key, arabic_value = arabic_match.groups()
current_key = arabic_key.replace('', '.')
current_content = [arabic_value]
append_newline = True
continue
# 如果没有匹配标题,继续处理正文内容
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
# 最后保存最后一个 key 对应的内容
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
if current_key_chinese is not None:
data[current_key_chinese] = current_value_chinese
return data
# def convert_to_json(file_path, start_word, end_phrases):
# if file_path.endswith('.docx'):
# text = extract_text_from_docx(file_path)
# elif file_path.endswith('.pdf'):
# text = extract_text_from_pdf(file_path,start_word,end_phrases)
# else:
# raise ValueError("Unsupported file format")
# # print(text)
# parsed_data = parse_text_by_heading(text)
# return parsed_data
def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"):
if not os.path.exists(file_path):
print(f"The specified file does not exist: {file_path}")
return ""
if type == 1:
start_word = r'^\s*[(]?\s*[一1]\s*[)]?\s*[、.]*\s*(说\s*明|总\s*则)'
end_pattern = [r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+']
else:
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
end_pattern=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
text = extract_text_from_pdf(file_path, start_word, end_pattern)
result = parse_text_by_heading(text)
# result = convert_to_json(input_path, start_word, end_pattern)
# 检查输出文件夹是否存在,如果不存在则创建
if not os.path.exists(output_folder):
os.makedirs(output_folder)
print(f"Created output folder: {output_folder}")
file_name = "clause1.json" if type == 1 else "clause2.json"
# file_name = f"clause{suffix_counter}.json"
output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False)
print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
return output_path
def process_folder(input_folder, output_folder):
# 获取输入文件夹中的所有文件,过滤掉不以 'part2' 结尾的文件
files = [f for f in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, f)) and f.endswith('part2.pdf')]
# 初始化后缀计数器
suffix_counter = 1
# 遍历文件并对每个文件进行处理
for file_name in files:
file_path = os.path.join(input_folder, file_name)
suffix = str(suffix_counter) # 后缀为递增的数字
try:
# 调用 convert_clause_to_json传递文件路径、输出文件夹和递增的后缀
output_path = convert_clause_to_json(file_path, output_folder, 1, suffix)
print(f"Processed file: {file_name}, JSON saved to: {output_path}")
except ValueError as e:
print(f"Error processing {file_name}: {e}")
# 后缀递增
suffix_counter += 1
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
#TODO:标题'一'应该越来越大,与'1.1'的逻辑分开'
#TODO:911904c3-4d6e-47e0-acd8-b05e582209cb文件夹运行有问题百炼出错了
if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2'
try:
output_path = convert_clause_to_json(file_path,output_folder)
print(f"Final JSON result saved to: {output_path}")
except ValueError as e:
print("Error:", e)
# input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
#
# # 调用 process_folder 来处理整个文件夹中的所有文件
# process_folder(input_folder, output_folder)