import json
import os
import re

import docx
import fitz  # PyMuPDF; only needed by the commented-out extractor variant below
from PyPDF2 import PdfReader

from flask_app.货物标.货物标截取pdf import clean_page_content, extract_common_header


def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return '\n'.join([para.text for para in doc.paragraphs])


# PyPDF2 version
def extract_text_from_pdf(file_path, start_word, end_pattern):
    # Extract text from the PDF file
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    all_pages_text = []
    start_index = None
    # Process every page
    for i, page in enumerate(pdf_document.pages):
        # Extract once; image-only pages yield None, so fall back to an empty string
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        # Look for the start position on the first page
        if i == 0 and start_index is None:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                start_index = start_match.start()
                cleaned_text = cleaned_text[start_index:]

        # Look for the end position on the last page
        if i == len(pdf_document.pages) - 1:
            matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
            if matches:
                end_index = matches[-1].start()
                cleaned_text = cleaned_text[:end_index]

        all_pages_text.append(cleaned_text)

    # Merge the text of all pages
    full_text = "\n".join(all_pages_text)
    return full_text
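
# Usage sketch (hypothetical path and patterns): slice everything from the
# '总则' heading on the first page up to the next chapter heading on the last
# page. Both arguments are ordinary `re` patterns applied with re.MULTILINE.
# body = extract_text_from_pdf(
#     'sample_tobidders_notice_part2.pdf',            # hypothetical file
#     start_word=r'^\s*[一1]\s*[、.]?\s*总\s*则',
#     end_pattern=r'^第[一二三四五六七八九十]+(?:章|部分)',
# )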


# fitz (PyMuPDF) version
# Note: unlike the PyPDF2 version above, this variant expects end_pattern to be
# an iterable of patterns and stops at the first one that matches.
# def extract_text_from_pdf(file_path, start_word, end_pattern):
#     # Extract text from the PDF file
#     common_header = extract_common_header(file_path)
#     doc = fitz.open(file_path)
#     all_pages_text = []
#     start_index = None
#
#     # Process every page
#     for i in range(len(doc)):
#         page = doc[i]
#         page_text = page.get_text()
#         cleaned_text = clean_page_content(page_text, common_header)
#
#         # Look for the start position on the first page
#         if i == 0 and start_index is None:
#             start_match = re.search(start_word, cleaned_text, re.MULTILINE)
#             if start_match:
#                 start_index = start_match.start()
#                 cleaned_text = cleaned_text[start_index:]
#
#         # Look for the end position on the last page
#         if i == len(doc) - 1:
#             for pattern in end_pattern:
#                 matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
#                 if matches:
#                     end_index = matches[-1].start()
#                     cleaned_text = cleaned_text[:end_index]
#                     break
#
#         all_pages_text.append(cleaned_text)
#
#     # Merge the text of all pages
#     full_text = "\n".join(all_pages_text)
#     # Close the document
#     doc.close()
#
#     return full_text


def compare_headings(current, new):
    # Filter so that only non-empty, purely numeric parts are processed
    current_nums = [int(num) for num in current.split('.') if num.isdigit()]
    new_nums = [int(num) for num in new.split('.') if num.isdigit()]

    # Compare the numeric sequences to determine the heading order
    for c, n in zip(current_nums, new_nums):
        if n > c:
            return True
        elif n < c:
            return False

    # If the new heading has more levels, treat it as a new sub-section
    return len(new_nums) > len(current_nums)
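
# A few sanity checks (hypothetical headings) of the ordering logic:
# compare_headings('1.1', '1.2')    -> True  (same depth, later sibling)
# compare_headings('2.3', '2.1')    -> False (numbering went backwards)
# compare_headings('1.1', '1.1.1')  -> True  (deeper level counts as a new sub-section)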


def should_add_newline(content, keywords, max_length=20):
    content_str = ''.join(content).strip()
    return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length


def handle_content_append(current_content, line_content, append_newline, keywords):
    if append_newline:
        if should_add_newline(current_content, keywords):
            current_content.append('\n')  # insert the separating newline
        append_newline = False
    current_content.append(line_content)
    return append_newline
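
# Sketch of the one-shot newline flag (hypothetical data): the first body line
# after a heading gets a preceding '\n' when the accumulated content is short
# (<= 20 chars) or contains a keyword; the flag is then consumed either way.
# content = ['定义']
# flag = handle_content_append(content, '招标人:指本项目采购人。', True, ['包含', '以下'])
# # content == ['定义', '\n', '招标人:指本项目采购人。'] and flag == False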


"""
How newlines are preserved:

For a second-level heading (e.g. 1.1), if the content that follows contains a
keyword or is short (<= 20 characters), a newline is added before the content.
That newline is kept in the current_content list. When the next heading is
processed, the previous content (including any such newline) is joined and
saved into the data dict.

This also handles headings such as '一' and '二' appearing inside body text.
The current logic is: if '一' already exists as a key, the line is appended to
the body text; otherwise '一' becomes a new heading.
"""


# Main routine: parse the clause text into a JSON-ready dict
def parse_text_by_heading(text):
    # When a numbered heading contains one of these keywords, the sub-headings
    # that follow are treated as body text rather than new headings
    keywords = ['包含', '以下']
    data = {}
    current_key = None
    current_key_chinese = None
    current_value_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None

    lines = text.split('\n')
    for i, line in enumerate(lines):
        line_stripped = line.strip().replace('.', '.')
        match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)

        # Handle lines that start with a dot (e.g. '.3 xxx')
        dot_match = re.match(r'^\.(\d+)\s*(.+)$', line_stripped)

        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('..、,')

            if any(keyword in line_content for keyword in keywords):
                skip_subheadings = True
            else:
                skip_subheadings = False

            # Save the content of a pending Chinese heading
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
                current_key_chinese = None

            # Save the content of the pending Arabic-numeral heading
            if current_key is None or (compare_headings(current_key, new_key) and (
                    len(current_content) == 0 or current_content[-1][-1] != '第')):
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        elif dot_match:
            # A line starting with a dot: attach it to the last main number
            sub_number, line_content = dot_match.groups()
            if last_main_number:
                new_key = f"{last_main_number}.{sub_number}"
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        else:
            if not skip_subheadings:
                # Combined matching for Chinese, letter and Arabic-numeral headings
                pattern_title = re.compile(
                    r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
                chinese_match = pattern_title.match(line_stripped)
                letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
                arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)

                if chinese_match or letter_match or arabic_match:
                    # Save the previous key's content (whether the previous
                    # heading was Chinese or Arabic-numeral)
                    if current_key is not None:
                        content_string = ''.join(current_content).strip()
                        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                    if current_key_chinese is not None:
                        data[current_key_chinese] = current_value_chinese
                        current_key_chinese = None

                    # Chinese heading (e.g. '(一)' or '一、')
                    if chinese_match:
                        current_key_chinese = chinese_match.group(1) or chinese_match.group(2)
                        current_value_chinese = line_stripped[chinese_match.end():].lstrip('..、,').replace(' ', '')
                        if current_key_chinese in data:
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
                    # Letter heading (A. / B. ...), mapped to Chinese numerals
                    elif letter_match:
                        letter_key, letter_value = letter_match.groups()
                        letter_to_chinese = {
                            'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
                            'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
                            'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                        }
                        current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
                        current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
                        if current_key_chinese in data:
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
                    # Arabic-numeral heading with 、 (e.g. '1、')
                    elif arabic_match:
                        arabic_key, arabic_value = arabic_match.groups()
                        current_key = arabic_key.replace('、', '.')
                        current_content = [arabic_value]
                        append_newline = True
                        last_main_number = current_key.rstrip('.')
                    continue

            # No heading matched: treat the line as body text
            if line_stripped:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)

    # Finally, save the content of the last key
    if current_key is not None:
        content_string = ''.join(current_content).strip()
        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
    if current_key_chinese is not None:
        data[current_key_chinese] = current_value_chinese

    return data
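
# Illustrative run (hypothetical text) showing the expected shape of the result:
# sample = "1.1总则\n本须知适用于本项目。\n1.2定义\n招标人:指本项目采购人。"
# parse_text_by_heading(sample)
# # -> {'1.1': '总则\n本须知适用于本项目。', '1.2': '定义\n招标人:指本项目采购人。'}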


# Used when type=2 to extract chapter 1 (the tender announcement) of a goods tender
def parse_text_to_dict(text):
    """
    Parse the text into a dict keyed by top-level headings.

    Args:
        text (str): the text to parse.

    Returns:
        dict: top-level heading as key, section content as value.
    """
    # Pattern: a Chinese numeral (一..十) at the start of a line, followed by 、 and anything
    pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE)

    # Find the position of every top-level heading
    matches = list(pattern.finditer(text))

    result = {}
    for i, match in enumerate(matches):
        title = match.group(1).strip()  # heading text
        start = match.end()  # where this section's content starts

        if i + 1 < len(matches):
            end = matches[i + 1].start()  # up to the next top-level heading
        else:
            end = len(text)  # last heading: content runs to the end of the text

        content = text[start:end].strip()
        # Normalise newlines and trim whitespace around every line
        content = content.replace('\r\n', '\n')  # unify line endings
        content = re.sub(r'[ \t]+\n', '\n', content)  # strip trailing whitespace
        content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE)  # strip leading/trailing whitespace
        content = clean_content(content)  # collapse newlines inside the content
        result[title] = content

    return result
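
# Illustrative run (hypothetical announcement text):
# sample = "一、项目基本情况\n项目编号:XYZ-2024\n二、申请人的资格要求\n符合政府采购法第二十二条规定。"
# parse_text_to_dict(sample)
# # -> {'一、项目基本情况': '项目编号:XYZ-2024',
# #     '二、申请人的资格要求': '符合政府采购法第二十二条规定。'}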


def clean_content(content):
    """
    Normalise newlines inside a section:
    - keep a newline that precedes a sub-item number;
    - keep the first newline before a ':' or full-width ':';
    - remove every other newline without leaving extra spaces.

    Args:
        content (str): the content string to process.

    Returns:
        str: the processed content string.
    """
    # Sub-item numbering pattern, covering:
    # - number.number (e.g. 1.1 or 1.1)
    # - number + 、 (e.g. 2、)
    # - dot + number (e.g. .3 or .3)
    # - number + closing paren (e.g. 1) or 1))
    # - number in parentheses (e.g. (5))
    # - number in full-width parentheses (e.g. (5))
    # - number + dot (e.g. 1. or 1.)
    numbering_pattern = r'(?:\d+[..]\d+(?:[..]\d+)*|\d+、|[..]\d+|\d+[))]|\(\d+\)|(\d+)|\d+[..])'

    # Keep a newline when:
    # 1. it is followed by a sub-item number, or
    # 2. it is followed by non-colon text that leads up to ':' or ':'
    pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^:\n\r\f\v]+[::]))'

    # Placeholder used to protect the newlines we want to keep
    placeholder = "___PLACEHOLDER___"

    # Step 1: replace the newlines to keep with the placeholder
    content_with_placeholder = re.sub(pattern_keep, placeholder, content)

    # Step 2: drop every remaining newline
    content_no_newlines = content_with_placeholder.replace('\n', '')

    # Step 3: restore the protected newlines
    cleaned_content = content_no_newlines.replace(placeholder, '\n')

    return cleaned_content
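
# Illustrative behaviour (hypothetical text): the newline before the colon line
# and before the numbered sub-item survive; the wrapped body line is joined.
# raw = "本章适用于本项目\n的全部内容。\n投标人:指响应招标的法人。\n1.1 定义"
# clean_content(raw)
# # -> '本章适用于本项目的全部内容。\n投标人:指响应招标的法人。\n1.1 定义'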


# def convert_to_json(file_path, start_word, end_phrases):
#     if file_path.endswith('.docx'):
#         text = extract_text_from_docx(file_path)
#     elif file_path.endswith('.pdf'):
#         text = extract_text_from_pdf(file_path, start_word, end_phrases)
#     else:
#         raise ValueError("Unsupported file format")
#     # print(text)
#     parsed_data = parse_text_by_heading(text)
#     return parsed_data


def convert_clause_to_json(file_path, output_folder, type=1, suffix_counter="1.json"):
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
        # Instructions-to-bidders body: from the '说明/总则' heading to the next chapter heading
        start_word = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_by_heading(text)
    else:
        # Tender announcement chapter: from the announcement heading to the bidder-instructions chapter
        start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_to_dict(text)

    # result = convert_to_json(input_path, start_word, end_pattern)
    # Create the output folder if it does not exist yet
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    # file_name = f"clause{suffix_counter}.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"Bidder-instructions clauses extracted to JSON: the data has been processed and saved to '{output_path}'.")
    return output_path
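
# Hypothetical calls showing the two modes (paths are placeholders):
# convert_clause_to_json('ztbfile_tobidders_notice_part2.pdf', 'out', type=1)  # numbered clauses -> clause1.json
# convert_clause_to_json('ztbfile_notice.pdf', 'out', type=2)                  # announcement sections -> clause2.json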


def process_folder(input_folder, output_folder):
    # Collect the files in the input folder, keeping only those ending with 'part2.pdf'
    files = [f for f in os.listdir(input_folder)
             if os.path.isfile(os.path.join(input_folder, f)) and f.endswith('part2.pdf')]

    # Process each file in turn
    for file_name in files:
        file_path = os.path.join(input_folder, file_name)
        # Drop the file extension
        file_name_without_extension = os.path.splitext(file_name)[0]
        try:
            # Call convert_clause_to_json with the file path, output folder and a per-file suffix
            output_path = convert_clause_to_json(file_path, output_folder, 1, file_name_without_extension)
            print(f"Processed file: {file_name}, JSON saved to: {output_path}")
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")


# TODO: PyPDF2 drops some text when reading 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf'
# TODO: Chinese headings like '一' should be monotonically increasing; separate that logic from the '1.1' numeric-heading handling
# TODO: running the 911904c3-4d6e-47e0-acd8-b05e582209cb folder fails; the Bailian (百炼) call errored

if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output5\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件_notice.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output5\\tmp2'
    try:
        output_path = convert_clause_to_json(file_path, output_folder, 2)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        print("Error:", e)

    # input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
    # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
    #
    # # Call process_folder to process every file in the folder
    # process_folder(input_folder, output_folder)