9.28
This commit is contained in:
parent
2c036d8504
commit
6637086547
@ -107,6 +107,6 @@ def extract_tables_main(path, output_folder):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\ztbfile_tobidders_notice_table.docx'
|
path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part1.docx'
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp" # 前附表json文件
|
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp" # 前附表json文件
|
||||||
extract_tables_main(path, output_folder)
|
extract_tables_main(path, output_folder)
|
||||||
|
@ -1,210 +0,0 @@
|
|||||||
import json
|
|
||||||
import docx
|
|
||||||
import re
|
|
||||||
import os
|
|
||||||
from PyPDF2 import PdfReader
|
|
||||||
from flask_app.main.截取pdf import clean_page_content,extract_common_header
|
|
||||||
|
|
||||||
def extract_text_from_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Only the paragraphs exposed by ``Document.paragraphs`` are read.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The paragraph texts joined with ``'\\n'``.
    """
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return '\n'.join(paragraph_texts)
|
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path):
    """Extract the cleaned text of every page of a PDF.

    Each page is passed through ``clean_page_content`` together with the
    header returned by ``extract_common_header`` and terminated with a
    newline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenation of all cleaned pages, each followed by ``'\\n'``.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)

    cleaned_pages = []
    for page in pdf_document.pages:
        # Run the extraction once per page; a falsy result (None / '') is
        # normalised to an empty string.
        page_text = page.extract_text() or ""
        cleaned_pages.append(clean_page_content(page_text, common_header))

    # Build the result in a single pass instead of repeated `+=`.
    return "".join(page_text + "\n" for page_text in cleaned_pages)
|
|
||||||
|
|
||||||
def extract_section(text, start_pattern, end_phrases):
    """Return the part of ``text`` between a start pattern and an end phrase.

    The section begins right after the first match of ``start_pattern``.
    The end phrases are tried in list order (not by position in the text):
    the first phrase that matches anywhere in the remaining text ends the
    section at the start of that match. Phrases are searched with
    ``re.MULTILINE``, so ``^`` anchors at line starts.

    Args:
        text: Full text to search.
        start_pattern: Regex marking where the section starts.
        end_phrases: Iterable of regexes, in priority order.

    Returns:
        The extracted section, the whole remainder when no end phrase
        matches, or ``""`` when ``start_pattern`` is not found.
    """
    start_match = re.search(start_pattern, text)
    if not start_match:
        return ""

    # Slice once; the remainder does not depend on the end phrase.
    remainder = text[start_match.end():]

    for phrase in end_phrases:
        end_match = re.search(phrase, remainder, flags=re.MULTILINE)
        if end_match:
            return remainder[:end_match.start()]

    return remainder
|
|
||||||
|
|
||||||
def compare_headings(current, new):
    """Tell whether heading number ``new`` comes after ``current``.

    Both headings are dotted numbers such as ``'1.'``, ``'2.3'`` or
    ``'2.3.1'``. Non-numeric and empty components (e.g. the empty string
    after a trailing dot) are ignored.

    Args:
        current: Heading number currently being collected.
        new: Candidate heading number found on a later line.

    Returns:
        True when ``new`` is greater at the first differing level, or when
        all shared levels are equal and ``new`` has more levels (a
        sub-heading of ``current``). False otherwise, including when both
        are equal.
    """
    current_nums = [int(part) for part in current.split('.') if part.isdigit()]
    new_nums = [int(part) for part in new.split('.') if part.isdigit()]

    # List comparison is lexicographic: the first differing element decides,
    # and a proper prefix sorts before the longer list.
    return new_nums > current_nums
|
|
||||||
|
|
||||||
|
|
||||||
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the collected content.

    Args:
        content: Iterable of string fragments collected so far.
        keywords: Substrings that force a line break when present.
        max_length: Joined content of at most this many characters also
            gets a line break.

    Returns:
        True when the joined, stripped content is short enough or contains
        any keyword; False otherwise.
    """
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    for keyword in keywords:
        if keyword in joined:
            return True
    return False
|
|
||||||
|
|
||||||
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append a line to the content buffer, optionally preceded by a newline.

    When ``append_newline`` is set and ``should_add_newline`` approves the
    content collected so far, a ``'\\n'`` fragment is inserted before the
    new line. The flag is consumed either way.

    Args:
        current_content: List of fragments; mutated in place.
        line_content: Text to append.
        append_newline: Whether a line break is still pending.
        keywords: Forwarded to ``should_add_newline``.

    Returns:
        The updated flag: False once a pending line break has been
        handled, otherwise the value passed in.
    """
    wants_break = append_newline and should_add_newline(current_content, keywords)
    if wants_break:
        current_content.append('\n')
    current_content.append(line_content)
    return False if append_newline else append_newline
|
|
||||||
|
|
||||||
#对二级标题如x.x进行额外处理:如果当前处理内容包含keywords中的内容,则必须保留换行符/如果当前内容字数大于20,不保留换行。
|
|
||||||
def parse_text_by_heading(text):
    """Split clause text into a ``{heading number: content}`` dictionary.

    A line starts a new entry when it begins with a dotted number
    (``'1.1'``, ``'2.2.3'``) or a single number followed by a dot
    (``'1.'``), the number sorts after the current heading (see
    ``compare_headings``), and the previously collected fragment does not
    end with '第' (in which case the number is treated as a continuation
    of the running text). All other non-empty lines are appended to the
    current entry. Spaces are removed from the stored content.

    Args:
        text: Text to parse, lines separated by ``'\\n'``.

    Returns:
        Dict mapping heading numbers to their joined content, in order of
        appearance.
    """
    keywords = ['包含', '以下']
    # Compiled once instead of once per line.
    multi_level = re.compile(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)')
    single_level = re.compile(r'^(\d+\.)\s*(.+)$')

    data = {}
    current_key = None
    current_content = []
    append_newline = False

    for line in text.split('\n'):
        line_stripped = line.strip()
        match = multi_level.match(line_stripped) or single_level.match(line_stripped)

        if not match:
            if line_stripped:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords)
            continue

        new_key, line_content = match.groups()
        line_content = line_content.lstrip('.')

        # endswith() is safe when the last fragment is an empty string (a
        # heading line without text); indexing [-1][-1] raised IndexError.
        continues_reference = bool(current_content) and current_content[-1].endswith('第')
        starts_new_entry = current_key is None or (
            compare_headings(current_key, new_key) and not continues_reference)

        if starts_new_entry:
            if current_key is not None:
                # Store the finished entry with spaces removed.
                data[current_key] = ''.join(current_content).strip().replace(' ', '')
            current_key = new_key
            current_content = [line_content]
            # True for keys that split into exactly two parts on '.',
            # i.e. '1.1' and also '1.' (trailing empty part).
            append_newline = len(new_key.split('.')) == 2
        else:
            append_newline = handle_content_append(
                current_content, line_content, append_newline, keywords)

    if current_key is not None:
        # Store the last entry.
        data[current_key] = ''.join(current_content).strip().replace(' ', '')

    return data
|
|
||||||
|
|
||||||
def convert_to_json(file_path, start_word, end_phrases):
    """Read a .docx or .pdf file and parse the selected section by heading.

    Args:
        file_path: Path of the document; the extension (matched
            case-insensitively) selects the text extractor.
        start_word: Regex marking the start of the section to keep.
        end_phrases: Regexes marking the end of the section.

    Returns:
        The dictionary produced by ``parse_text_by_heading``.

    Raises:
        ValueError: If the file is neither .docx nor .pdf.
    """
    # Compare on a lowered copy so '.PDF' / '.DOCX' are accepted as well.
    lowered_path = file_path.lower()
    if lowered_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    elif lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file format")

    # Keep only the text between start_word and the first end phrase.
    text = extract_section(text, start_word, end_phrases)
    return parse_text_by_heading(text)
|
|
||||||
|
|
||||||
def convert_clause_to_json(input_path, output_folder, type=1):
    """Extract clauses from a document and save them as a JSON file.

    Args:
        input_path: Path of the source .docx / .pdf file.
        output_folder: Folder receiving the JSON file; created if missing.
        type: 1 selects the "投标人须知正文" section and writes
            ``clause1.json``; any other value selects the tender
            announcement section and writes ``clause2.json``.

    Returns:
        Path of the written JSON file, or ``""`` when ``input_path`` does
        not exist.

    Raises:
        ValueError: Propagated from ``convert_to_json`` for unsupported
            file formats.
    """
    if not os.path.exists(input_path):
        print(f"The specified file does not exist: {input_path}")
        return ""

    if type == 1:
        start_word = "投标人须知正文"
        end_phrases = [
            r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
            r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']

    result = convert_to_json(input_path, start_word, end_phrases)

    # Without this, open() below fails when the folder does not exist yet.
    os.makedirs(output_folder, exist_ok=True)

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    # Rewrites the file in place, splitting over-long first-level entries.
    post_process_json(output_path)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
|
|
||||||
|
|
||||||
def post_process_json(json_file_path):
    """Split over-long first-level entries of a clause JSON file in place.

    An entry qualifies when its key is a first-level number such as
    ``'5.'`` and its value contains a newline. The text before the first
    newline is the title; it is stored under the original key and under
    ``'5.0'``. The rest is split at sub-numbers like ``'1.'`` or ``'1、'``
    and each piece is stored as ``'5.0.1'``, ``'5.0.2'``, ... All other
    entries are copied unchanged.

    NOTE(review): text that precedes the first sub-number in the body is
    not written to the output (same as before this rewrite) - confirm
    that this is intended.

    Args:
        json_file_path: Path of the JSON file to read and overwrite.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    processed_data = {}
    for key, value in data.items():
        if not (re.match(r'^\d+\.\s*$', key) and '\n' in value):
            # Nothing to split: keep the entry as it is.
            processed_data[key] = value
            continue

        title, content = value.split('\n', 1)
        title = title.strip()
        processed_data[key] = title

        # '5.' -> '5.0'
        sub_key = f"{key.rstrip('.')}.0"
        processed_data[sub_key] = title

        # With one capture group re.split yields
        # [leading, number, text, number, text, ...]; the texts sit at the
        # even indices starting from 2.
        pieces = re.split(r'(\d+[\.\、])\s*', content)
        for sub_count, sub_content in enumerate(pieces[2::2], start=1):
            processed_data[f"{sub_key}.{sub_count}"] = sub_content.strip()

    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(processed_data, file, ensure_ascii=False, indent=4)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Manual run against a local sample document; the paths below are
    # specific to the developer's machine.
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件(广水市教育局封闭管理)_qualification1.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
    except ValueError as e:
        print("Error:", e)
    else:
        print(f"Final JSON result saved to: {output_path}")
|
|
@ -143,7 +143,7 @@ def extract_from_notice(clause_path, type):
|
|||||||
return transformed_data
|
return transformed_data
|
||||||
|
|
||||||
|
|
||||||
# 假设原始数据文件路径
|
#TODO:考虑5.1这种二级标题后面换行符,但是仍然存在5.1.1情况时该怎么办 再审视一下zbtest20的处理是否合理
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json'
|
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json'
|
||||||
# file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause1.json'
|
# file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause1.json'
|
||||||
|
@ -10,21 +10,53 @@ def extract_text_from_docx(file_path):
|
|||||||
return '\n'.join([para.text for para in doc.paragraphs])
|
return '\n'.join([para.text for para in doc.paragraphs])
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path):
|
# def extract_text_from_pdf(file_path):
|
||||||
|
# # 从PDF文件中提取文本
|
||||||
|
# common_header = extract_common_header(file_path)
|
||||||
|
# pdf_document = PdfReader(file_path)
|
||||||
|
# text = ""
|
||||||
|
# # 遍历每一页
|
||||||
|
# for page in pdf_document.pages:
|
||||||
|
# # 提取当前页面的文本
|
||||||
|
# page_text = page.extract_text() if page.extract_text() else ""
|
||||||
|
# # 清洗页面文本
|
||||||
|
# page_text = clean_page_content(page_text, common_header)
|
||||||
|
# # 将清洗后的文本添加到总文本中
|
||||||
|
# text += page_text+"\n"
|
||||||
|
# return text
|
||||||
|
def extract_text_from_pdf(file_path, start_word, end_pattern):
|
||||||
# 从PDF文件中提取文本
|
# 从PDF文件中提取文本
|
||||||
common_header = extract_common_header(file_path)
|
common_header = extract_common_header(file_path)
|
||||||
pdf_document = PdfReader(file_path)
|
pdf_document = PdfReader(file_path)
|
||||||
text = ""
|
all_pages_text = []
|
||||||
# 遍历每一页
|
start_index = None
|
||||||
for page in pdf_document.pages:
|
# 处理所有页面
|
||||||
# 提取当前页面的文本
|
for i, page in enumerate(pdf_document.pages):
|
||||||
page_text = page.extract_text() if page.extract_text() else ""
|
page_text = page.extract_text() if page.extract_text() else ""
|
||||||
# 清洗页面文本
|
cleaned_text = clean_page_content(page_text, common_header)
|
||||||
page_text = clean_page_content(page_text, common_header)
|
|
||||||
# 将清洗后的文本添加到总文本中
|
|
||||||
text += page_text+"\n"
|
|
||||||
return text
|
|
||||||
|
|
||||||
|
# 在第一页查找开始位置
|
||||||
|
if i == 0 and start_index is None:
|
||||||
|
start_match = re.search(start_word, cleaned_text, re.MULTILINE)
|
||||||
|
if start_match:
|
||||||
|
start_index = start_match.start()
|
||||||
|
cleaned_text = cleaned_text[start_index:]
|
||||||
|
|
||||||
|
# 在最后一页查找结束位置
|
||||||
|
if i == len(pdf_document.pages) - 1:
|
||||||
|
for pattern in end_pattern:
|
||||||
|
matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
|
||||||
|
if matches:
|
||||||
|
end_index = matches[-1].start()
|
||||||
|
cleaned_text = cleaned_text[:end_index]
|
||||||
|
break
|
||||||
|
|
||||||
|
all_pages_text.append(cleaned_text)
|
||||||
|
|
||||||
|
# 合并所有页面的文本
|
||||||
|
full_text = "\n".join(all_pages_text)
|
||||||
|
# print(full_text)
|
||||||
|
return full_text
|
||||||
def extract_section(text, start_pattern, end_phrases):
|
def extract_section(text, start_pattern, end_phrases):
|
||||||
# 查找开始模式
|
# 查找开始模式
|
||||||
start_match = re.search(start_pattern, text)
|
start_match = re.search(start_pattern, text)
|
||||||
@ -80,9 +112,8 @@ def parse_text_by_heading(text):
|
|||||||
current_key = None
|
current_key = None
|
||||||
current_content = []
|
current_content = []
|
||||||
append_newline = False
|
append_newline = False
|
||||||
|
|
||||||
lines = text.split('\n')
|
lines = text.split('\n')
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
|
||||||
line_stripped = line.strip()
|
line_stripped = line.strip()
|
||||||
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
|
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
|
||||||
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
|
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
|
||||||
@ -94,11 +125,12 @@ def parse_text_by_heading(text):
|
|||||||
line_content = line_content.lstrip('.')
|
line_content = line_content.lstrip('.')
|
||||||
# 检查是否应该更新当前键和内容
|
# 检查是否应该更新当前键和内容
|
||||||
if current_key is None or (compare_headings(current_key, new_key) and (
|
if current_key is None or (compare_headings(current_key, new_key) and (
|
||||||
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
len(current_content) == 0 or current_content[-1][-1] != '第')): #current_content: 内容占两行则是: ['本招标项目已具备招标条件,现对本项目施工监理进行招标,项目概况见本', '章附件一。']
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
# 将之前的内容保存到data中,保留第一个换行符,后续的换行符转为空字符
|
# 将之前的内容保存到data中,保留第一个换行符,后续的换行符转为空字符
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = content_string.replace(' ', '')
|
data[current_key] = content_string.replace(' ', '')
|
||||||
|
|
||||||
current_key = new_key
|
current_key = new_key
|
||||||
current_content = [line_content]
|
current_content = [line_content]
|
||||||
# 只有当标题级别为两级(如 1.1)时,才设置 append_newline 为 True
|
# 只有当标题级别为两级(如 1.1)时,才设置 append_newline 为 True
|
||||||
@ -120,11 +152,11 @@ def convert_to_json(file_path, start_word, end_phrases):
|
|||||||
if file_path.endswith('.docx'):
|
if file_path.endswith('.docx'):
|
||||||
text = extract_text_from_docx(file_path)
|
text = extract_text_from_docx(file_path)
|
||||||
elif file_path.endswith('.pdf'):
|
elif file_path.endswith('.pdf'):
|
||||||
text = extract_text_from_pdf(file_path)
|
text = extract_text_from_pdf(file_path,start_word,end_phrases)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unsupported file format")
|
raise ValueError("Unsupported file format")
|
||||||
# 提取从 start_word 开始到 end_phrases 结束的内容
|
# 提取从 start_word 开始到 end_phrases 结束的内容
|
||||||
text = extract_section(text, start_word, end_phrases)
|
# text = extract_section(text, start_word, end_phrases)
|
||||||
# print(text)
|
# print(text)
|
||||||
parsed_data = parse_text_by_heading(text)
|
parsed_data = parse_text_by_heading(text)
|
||||||
return parsed_data
|
return parsed_data
|
||||||
@ -136,13 +168,18 @@ def convert_clause_to_json(input_path,output_folder,type=1):
|
|||||||
if type==1:
|
if type==1:
|
||||||
start_word = "投标人须知正文"
|
start_word = "投标人须知正文"
|
||||||
end_phrases = [
|
end_phrases = [
|
||||||
r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
r'^第[一二三四五六七八九十]+章\s*评标办法',
|
||||||
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
r'^评标办法前附表',
|
||||||
|
r'^附(?:录|件|表)(?:一)?[::]'
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
|
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
|
||||||
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
|
end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
|
||||||
result = convert_to_json(input_path, start_word, end_phrases)
|
result = convert_to_json(input_path, start_word, end_phrases)
|
||||||
|
# 检查输出文件夹是否存在,如果不存在则创建
|
||||||
|
if not os.path.exists(output_folder):
|
||||||
|
os.makedirs(output_folder)
|
||||||
|
print(f"Created output folder: {output_folder}")
|
||||||
file_name = "clause1.json" if type == 1 else "clause2.json"
|
file_name = "clause1.json" if type == 1 else "clause2.json"
|
||||||
output_path = os.path.join(output_folder, file_name)
|
output_path = os.path.join(output_folder, file_name)
|
||||||
with open(output_path, 'w', encoding='utf-8') as f:
|
with open(output_path, 'w', encoding='utf-8') as f:
|
||||||
@ -155,9 +192,7 @@ def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内
|
|||||||
# 读取 JSON 文件
|
# 读取 JSON 文件
|
||||||
with open(json_file_path, 'r', encoding='utf-8') as file:
|
with open(json_file_path, 'r', encoding='utf-8') as file:
|
||||||
data = json.load(file)
|
data = json.load(file)
|
||||||
|
|
||||||
processed_data = {}
|
processed_data = {}
|
||||||
|
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
# 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
|
# 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
|
||||||
if re.match(r'^\d+\.\s*$', key) and '\n' in value:
|
if re.match(r'^\d+\.\s*$', key) and '\n' in value:
|
||||||
@ -195,14 +230,14 @@ def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内
|
|||||||
json.dump(processed_data, file, ensure_ascii=False, indent=4)
|
json.dump(processed_data, file, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf'
|
||||||
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件(广水市教育局封闭管理)_qualification1.pdf'
|
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part1.pdf'
|
||||||
# start_word = "投标人须知正文"
|
# start_word = "投标人须知正文"
|
||||||
# end_phrases = [
|
# end_phrases = [
|
||||||
# r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
# r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
||||||
# r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
# r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
||||||
# ]
|
# ]
|
||||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
|
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp'
|
||||||
try:
|
try:
|
||||||
output_path = convert_clause_to_json(file_path,output_folder)
|
output_path = convert_clause_to_json(file_path,output_folder)
|
||||||
print(f"Final JSON result saved to: {output_path}")
|
print(f"Final JSON result saved to: {output_path}")
|
||||||
|
@ -72,6 +72,6 @@ def extract_text_by_page(file_path):
|
|||||||
print(f"Page {page_num + 1} is empty or text could not be extracted.")
|
print(f"Page {page_num + 1} is empty or text could not be extracted.")
|
||||||
return result
|
return result
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1)(1).pdf"
|
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part2.pdf"
|
||||||
res=extract_text_by_page(file_path)
|
res=extract_text_by_page(file_path)
|
||||||
# print(res)
|
# print(res)
|
@ -1,72 +1,14 @@
|
|||||||
import json
|
def test_append_newline():
|
||||||
import re
|
def check_append_newline(key):
|
||||||
#这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据
|
append_newline = len(key.split('.')) == 2
|
||||||
#对于同级的键,如果数量>1且键名都统一,那么将键名去掉,用列表保持它们的键值
|
return append_newline
|
||||||
#对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存
|
|
||||||
|
|
||||||
#zbtest20也有问题
|
# 测试用例
|
||||||
def contains_number_or_index(key, value):
|
test_cases = ["1.1", "1."]
|
||||||
# 判断值是否是数字或数字字符串
|
|
||||||
is_number = isinstance(value, (int, float)) or (isinstance(value, str) and value.isdigit())
|
|
||||||
# 判断键是否包含 "序号"
|
|
||||||
contains_index = '序号' in key
|
|
||||||
# 判断值中是否包含数字
|
|
||||||
contains_digit = isinstance(value, str) and re.search(r'\d+', value)
|
|
||||||
# 判断值中是否包含中文字符
|
|
||||||
contains_chinese = isinstance(value, str) and re.search(r'[\u4e00-\u9fff]', value)
|
|
||||||
# 如果值中包含数字但也有中文字符,则保留(返回 False)
|
|
||||||
if contains_digit and contains_chinese:
|
|
||||||
return False
|
|
||||||
# 如果值是数字或包含数字,且不包含中文字符,或者键包含 "序号",返回 True
|
|
||||||
return is_number or contains_index or contains_digit
|
|
||||||
|
|
||||||
#对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存
|
for case in test_cases:
|
||||||
#如果键名是"序号"或者键值中全是数字,删去序号
|
result = check_append_newline(case)
|
||||||
def preprocess_dict(data):
|
print(f"序号 '{case}': append_newline = {result}")
|
||||||
if isinstance(data, dict):
|
|
||||||
if len(data) > 1:
|
|
||||||
# 检查是否所有值都是 "" 或 "/"
|
|
||||||
if all(v == "" or v == "/" for v in data.values()):
|
|
||||||
return list(data.keys())
|
|
||||||
else:
|
|
||||||
processed = {}
|
|
||||||
for k, v in data.items():
|
|
||||||
if not contains_number_or_index(k, v):
|
|
||||||
processed_v = preprocess_dict(v)
|
|
||||||
if processed_v != "": # 只添加非空值
|
|
||||||
processed[k] = processed_v
|
|
||||||
return processed
|
|
||||||
else:
|
|
||||||
return {k: preprocess_dict(v) for k, v in data.items()}
|
|
||||||
elif isinstance(data, list):
|
|
||||||
return [preprocess_dict(item) for item in data]
|
|
||||||
else:
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
# 运行测试
|
||||||
# 测试代码
|
test_append_newline()
|
||||||
#TODO:同一层全部都是数字才成功删除,没需求了
|
|
||||||
input_data = {
|
|
||||||
"符合性审查": {
|
|
||||||
"说明": "1ha",
|
|
||||||
"www":"哈哈",
|
|
||||||
"审查标准": [
|
|
||||||
{
|
|
||||||
"序号": 1,
|
|
||||||
"内容": "投标总报价超过项目(分包)预算金额或最高限价的;"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"序号": 2,
|
|
||||||
"内容": "《投标书》、《法定代表人授权书》、《开标一览表(含明细)》未提供或不符合招标文件要求的;"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"序号": 3,
|
|
||||||
"内容": "工期(服务期限)、质保期不符合招标文件要求的;"
|
|
||||||
},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pred=preprocess_dict(input_data)
|
|
||||||
print(json.dumps(pred, ensure_ascii=False, indent=4))
|
|
||||||
# processed_data = process_dict(pred)
|
|
||||||
# print(json.dumps(processed_data, ensure_ascii=False, indent=4))
|
|
184
flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
Normal file
184
flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
Normal file
@ -0,0 +1,184 @@
|
|||||||
|
import json
|
||||||
|
import docx
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
from flask_app.main.截取pdf import clean_page_content,extract_common_header
|
||||||
|
|
||||||
|
def extract_text_from_docx(file_path):
    """Return the text of a .docx file with one paragraph per line.

    Only the paragraphs exposed by ``Document.paragraphs`` are read.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The paragraph texts joined with ``'\\n'``.
    """
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return '\n'.join(paragraph_texts)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Extract cleaned PDF text, trimmed to the wanted section.

    Every page is cleaned with ``clean_page_content``. On the first page
    everything before the first match of ``start_word`` is dropped (the
    match itself is kept). On the last page the text is cut at the last
    match of the first pattern in ``end_pattern`` that matches at all.
    Both searches use ``re.MULTILINE``.

    Args:
        file_path: Path of the PDF file to read.
        start_word: Regex marking the start of the section.
        end_pattern: Iterable of regexes marking the end, in priority
            order.

    Returns:
        The page texts joined with ``'\\n'``.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    pages = pdf_document.pages
    last_page_index = len(pages) - 1  # computed once, not per iteration

    all_pages_text = []
    for i, page in enumerate(pages):
        # Run the extraction once per page; a falsy result becomes ''.
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)

        if i == 0:
            # The section starts somewhere on the first page.
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                cleaned_text = cleaned_text[start_match.start():]

        if i == last_page_index:
            # The section ends somewhere on the last page.
            for pattern in end_pattern:
                matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
                if matches:
                    cleaned_text = cleaned_text[:matches[-1].start()]
                    break

        all_pages_text.append(cleaned_text)

    return "\n".join(all_pages_text)
|
||||||
|
|
||||||
|
def compare_headings(current, new):
    """Tell whether heading number ``new`` comes after ``current``.

    Both headings are dotted numbers such as ``'1.'``, ``'2.3'`` or
    ``'2.3.1'``. Non-numeric and empty components (e.g. the empty string
    after a trailing dot) are ignored.

    Args:
        current: Heading number currently being collected.
        new: Candidate heading number found on a later line.

    Returns:
        True when ``new`` is greater at the first differing level, or when
        all shared levels are equal and ``new`` has more levels (a
        sub-heading of ``current``). False otherwise, including when both
        are equal.
    """
    current_nums = [int(part) for part in current.split('.') if part.isdigit()]
    new_nums = [int(part) for part in new.split('.') if part.isdigit()]

    # List comparison is lexicographic: the first differing element decides,
    # and a proper prefix sorts before the longer list.
    return new_nums > current_nums
|
||||||
|
|
||||||
|
|
||||||
|
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the collected content.

    Args:
        content: Iterable of string fragments collected so far.
        keywords: Substrings that force a line break when present.
        max_length: Joined content of at most this many characters also
            gets a line break.

    Returns:
        True when the joined, stripped content is short enough or contains
        any keyword; False otherwise.
    """
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    for keyword in keywords:
        if keyword in joined:
            return True
    return False
|
||||||
|
|
||||||
|
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append a line to the content buffer, optionally preceded by a newline.

    When ``append_newline`` is set and ``should_add_newline`` approves the
    content collected so far, a ``'\\n'`` fragment is inserted before the
    new line. The flag is consumed either way.

    Args:
        current_content: List of fragments; mutated in place.
        line_content: Text to append.
        append_newline: Whether a line break is still pending.
        keywords: Forwarded to ``should_add_newline``.

    Returns:
        The updated flag: False once a pending line break has been
        handled, otherwise the value passed in.
    """
    wants_break = append_newline and should_add_newline(current_content, keywords)
    if wants_break:
        current_content.append('\n')
    current_content.append(line_content)
    return False if append_newline else append_newline
|
||||||
|
|
||||||
|
"""
|
||||||
|
保存换行符的具体逻辑:
|
||||||
|
|
||||||
|
对于二级标题(如 1.1),如果其后的内容包含关键词或内容较短(<=20字符),会在内容前添加一个换行符。
|
||||||
|
这个换行符会被保留在 current_content 列表中。
|
||||||
|
当处理下一个标题时,之前的内容(包括可能存在的换行符)会被合并并保存到 data 字典中。
|
||||||
|
"""
|
||||||
|
#提取json主函数
|
||||||
|
def parse_text_by_heading(text):
    """Split clause text into a ``{heading: content}`` dictionary.

    Two kinds of headings are recognised:

    * Chinese-numbered section titles such as ``'一、说明'`` are stored as
      ``{'一': '说明'}`` and close the clause being collected.
    * Dotted numbers (``'1.1'``, ``'2.2.3'``) or a single number followed
      by a dot (``'1.'``) start a new clause when the number sorts after
      the current one (see ``compare_headings``) and the previously
      collected fragment does not end with '第' (in which case the number
      is treated as a continuation of the running text).

    All other non-empty lines are appended to the current clause. Spaces
    are removed from stored clause content.

    Args:
        text: Text to parse, lines separated by ``'\\n'``.

    Returns:
        Dict mapping headings to their content, in order of appearance.
    """
    keywords = ['包含', '以下']
    # Compiled once instead of once per line.
    chinese_heading = re.compile(r'^([一二三四五六七八九十]+、)\s*(.+)$')
    multi_level = re.compile(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)')
    single_level = re.compile(r'^(\d+\.)\s*(.+)$')

    data = {}
    current_key = None
    current_content = []
    append_newline = False

    def flush():
        # Store the clause collected so far, with spaces removed.
        if current_key is not None:
            data[current_key] = ''.join(current_content).strip().replace(' ', '')

    for line in text.split('\n'):
        line_stripped = line.strip()

        chinese_match = chinese_heading.match(line_stripped)
        if chinese_match:
            # Save the pending clause first; resetting the state without
            # flushing dropped the last clause of every section.
            flush()
            chinese_key, chinese_value = chinese_match.groups()
            data[chinese_key.rstrip('、')] = chinese_value
            current_key = None
            current_content = []
            append_newline = False
            continue

        match = multi_level.match(line_stripped) or single_level.match(line_stripped)
        if not match:
            if line_stripped:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords)
            continue

        new_key, line_content = match.groups()
        line_content = line_content.lstrip('.')

        # endswith() is safe when the last fragment is an empty string (a
        # heading line without text); indexing [-1][-1] raised IndexError.
        continues_reference = bool(current_content) and current_content[-1].endswith('第')
        starts_new_clause = current_key is None or (
            compare_headings(current_key, new_key) and not continues_reference)

        if starts_new_clause:
            flush()
            current_key = new_key
            current_content = [line_content]
            # True for keys that split into exactly two parts on '.',
            # i.e. '1.1' and also '1.' (trailing empty part).
            append_newline = len(new_key.split('.')) == 2
        else:
            append_newline = handle_content_append(
                current_content, line_content, append_newline, keywords)

    # Store the last clause.
    flush()
    return data
|
||||||
|
|
||||||
|
# def convert_to_json(file_path, start_word, end_phrases):
|
||||||
|
# if file_path.endswith('.docx'):
|
||||||
|
# text = extract_text_from_docx(file_path)
|
||||||
|
# elif file_path.endswith('.pdf'):
|
||||||
|
# text = extract_text_from_pdf(file_path,start_word,end_phrases)
|
||||||
|
# else:
|
||||||
|
# raise ValueError("Unsupported file format")
|
||||||
|
# # print(text)
|
||||||
|
# parsed_data = parse_text_by_heading(text)
|
||||||
|
# return parsed_data
|
||||||
|
|
||||||
|
def convert_clause_to_json(input_path, output_folder, type=1):
    """Extract clauses from a PDF and save them as a JSON file.

    Args:
        input_path: Path of the source PDF file.
        output_folder: Folder receiving the JSON file; created if missing.
        type: 1 selects the section starting at the "说明" / "总则"
            heading and writes ``clause1.json``; any other value selects
            the tender announcement section and writes ``clause2.json``.

    Returns:
        Path of the written JSON file, or ``""`` when ``input_path`` does
        not exist.
    """
    if not os.path.exists(input_path):
        print(f"The specified file does not exist: {input_path}")
        return ""

    if type == 1:
        start_word = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)'
        end_pattern = [r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+']
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
        end_pattern = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']

    # Use the parameter: the previous code read the undefined name
    # `file_path`, which only worked when the module ran as a script.
    text = extract_text_from_pdf(input_path, start_word, end_pattern)
    result = parse_text_by_heading(text)

    # Create the output folder on first use.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created output folder: {output_folder}")

    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual run against a local sample document; the paths below are
    # specific to the developer's machine.
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part2.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
    try:
        output_path = convert_clause_to_json(file_path, output_folder)
    except ValueError as e:
        print("Error:", e)
    else:
        print(f"Final JSON result saved to: {output_path}")
|
@ -161,14 +161,18 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
|||||||
return generated_files
|
return generated_files
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None):
|
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,output_suffix="normal"):
|
||||||
start_page = None
|
start_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
for i, page in enumerate(pdf_document.pages):
|
for i, page in enumerate(pdf_document.pages):
|
||||||
text = page.extract_text() or ""
|
text = page.extract_text() or ""
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
|
if output_suffix=="tobidders_notice":
|
||||||
continue
|
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and start_page is not None:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
|
||||||
|
continue
|
||||||
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
|
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||||
start_page = i
|
start_page = i
|
||||||
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
|
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
|
||||||
@ -183,8 +187,9 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
exclusion_pattern = None
|
exclusion_pattern = None
|
||||||
if output_suffix == "tobidders_notice":
|
if output_suffix == "tobidders_notice":
|
||||||
|
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||||
start_page, mid_page, end_page = extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern,
|
start_page, mid_page, end_page = extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern,
|
||||||
begin_page, common_header)
|
begin_page, common_header,exclusion_pattern)
|
||||||
if start_page is None or mid_page is None or end_page is None:
|
if start_page is None or mid_page is None or end_page is None:
|
||||||
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||||
return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header)
|
return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header)
|
||||||
@ -208,13 +213,15 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
print(f"Error processing {pdf_path}: {e}")
|
print(f"Error processing {pdf_path}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
|
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern):
|
||||||
start_page = None
|
start_page = None
|
||||||
mid_page = None
|
mid_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
for i, page in enumerate(pdf_document.pages):
|
for i, page in enumerate(pdf_document.pages):
|
||||||
text = page.extract_text() or ""
|
text = page.extract_text() or ""
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
|
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||||
|
continue
|
||||||
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
|
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||||
start_page = i
|
start_page = i
|
||||||
if start_page is not None and mid_page is None and re.search(
|
if start_page is not None and mid_page is None and re.search(
|
||||||
@ -266,6 +273,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
|
|||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
|
||||||
)
|
)
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
|
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||||
# 提取第一部分
|
# 提取第一部分
|
||||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header)
|
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header)
|
||||||
if start_page1 is None or end_page1 is None:
|
if start_page1 is None or end_page1 is None:
|
||||||
@ -273,7 +281,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
|
|||||||
return None, None
|
return None, None
|
||||||
# 提取第二部分
|
# 提取第二部分
|
||||||
start_page2 = end_page1 # 第二部分的开始页就是第一部分的结束页
|
start_page2 = end_page1 # 第二部分的开始页就是第一部分的结束页
|
||||||
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header)
|
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,exclusion_pattern)
|
||||||
if end_page2 is None:
|
if end_page2 is None:
|
||||||
print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
|
print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
|
||||||
return None, None
|
return None, None
|
||||||
@ -390,7 +398,7 @@ def truncate_pdf_multiple(input_path, output_folder):
|
|||||||
|
|
||||||
# TODO:交通智能系统和招标(1)(1)文件有问题 selection=4的时候exclusion有问题
|
# TODO:交通智能系统和招标(1)(1)文件有问题 selection=4的时候exclusion有问题
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
|
||||||
# truncate_pdf_multiple(input_path,output_folder)
|
# truncate_pdf_multiple(input_path,output_folder)
|
||||||
selection = 4 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2 4.投标人须知前附表
|
selection = 4 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2 4.投标人须知前附表
|
||||||
|
Loading…
x
Reference in New Issue
Block a user