11.13 无效投标
This commit is contained in:
parent
b61d240957
commit
6e0b2e4a68
@ -33,6 +33,9 @@ def handle_content_append(current_content, line_content, append_newline, keyword
|
|||||||
current_content.append('\n')
|
current_content.append('\n')
|
||||||
return append_newline
|
return append_newline
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
def parse_text_by_heading(text):
|
def parse_text_by_heading(text):
|
||||||
keywords = ['包含', '以下']
|
keywords = ['包含', '以下']
|
||||||
data = {}
|
data = {}
|
||||||
@ -44,10 +47,16 @@ def parse_text_by_heading(text):
|
|||||||
skip_subheadings = False
|
skip_subheadings = False
|
||||||
last_main_number = None
|
last_main_number = None
|
||||||
temp_title = None # 临时存储以点号开头但不带数字的标题
|
temp_title = None # 临时存储以点号开头但不带数字的标题
|
||||||
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') #一、
|
|
||||||
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') #(一)
|
# 定义所有需要的正则表达式模式
|
||||||
|
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') # 一、
|
||||||
|
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') # (一)
|
||||||
|
pattern_letter_initial = re.compile(r'^([A-Z])\.?\s*(.*)$') # 初始扫描时匹配 A内容 或 A. 内容
|
||||||
|
pattern_letter = re.compile(r'^([A-Z])\.\s*(.*)$') # 主循环中严格匹配 A. 内容
|
||||||
|
pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$') # 1、内容
|
||||||
|
|
||||||
initial_heading_pattern = None
|
initial_heading_pattern = None
|
||||||
special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:'] # 定义特殊章节关键词
|
special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:','雷同认定'] # 定义特殊章节关键词
|
||||||
in_special_section = False # 标志是否在特殊章节中
|
in_special_section = False # 标志是否在特殊章节中
|
||||||
lines = text.split('\n')
|
lines = text.split('\n')
|
||||||
|
|
||||||
@ -69,6 +78,18 @@ def parse_text_by_heading(text):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# 预先扫描前5行,检查是否有匹配任何标题模式
|
||||||
|
first_five_lines = lines[:5]
|
||||||
|
has_initial_heading_patterns = False
|
||||||
|
for line in first_five_lines:
|
||||||
|
line_stripped = line.strip().replace('.', '.')
|
||||||
|
if (pattern_numbered.match(line_stripped) or
|
||||||
|
pattern_parentheses.match(line_stripped) or
|
||||||
|
pattern_letter_initial.match(line_stripped) or
|
||||||
|
pattern_arabic.match(line_stripped)):
|
||||||
|
has_initial_heading_patterns = True
|
||||||
|
break
|
||||||
|
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
line_stripped = line.strip().replace('.', '.')
|
line_stripped = line.strip().replace('.', '.')
|
||||||
|
|
||||||
@ -97,12 +118,11 @@ def parse_text_by_heading(text):
|
|||||||
dot_text_match = re.match(r'^[..]\s*(\D.+)$', line_stripped)
|
dot_text_match = re.match(r'^[..]\s*(\D.+)$', line_stripped)
|
||||||
|
|
||||||
# 匹配不带点号的纯数字开头的情况,例如 '27xxxxx'
|
# 匹配不带点号的纯数字开头的情况,例如 '27xxxxx'
|
||||||
# pure_number_match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
|
pure_number_match = re.match(r'^(\d+)([^.\d)()].*)', line_stripped) # 不允许出现右括号
|
||||||
pure_number_match = re.match(r'^(\d+)([^.\d)()].*)', line_stripped) #不允许出现右括号
|
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
new_key, line_content = match.groups()
|
new_key, line_content = match.groups()
|
||||||
line_content = line_content.lstrip('..、,')
|
line_content = line_content.lstrip('..、,').replace(' ', '')
|
||||||
|
|
||||||
if any(keyword in line_content for keyword in keywords):
|
if any(keyword in line_content for keyword in keywords):
|
||||||
skip_subheadings = True
|
skip_subheadings = True
|
||||||
@ -127,13 +147,13 @@ def parse_text_by_heading(text):
|
|||||||
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
|
data[current_key] = data.get(current_key, '') + content_string
|
||||||
current_key = new_key
|
current_key = new_key
|
||||||
current_content = [line_content]
|
current_content = [line_content]
|
||||||
append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
||||||
last_main_number = new_key.split('.')[0]
|
last_main_number = new_key.split('.')[0]
|
||||||
else:
|
else:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
||||||
|
|
||||||
elif dot_match:
|
elif dot_match:
|
||||||
# 处理以点号开头并带有数字的情况
|
# 处理以点号开头并带有数字的情况
|
||||||
@ -143,12 +163,12 @@ def parse_text_by_heading(text):
|
|||||||
new_key = f"{last_main_number}.{sub_number}"
|
new_key = f"{last_main_number}.{sub_number}"
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
|
data[current_key] = data.get(current_key, '') + content_string
|
||||||
current_key = new_key
|
current_key = new_key
|
||||||
current_content = [line_content]
|
current_content = [line_content]
|
||||||
append_newline = True
|
append_newline = True
|
||||||
else:
|
else:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
||||||
|
|
||||||
elif dot_text_match:
|
elif dot_text_match:
|
||||||
# 处理以点号开头但不带数字的情况,存储到临时变量
|
# 处理以点号开头但不带数字的情况,存储到临时变量
|
||||||
@ -159,7 +179,7 @@ def parse_text_by_heading(text):
|
|||||||
# 处理不带点号的纯数字开头的情况
|
# 处理不带点号的纯数字开头的情况
|
||||||
new_key_candidate, line_content = pure_number_match.groups()
|
new_key_candidate, line_content = pure_number_match.groups()
|
||||||
new_key_candidate += '.' # 添加点号
|
new_key_candidate += '.' # 添加点号
|
||||||
line_content = line_content.lstrip('..、,')
|
line_content = line_content.lstrip('..、,').replace(' ', '')
|
||||||
|
|
||||||
# 提取当前键名和新键名的数值部分
|
# 提取当前键名和新键名的数值部分
|
||||||
def extract_number(key):
|
def extract_number(key):
|
||||||
@ -179,29 +199,30 @@ def parse_text_by_heading(text):
|
|||||||
# 开始新的标题
|
# 开始新的标题
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
|
data[current_key] = data.get(current_key, '') + content_string
|
||||||
current_key = new_key_candidate
|
current_key = new_key_candidate
|
||||||
current_content = [line_content]
|
current_content = [line_content]
|
||||||
append_newline = True
|
append_newline = True
|
||||||
last_main_number = new_key_candidate.rstrip('.')
|
last_main_number = new_key_candidate.rstrip('.')
|
||||||
else:
|
else:
|
||||||
# 将当前行视为当前标题的内容
|
# 将当前行视为当前标题的内容
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if not skip_subheadings and not in_special_section: # 增加 in_special_section 判断
|
# 根据预先设置的标志决定是否执行这部分代码
|
||||||
|
if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
|
||||||
numbered_match = pattern_numbered.match(line_stripped)
|
numbered_match = pattern_numbered.match(line_stripped)
|
||||||
parenthesis_match = pattern_parentheses.match(line_stripped)
|
parentheses_match = pattern_parentheses.match(line_stripped)
|
||||||
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
|
letter_match = pattern_letter.match(line_stripped)
|
||||||
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
|
arabic_match = pattern_arabic.match(line_stripped)
|
||||||
|
|
||||||
# 判断当前行是否匹配了任何标题模式
|
# 判断当前行是否匹配了任何标题模式
|
||||||
if numbered_match or parenthesis_match or letter_match or arabic_match:
|
if numbered_match or parentheses_match or letter_match or arabic_match:
|
||||||
# 如果初始标题模式尚未设置,则记录当前匹配的标题模式
|
# 如果初始标题模式尚未设置,则记录当前匹配的标题模式
|
||||||
if initial_heading_pattern is None:
|
if initial_heading_pattern is None:
|
||||||
if numbered_match:
|
if numbered_match:
|
||||||
initial_heading_pattern = 'numbered'
|
initial_heading_pattern = 'numbered'
|
||||||
elif parenthesis_match:
|
elif parentheses_match:
|
||||||
initial_heading_pattern = 'parentheses'
|
initial_heading_pattern = 'parentheses'
|
||||||
elif letter_match:
|
elif letter_match:
|
||||||
initial_heading_pattern = 'letter'
|
initial_heading_pattern = 'letter'
|
||||||
@ -211,7 +232,7 @@ def parse_text_by_heading(text):
|
|||||||
# 确定当前匹配的标题模式
|
# 确定当前匹配的标题模式
|
||||||
if numbered_match:
|
if numbered_match:
|
||||||
current_heading_pattern = 'numbered'
|
current_heading_pattern = 'numbered'
|
||||||
elif parenthesis_match:
|
elif parentheses_match:
|
||||||
current_heading_pattern = 'parentheses'
|
current_heading_pattern = 'parentheses'
|
||||||
elif letter_match:
|
elif letter_match:
|
||||||
current_heading_pattern = 'letter'
|
current_heading_pattern = 'letter'
|
||||||
@ -223,7 +244,7 @@ def parse_text_by_heading(text):
|
|||||||
# 保存之前的 key 的内容
|
# 保存之前的 key 的内容
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
|
data[current_key] = data.get(current_key, '') + content_string
|
||||||
if current_key_chinese is not None:
|
if current_key_chinese is not None:
|
||||||
data[current_key_chinese] = current_value_chinese
|
data[current_key_chinese] = current_value_chinese
|
||||||
current_key_chinese = None
|
current_key_chinese = None
|
||||||
@ -233,8 +254,8 @@ def parse_text_by_heading(text):
|
|||||||
current_key_chinese = numbered_match.group(1)
|
current_key_chinese = numbered_match.group(1)
|
||||||
current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
|
current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
|
||||||
elif current_heading_pattern == 'parentheses':
|
elif current_heading_pattern == 'parentheses':
|
||||||
current_key_chinese = parenthesis_match.group(1)
|
current_key_chinese = parentheses_match.group(1)
|
||||||
current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('..、,').replace(' ', '')
|
current_value_chinese = line_stripped[parentheses_match.end():].lstrip('..、,').replace(' ', '')
|
||||||
elif current_heading_pattern == 'letter':
|
elif current_heading_pattern == 'letter':
|
||||||
# 字母标题处理
|
# 字母标题处理
|
||||||
letter_key, letter_value = letter_match.groups()
|
letter_key, letter_value = letter_match.groups()
|
||||||
@ -248,27 +269,27 @@ def parse_text_by_heading(text):
|
|||||||
elif current_heading_pattern == 'arabic':
|
elif current_heading_pattern == 'arabic':
|
||||||
arabic_key, arabic_value = arabic_match.groups()
|
arabic_key, arabic_value = arabic_match.groups()
|
||||||
current_key = arabic_key.replace('、', '.')
|
current_key = arabic_key.replace('、', '.')
|
||||||
current_content = [arabic_value]
|
current_content = [arabic_value.replace(' ', '')]
|
||||||
append_newline = True
|
append_newline = True
|
||||||
last_main_number = current_key.rstrip('.')
|
last_main_number = current_key.rstrip('.')
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
# 当前标题模式与初始模式不一致,将该行视为内容
|
# 当前标题模式与初始模式不一致,将该行视为内容
|
||||||
if line_stripped:
|
if line_stripped:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
||||||
else:
|
else:
|
||||||
# 未匹配到任何标题模式,将该行视为内容
|
# 未匹配到任何标题模式,将该行视为内容
|
||||||
if line_stripped:
|
if line_stripped:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
||||||
else:
|
else:
|
||||||
# 在特殊章节中,所有内容都作为当前标题的内容
|
# 在特殊章节中,所有内容都作为当前标题的内容
|
||||||
if line_stripped:
|
if line_stripped:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
||||||
|
|
||||||
# 最后保存最后一个 key 对应的内容
|
# 最后保存最后一个 key 对应的内容
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
|
data[current_key] = data.get(current_key, '') + content_string
|
||||||
if current_key_chinese is not None:
|
if current_key_chinese is not None:
|
||||||
data[current_key_chinese] = current_value_chinese
|
data[current_key_chinese] = current_value_chinese
|
||||||
if temp_title:
|
if temp_title:
|
||||||
@ -277,6 +298,10 @@ def parse_text_by_heading(text):
|
|||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path, start_word, end_pattern):
|
def extract_text_from_pdf(file_path, start_word, end_pattern):
|
||||||
# 从PDF文件中提取文本
|
# 从PDF文件中提取文本
|
||||||
common_header = extract_common_header(file_path)
|
common_header = extract_common_header(file_path)
|
||||||
@ -308,3 +333,4 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
|
|||||||
# 合并所有页面的文本
|
# 合并所有页面的文本
|
||||||
full_text = "\n".join(all_pages_text)
|
full_text = "\n".join(all_pages_text)
|
||||||
return full_text
|
return full_text
|
||||||
|
|
||||||
|
@ -96,7 +96,7 @@ def extract_text_by_page(file_path):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||||
file_path ="C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
|
file_path =r"C:\Users\Administrator\Desktop\货物标\output4_2\招标文件111_tobidders_notice_part2.pdf"
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||||
|
@ -591,13 +591,13 @@ if __name__ == "__main__":
|
|||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||||
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
|
||||||
# files=truncate_pdf_multiple(input_path,output_folder)
|
# files=truncate_pdf_multiple(input_path,output_folder)
|
||||||
# selections = [4, 1] # 仅处理 selection 4、1
|
# selections = [4, 1] # 仅处理 selection 4、1
|
||||||
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||||
# print(files)
|
# print(files)
|
||||||
selection = 3 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
selection = 1 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
print(generated_files)
|
print(generated_files)
|
||||||
# print("生成的文件:", generated_files)
|
# print("生成的文件:", generated_files)
|
||||||
|
@ -12,11 +12,12 @@ def convert_clause_to_json(file_path, output_folder, type=1):
|
|||||||
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
|
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
|
||||||
)
|
)
|
||||||
end_pattern = (
|
end_pattern = (
|
||||||
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
# r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
||||||
|
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
|
||||||
r'评标办法前附表|'
|
r'评标办法前附表|'
|
||||||
r'附录(?:一)?[::]|'
|
r'附录(?:一)?[::]|'
|
||||||
r'附件(?:一)?[::]|'
|
r'附件(?:一)?[::]|'
|
||||||
r'附表(?:一)?[::])$'
|
r'附表(?:一)?[::]'
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# 对于其他类型,使用原始字符串定义 start_word 和 end_pattern
|
# 对于其他类型,使用原始字符串定义 start_word 和 end_pattern
|
||||||
@ -29,7 +30,8 @@ def convert_clause_to_json(file_path, output_folder, type=1):
|
|||||||
r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$'
|
r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$'
|
||||||
)
|
)
|
||||||
end_pattern = (
|
end_pattern = (
|
||||||
r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$'
|
r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$|'
|
||||||
|
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$'
|
||||||
)
|
)
|
||||||
|
|
||||||
if file_path.endswith('.pdf'):
|
if file_path.endswith('.pdf'):
|
||||||
@ -91,8 +93,9 @@ def convert_clause_to_json(file_path, output_folder, type=1):
|
|||||||
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
|
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
|
# file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
|
||||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
|
file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest4_tobidders_notice.pdf'
|
||||||
|
output_folder = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'
|
||||||
try:
|
try:
|
||||||
output_path = convert_clause_to_json(file_path,output_folder)
|
output_path = convert_clause_to_json(file_path,output_folder)
|
||||||
print(f"Final JSON result saved to: {output_path}")
|
print(f"Final JSON result saved to: {output_path}")
|
||||||
|
@ -80,7 +80,7 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
|||||||
output_suffix="normal"):
|
output_suffix="normal"):
|
||||||
start_page = None
|
start_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
for i, page in enumerate(pdf_document.pages):
|
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
|
||||||
text = page.extract_text() or ""
|
text = page.extract_text() or ""
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
if output_suffix == "tobidders_notice":
|
if output_suffix == "tobidders_notice":
|
||||||
@ -430,6 +430,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
|||||||
# return path1, path2
|
# return path1, path2
|
||||||
|
|
||||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
|
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
|
||||||
|
output_suffix="tobidders_notice"
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
|
||||||
)
|
)
|
||||||
@ -462,8 +463,8 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
|
|||||||
return start_page1, end_page1,end_page1
|
return start_page1, end_page1,end_page1
|
||||||
|
|
||||||
# 如果不包含排除关键词,继续提取第二部分
|
# 如果不包含排除关键词,继续提取第二部分
|
||||||
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2-1, common_header,
|
||||||
exclusion_pattern)
|
exclusion_pattern,output_suffix)
|
||||||
|
|
||||||
if end_page2 is None:
|
if end_page2 is None:
|
||||||
return start_page1, end_page1,end_page1
|
return start_page1, end_page1,end_page1
|
||||||
@ -471,9 +472,6 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
|
|||||||
return start_page1, end_page1,end_page2
|
return start_page1, end_page1,end_page2
|
||||||
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||||
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
@ -834,16 +832,16 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
|
|||||||
#ztbfile.pdf少资格评审 包头少符合性评审
|
#ztbfile.pdf少资格评审 包头少符合性评审
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\1ca1d27d-fc21-4697-8075-9027103df030\\ztbfile.pdf"
|
input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output3"
|
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output4_2"
|
||||||
# files = truncate_pdf_multiple(input_path, output_folder)
|
# files = truncate_pdf_multiple(input_path, output_folder)
|
||||||
# selections = [1,4]
|
# selections = [1,4]
|
||||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||||
# print(files)
|
# print(files)
|
||||||
selection = 3# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
selection = 4# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
print(generated_files)
|
print(generated_files)
|
@ -128,13 +128,14 @@ def clean_content(content):
|
|||||||
return cleaned_content
|
return cleaned_content
|
||||||
|
|
||||||
#如果file_path为空,返回""
|
#如果file_path为空,返回""
|
||||||
|
#TODO:这里的start_word end_pattern可以优化
|
||||||
def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"):
|
def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"):
|
||||||
if not os.path.exists(file_path):
|
if not os.path.exists(file_path):
|
||||||
print(f"The specified file does not exist: {file_path}")
|
print(f"The specified file does not exist: {file_path}")
|
||||||
return ""
|
return ""
|
||||||
if type == 1:
|
if type == 1:
|
||||||
start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
|
start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
|
||||||
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
|
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*)$'
|
||||||
else:
|
else:
|
||||||
start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$'
|
start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$'
|
||||||
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
|
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
|
||||||
@ -175,10 +176,10 @@ def process_folder(input_folder, output_folder):
|
|||||||
|
|
||||||
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
|
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
|
||||||
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
|
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
|
||||||
|
#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
||||||
file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zb_tobidders_notice.pdf'
|
file_path=r'C:\Users\Administrator\Desktop\货物标\output4_2\招标文件111_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||||
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output3\tmp'
|
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output3\tmp'
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user