12.9 Fix parsing bug (修复解析bug)
commit c0c7871767 (parent e5e63e400b)
@@ -76,7 +76,7 @@ def clean_page_content(text, common_header):
 text = re.sub(r'^第\d+页\s*', '', text)
 # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
 text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 投标人须知这块, 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
-text = re.sub(r'^\s*\/\s*(共\s*)?\d+\s*(页)?\s*', '', text) #删除/123 /共123 /共123页 /123页
+text = re.sub(r'^\s*\/?\s*(共\s*)?\d+\s*(页)?\s*', '', text) #删除/123 /共123 /共123页 /123页
 text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
 return text

@@ -4,7 +4,7 @@ import re
 from functools import cmp_to_key

 from flask_app.general.json_utils import clean_json_string
-from flask_app.general.通义千问long import upload_file, qianwen_long
+from flask_app.general.通义千问long import upload_file, qianwen_long_stream


 def compare_headings(a, b):
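A quick illustrative check of the relaxed page-number pattern above (not part of the commit; the sample strings are invented): making the leading slash optional lets the same substitution also strip headers of the form "共 N 页" that carry no slash at all.

    import re
    pattern = r'^\s*\/?\s*(共\s*)?\d+\s*(页)?\s*'
    for s in ['/共129页 投标人须知', '共 129 页 投标人须知', '129 投标人须知']:
        print(re.sub(pattern, '', s))  # every sample now reduces to '投标人须知'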
@@ -321,7 +321,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
 # }
 # """,
 1:"""
-该招标文件中对投标文件的要求有哪些?请以json格式给我返回结果,外层键名为文中提到的有关投标文件的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考:
+该招标文件中对投标文件的要求有哪些?请以json格式给我返回结果,外层键名为文中提到的有关投标文件的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。
+其他要求:
+1.若原文中为对应内容有表格表述,请将该表格用markdown语法返回,每行写在键值中的一个字符串中。
+
+示例如下,仅供格式参考:
 {
 "编写要求":"编写要求xxx",
 "格式要求":{

@@ -355,7 +359,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
 # }
 # """,
 2:"""
-该招标文件中开评定标要求(或磋商流程内容)是什么?请以json格式给我返回结果,外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。示例如下,仅供格式参考:
+该招标文件中开评定标要求(或磋商流程内容)是什么?请以json格式给我返回结果,外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求,对于各子要求,你可以用嵌套键值对组织回答,其中嵌套键名为你对相关子要求的总结或是原文中的标题,而最内层的键值应该完全与原文内容保持一致;若存在多个并列内容,那么键值为一个列表,列表中包含若干描述该要求的字符串,要求键值与原文内容一致,不得总结、删减。
+其他要求:
+1.若原文中为对应内容有表格表述,请将该表格用markdown语法返回,每行写在键值中的一个字符串中。
+
+示例如下,仅供格式参考:
 {
 "开标":"招标文件关于项目开标的要求",
 "开标异议":"招标文件中关于开标异议的项",

@@ -387,7 +395,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
 return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
 # 调用大模型并处理响应
 try:
-res = qianwen_long(file_id, user_query)
+res = qianwen_long_stream(file_id, user_query)
 cleaned_res = clean_json_string(res)
 return cleaned_res
 except Exception as e:
@@ -51,15 +51,25 @@ def parse_text_by_heading(text):
 # 定义所有需要的正则表达式模式
 pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') # 一、
 pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') # (一)
-pattern_letter_initial = re.compile(r'^([A-Z])\.?\s*(.*)$') # 初始扫描时匹配 A内容 或 A. 内容
-pattern_letter = re.compile(r'^([A-Z])\.\s*(.*)$') # 主循环中严格匹配 A. 内容
-pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$') # 1、内容
+pattern_letter_initial = re.compile(r'^([A-Z])[..、]\s*(.*)$') # 初始扫描时匹配 A内容 或 A. 内容
+pattern_letter = re.compile(r'^([A-Z])[..、]\s*(.*)$') # 主循环中严格匹配 A. 内容
+# pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$') # 1、内容

 initial_heading_pattern = None
 special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:','雷同认定','包括以下内容'] # 定义特殊章节关键词
 in_special_section = False # 标志是否在特殊章节中
 lines = text.split('\n')

+def get_current_number(key_chinese):
+chinese_to_number = {
+'一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
+'六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
+'十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
+}
+if not key_chinese:
+return -1
+return chinese_to_number.get(key_chinese, -1)
+
 def check_year_pattern(line):
 # 检查是否为年份模式,如 '2014年'
 line_no_spaces = line.replace(' ', '')

@@ -78,21 +88,26 @@ def parse_text_by_heading(text):
 return True
 return False

-# 预先扫描前5行,检查是否有匹配任何标题模式
+# 预先扫描前5行,检查是否有匹配任何标题模式 '一、总则' 这种
 first_five_lines = lines[:5]
 has_initial_heading_patterns = False
 for line in first_five_lines:
 line_stripped = line.strip().replace('.', '.')
+if line_stripped.startswith("##"):
+line_stripped = line_stripped[2:] # Remove "##"
 if (pattern_numbered.match(line_stripped) or
 pattern_parentheses.match(line_stripped) or
-pattern_letter_initial.match(line_stripped) or
-pattern_arabic.match(line_stripped)):
+pattern_letter_initial.match(line_stripped)
+):
 has_initial_heading_patterns = True
 break

 for i, line in enumerate(lines):
+in_double_hash_mode = False
 line_stripped = line.strip().replace('.', '.')
+if line_stripped.startswith("##"):
+in_double_hash_mode = True
+line_stripped = line_stripped[2:] # Remove "##"
 # 如果在一个章节内,跳过年份模式的行
 if check_year_pattern(line_stripped) and current_key is not None:
 current_content.append(line_stripped)

@@ -115,10 +130,10 @@ def parse_text_by_heading(text):
 dot_match = re.match(r'^[..](\d+(?:[..]\d+)*)\s*(.+)$', line_stripped)

 # 匹配以点号开头但不带数字的情况,例如 '.投标文件的制作和签署'
-dot_text_match = re.match(r'^[..]\s*(\D.+)$', line_stripped)
+dot_text_match = re.match(r'^[..、]\s*(\D.+)$', line_stripped)

 # 匹配不带点号的纯数字开头的情况,例如 '27xxxxx'
-pure_number_match = re.match(r'^(\d+)([^.\d)()].*)', line_stripped) # 不允许出现右括号
+pure_number_match = re.match(r'^(\d+)([^.\d)()号条款].*)', line_stripped) # 不允许出现右括号

 if match:
 new_key, line_content = match.groups()

@@ -132,7 +147,7 @@ def parse_text_by_heading(text):
 # 保存中文标题的内容
 if current_key_chinese is not None:
 data[current_key_chinese] = current_value_chinese
-current_key_chinese = None
+# current_key_chinese = None

 # 处理临时标题与新主标题的关联
 if temp_title:
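Two small illustrative checks of the heading patterns changed above (not part of the commit; the sample lines are invented). The widened letter pattern now accepts "A、" and the fullwidth dot as well as "A.", while the extended character class behind pure_number_match stops lines such as "27号条款…" from being misread as a new numbered heading:

    import re
    pattern_letter = re.compile(r'^([A-Z])[..、]\s*(.*)$')
    print(pattern_letter.match('B、开标程序').groups())          # ('B', '开标程序')
    pure_number_match = re.compile(r'^(\d+)([^.\d)()号条款].*)')
    print(bool(pure_number_match.match('27号条款另有规定')))      # False -> treated as content
    print(bool(pure_number_match.match('27投标文件的递交')))      # True -> treated as a heading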
@@ -156,45 +171,52 @@ def parse_text_by_heading(text):
 append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

 elif dot_match:
-# 处理以点号开头并带有数字的情况
-sub_number, line_content = dot_match.groups()
-sub_number = sub_number.replace('.', '.')
-if last_main_number:
-new_key = f"{last_main_number}.{sub_number}"
+if in_double_hash_mode:
+# 处理以点号开头并带有数字的情况
+sub_number, line_content = dot_match.groups()
+sub_number = sub_number.replace('.', '.')
+if last_main_number:
+new_key = f"{last_main_number}.{sub_number}"
+if current_key is not None:
+content_string = ''.join(current_content).strip()
+data[current_key] = data.get(current_key, '') + content_string
+current_key = new_key
+current_content = [line_content]
+append_newline = True
+else:
+append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
+else:
+append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
+in_special_section)
+elif dot_text_match:
+if in_double_hash_mode:
+# 处理以点号开头但不带数字的情况,自动生成下一个一级序号
+temp_content = dot_text_match.group(1).strip()
+if last_main_number:
+# 生成下一个一级序号
+try:
+# 尝试将 last_main_number 转为整数并加1
+next_main_number = str(int(last_main_number) + 1) + '.'
+except ValueError:
+# 如果 last_main_number 不是纯数字,处理为中文序号或其他逻辑
+# 这里假设 last_main_number 是阿拉伯数字
+next_main_number = '未识别的序号.'
+else:
+# 如果没有上一个主编号,默认从 '1.' 开始
+next_main_number = '1.'
+# 更新 current_key 和 last_main_number
 if current_key is not None:
 content_string = ''.join(current_content).strip()
 data[current_key] = data.get(current_key, '') + content_string
-current_key = new_key
-current_content = [line_content]
-append_newline = True
-else:
-append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
-elif dot_text_match:
-# 处理以点号开头但不带数字的情况,自动生成下一个一级序号
-temp_content = dot_text_match.group(1).strip()
-if last_main_number:
-# 生成下一个一级序号
-try:
-# 尝试将 last_main_number 转为整数并加1
-next_main_number = str(int(last_main_number) + 1) + '.'
-except ValueError:
-# 如果 last_main_number 不是纯数字,处理为中文序号或其他逻辑
-# 这里假设 last_main_number 是阿拉伯数字
-next_main_number = '未识别的序号.'
-else:
-# 如果没有上一个主编号,默认从 '1.' 开始
-next_main_number = '1.'
-# 更新 current_key 和 last_main_number
-if current_key is not None:
-content_string = ''.join(current_content).strip()
-data[current_key] = data.get(current_key, '') + content_string
-current_key = next_main_number
-current_content = [temp_content] # 不直接添加 '\n'
-last_main_number = next_main_number.rstrip('.')
-append_newline = True # 设置标志,下一次内容添加时需要 '\n'
-continue # 跳过进一步处理该行
+current_key = next_main_number
+current_content = [temp_content] # 不直接添加 '\n'
+last_main_number = next_main_number.rstrip('.')
+append_newline = True # 设置标志,下一次内容添加时需要 '\n'
+continue # 跳过进一步处理该行
+else:
+append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
+in_special_section)
 elif pure_number_match:
 # 处理不带点号的纯数字开头的情况
 new_key_candidate, line_content = pure_number_match.groups()
@@ -217,6 +239,8 @@ def parse_text_by_heading(text):
 temp_title = None # 重置临时标题

 # 开始新的标题
+if current_key_chinese is not None:
+data[current_key_chinese] = current_value_chinese
 if current_key is not None:
 content_string = ''.join(current_content).strip()
 data[current_key] = data.get(current_key, '') + content_string

@@ -231,13 +255,13 @@ def parse_text_by_heading(text):
 else:
 # 根据预先设置的标志决定是否执行这部分代码
 if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
-numbered_match = pattern_numbered.match(line_stripped)
-parentheses_match = pattern_parentheses.match(line_stripped)
-letter_match = pattern_letter.match(line_stripped)
-arabic_match = pattern_arabic.match(line_stripped)
+numbered_match = pattern_numbered.match(line_stripped) #一、
+parentheses_match = pattern_parentheses.match(line_stripped) #(一)
+letter_match = pattern_letter.match(line_stripped) #A. 内容
+# arabic_match = pattern_arabic.match(line_stripped) #1、内容

 # 判断当前行是否匹配了任何标题模式
-if numbered_match or parentheses_match or letter_match or arabic_match:
+if numbered_match or parentheses_match or letter_match:
 # 如果初始标题模式尚未设置,则记录当前匹配的标题模式
 if initial_heading_pattern is None:
 if numbered_match:

@@ -246,8 +270,7 @@ def parse_text_by_heading(text):
 initial_heading_pattern = 'parentheses'
 elif letter_match:
 initial_heading_pattern = 'letter'
-elif arabic_match:
-initial_heading_pattern = 'arabic'

 # 确定当前匹配的标题模式
 if numbered_match:
@@ -256,44 +279,52 @@ def parse_text_by_heading(text):
 current_heading_pattern = 'parentheses'
 elif letter_match:
 current_heading_pattern = 'letter'
-elif arabic_match:
-current_heading_pattern = 'arabic'

 # 如果当前标题模式与初始标题模式一致,创建新的键值对
 if current_heading_pattern == initial_heading_pattern:
+new_key_chinese = None
+new_value = None
 # 保存之前的 key 的内容
-if current_key is not None:
-content_string = ''.join(current_content).strip()
-data[current_key] = data.get(current_key, '') + content_string
-current_content=[]
 if current_key_chinese is not None:
 data[current_key_chinese] = current_value_chinese
-current_key_chinese = None
+if current_key is not None:
+content_string = ''.join(current_content).strip()
+# 如果已经有内容,添加换行符
+if data.get(current_key):
+data[current_key] = data.get(current_key) + '\n' + content_string
+else:
+data[current_key] = content_string
+current_content = []
 # 处理匹配到的标题
 if current_heading_pattern == 'numbered':
-current_key_chinese = numbered_match.group(1)
-current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
+new_key_chinese = numbered_match.group(1)
+new_value = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
 elif current_heading_pattern == 'parentheses':
-current_key_chinese = parentheses_match.group(1)
-current_value_chinese = line_stripped[parentheses_match.end():].lstrip('..、,').replace(' ', '')
+new_key_chinese = parentheses_match.group(1)
+new_value = line_stripped[parentheses_match.end():].lstrip('..、,').replace(' ', '')
 elif current_heading_pattern == 'letter':
-# 字母标题处理
 letter_key, letter_value = letter_match.groups()
 letter_to_chinese = {
 'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
 'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
 'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
 }
-current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
-current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
-elif current_heading_pattern == 'arabic':
-arabic_key, arabic_value = arabic_match.groups()
-current_key = arabic_key.replace('、', '.')
-current_content = [arabic_value.replace(' ', '')]
-append_newline = True
-last_main_number = current_key.rstrip('.')
-continue
+new_key_chinese = letter_to_chinese.get(letter_key, letter_key)
+new_value = letter_value.lstrip('..、,').replace(' ', '')
+# 比较数字大小
+current_number = get_current_number(current_key_chinese) if current_key_chinese else -1
+new_number = get_current_number(new_key_chinese)
+if new_number > current_number or current_key_chinese is None:
+# 设置新的键值对
+current_key_chinese = new_key_chinese
+current_value_chinese = new_value
+current_key=None
+else:
+# 如果新数字小于等于当前数字,将该行视为内容
+if line_stripped:
+append_newline = handle_content_append(current_content, line_stripped, append_newline,
+keywords, in_special_section)
 else:
 # 当前标题模式与初始模式不一致,将该行视为内容
 if line_stripped:
@@ -349,6 +380,12 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
 end_index = matches[-1].start()
 cleaned_text = cleaned_text[:end_index]

+lines = cleaned_text.split('\n')
+for j, line in enumerate(lines):
+if j == 0:
+lines[j] = '##' + line
+cleaned_text = '\n'.join(lines)
+
 all_pages_text.append(cleaned_text)

 # 合并所有页面的文本

@@ -12,9 +12,10 @@ def extract_text_by_page(file_path):
 page = reader.pages[page_num]
 text = page.extract_text()
 if text:
+print(text)
 cleaned_text = clean_page_content(text,common_header)
 # cleaned_text=text
-print(cleaned_text)
+# print(cleaned_text)
 print("-----------------"+str(page_num))
 result += cleaned_text
 # print(f"Page {page_num + 1} Content:\n{cleaned_text}")

@@ -119,7 +120,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
 if __name__ == '__main__':
 # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
 # file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
-file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
+file_path = r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
 # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
 # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
 # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
@@ -13,6 +13,8 @@ from openai import OpenAI
 import os

 file_write_lock = threading.Lock()
+@sleep_and_retry
+@limits(calls=2, period=1)
 def upload_file(file_path,output_folder=""):
 """
 Uploads a file to DashScope and returns the file ID.

@@ -10,7 +10,7 @@ from functools import wraps
 def rate_limiter():
 pass # 此函数仅用于限流控制,不执行任何操作

-# 创建一个共享的装饰器
+# 创建一个共享的装饰器,在被装饰函数 (func) 执行之前,先调用 rate_limiter()。
 def shared_rate_limit(func):
 @wraps(func)
 def wrapper(*args, **kwargs):
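A minimal sketch of the throttling added above, assuming the decorators come from the third-party ratelimit package (the import is not shown in this hunk); do_upload is a stand-in name, not code from this repo:

    from ratelimit import limits, sleep_and_retry

    @sleep_and_retry              # sleep until the window resets instead of raising
    @limits(calls=2, period=1)    # allow at most 2 calls per 1-second window
    def do_upload(path):
        print(f'uploading {path}')

    for p in ['a.pdf', 'b.pdf', 'c.pdf']:
        do_upload(p)              # the third call is delayed by the limiter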
@@ -1,17 +1,12 @@
 import re
-# 合并后的正则表达式
-begin_pattern = re.compile(
-r'(?<!见)' # 确保前面不是“见”
-r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
-r'[\u4e00-\u9fff、()()]*?' # 匹配允许的字符(中文、顿号、括号)
-r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
-r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
-r'[\u4e00-\u9fff、()()]*\s*$' # 继续匹配允许的字符直到行尾
-r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
-re.MULTILINE
-)
+line_stripped="""1.采购人:陕西省某单位
+2、采购代理机构:陕西坤硕项目管理有限公司
+3、供应商:响应招标并且符合招标文件规定资格条件和参加投标竞
+争的法人、其他组织或者自然人
+"""
+pure_number_match = re.match(r'^(\d+)([^.\d)()、].*)', line_stripped) # 不允许出现右括号
+if pure_number_match:
+print("yes")

 # 测试字符串
 test_strings = [

@@ -52,9 +47,9 @@ test_strings = [
 """
 ]

-for test_string in test_strings:
-match = re.search(begin_pattern, test_string)
-if match:
-print("Matched Content:", match.group()) # 输出匹配的内容
-else:
-print("No match found.")
+# for test_string in test_strings:
+# match = re.search(begin_pattern, test_string)
+# if match:
+# print("Matched Content:", match.group()) # 输出匹配的内容
+# else:
+# print("No match found.")
@@ -113,7 +113,9 @@ def combine_evaluation_standards(evaluation_method):
 # user_query_2 = (
 # "根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容(因素)不是这3个,则返回文档中给定的评分内容(因素)以及它的评分要求。请以json格式返回结果,不要回答有关形式、资格、响应性评审标准的内容")
 user_query_2 = (
-"""根据该文档中的评标办法表格,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,可能存在嵌套关系,但最内层键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。
+"""你是一个对招投标业务非常熟悉的专家。根据该文档中的评标办法表格,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,可能存在嵌套关系,但最内层键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。
+特殊情况处理:
+如果评分内容(因素)不是这三大项,则返回表格中给定的评分内容(因素)代替最外层键名'技术评分','商务评分','投标报价评分',并且返回它们的具体评分要求。

 要求与指南:
 1. 请首先定位评分细则的表格,不要回答有关资格审查的内容,也不要从评标办法正文中提取回答

@@ -121,6 +123,7 @@ def combine_evaluation_standards(evaluation_method):
 3. 如果该招标活动有多个包,则最外层键名为对应的包名,否则最外层键名为各大评分项
 4. 你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体
 5. '评分'的键值不能是一个范围数字,如'0-5分',应该是一个具体数字,如'5分',或者是一个定性的指标如'合格制'
+6. 若表格中商务和技术评分混合一起,请你手动将它们区别,商务评分通常包含'售后服务'、'质量保证'、'业绩'、'企业人员'、'企业信用'等商务因素。

 以下为示例输出,仅供格式参考:
 {
@@ -210,7 +210,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
 pass
 else:
 start_page = i
-if start_page is not None and re.search(end_pattern, cleaned_text):
+if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern,cleaned_text):
 condition = i > start_page
 if condition:
 is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页

@@ -362,7 +362,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
 (
 re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
 # Alternative begin pattern
-re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
+re.compile(r'评标办法正文|评标方法正文|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
 # Alternative end pattern
 ),
 (
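An illustrative check of the extra guard added to extract_pages above (not part of the commit; the patterns are simplified and the page text is invented): a page whose text matches both the begin and the end pattern, such as the chapter-heading page itself, no longer terminates the extraction early.

    import re
    begin_pattern = re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*评标)(?=.*办法)', re.MULTILINE)
    end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
    cleaned_text = '第三章 评标办法前附表'
    old_check = bool(re.search(end_pattern, cleaned_text))
    new_check = bool(re.search(end_pattern, cleaned_text)) and not re.search(begin_pattern, cleaned_text)
    print(old_check, new_check)   # True False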
@@ -591,17 +591,15 @@ if __name__ == "__main__":
 # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
 # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
 # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
-input_path=r"C:\Users\Administrator\Desktop\fsdownload\854fb19f-96d3-4b3e-b2ba-1a095344fd92\ztbfile.pdf"
-output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
-files=truncate_pdf_multiple(input_path,output_folder)
+input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
+output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all"
+# files=truncate_pdf_multiple(input_path,output_folder)
+# print(files)
 # selections = [4, 1] # 仅处理 selection 4、1
 # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
-for i in files:
-print(type(i))
-print(i)
-# selection = 1 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
-# generated_files = truncate_pdf_main(input_path, output_folder, selection)
-# print(generated_files)
+selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
+generated_files = truncate_pdf_main(input_path, output_folder, selection)
+print(generated_files)
 # print("生成的文件:", generated_files)
 end_time = time.time()
 print("耗时:" + str(end_time - start_time))
@@ -8,7 +8,7 @@ def convert_clause_to_json(file_path, output_folder, type=1):
 if type == 1:
 # 对于 type=1,使用原始字符串定义 start_word 和 end_pattern
 start_word = (
-r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
+r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
 r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
 )
 end_pattern = (

@@ -94,7 +94,7 @@ def convert_clause_to_json(file_path, output_folder, type=1):

 if __name__ == "__main__":
 # file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
-file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest4_tobidders_notice.pdf'
+file_path=r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all\招标02_tobidders_notice.pdf'
 output_folder = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'
 try:
 output_path = convert_clause_to_json(file_path,output_folder)
@@ -8,12 +8,15 @@ from flask_app.general.clean_pdf import clean_page_content, extract_common_heade
 from flask_app.general.format_change import docx2pdf
 from flask_app.general.merge_pdfs import merge_and_cleanup, merge_pdfs, merge_selected_pdfs_for_goods
 import concurrent.futures
+
+
 def get_global_logger(unique_id):
 if unique_id is None:
 return logging.getLogger() # 获取默认的日志器
 logger = logging.getLogger(unique_id)
 return logger

+
 # fitz库版本
 # def extract_common_header(pdf_path):
 # doc = fitz.open(pdf_path)

@@ -59,6 +62,7 @@ def convert_to_pdf(file_path):
 return docx2pdf(file_path)
 return file_path

+
 def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
 pdf_path = convert_to_pdf(file_path)
 result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
@@ -91,9 +95,15 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
 if start_page is None and re.search(begin_pattern, cleaned_text):
 if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
 start_page = i
-if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
-end_page = i
-break
+if start_page is not None:
+if output_suffix == "tobidders_notice":
+if re.search(end_pattern, cleaned_text) and i > start_page:
+end_page = i
+break
+else:
+if re.search(end_pattern, cleaned_text) and i > start_page and not re.search(begin_pattern,cleaned_text):
+end_page = i
+break
 return start_page, end_page


@@ -105,26 +115,33 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
 total_pages = len(pdf_document.pages) - 1 # 获取总页数

 if output_suffix == "tobidders_notice":
-exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
+exclusion_pattern = re.compile(
+r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
 start_page, mid_page, end_page = extract_pages_tobidders_notice(
 pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
 )
 if not start_page or not mid_page or not end_page:
 print(f"三次提取tobidders_notice均失败!!{pdf_path}")
-return "",""
+return "", ""
 # if start_page is None or end_page is None or mid_page is None:
 # print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
 # return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
-path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder, "tobidders_notice_part1")
-path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder, "tobidders_notice_part2")
+path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder,
+"tobidders_notice_part1")
+if mid_page != end_page:
+path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
+"tobidders_notice_part2")
+else:
+path2=path1
 return path1, path2

 else:
 # 原有的处理逻辑保持不变
-if output_suffix == "qualification1" or output_suffix=="procurement" or output_suffix=="evaluation_method":
-exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
-start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix)
+if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
+exclusion_pattern = re.compile(
+r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
+start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
+common_header, exclusion_pattern, output_suffix)
 # 针对 selection = 6 的特殊处理
 if output_suffix == "format":
 if start_page is None:
@@ -137,7 +154,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
 return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
 if start_page is None or end_page is None:
 print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
-return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page)
+return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page)
 elif output_suffix == "qualification1":
 truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
 return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)

@@ -151,6 +168,7 @@ def get_patterns_for_procurement():
 # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*',
 # re.MULTILINE)
 begin_pattern = re.compile(
+r'(?<!见)'
 r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
 r'[\u4e00-\u9fff、()()]*?' # 匹配允许的字符
 r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()()]*?要求[\u4e00-\u9fff、()()]*?\s*$|' # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求”

@@ -178,6 +196,8 @@ def get_patterns_for_evaluation_method():
 end_pattern = re.compile(
 r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
 return begin_pattern, end_pattern
+
+
 def get_patterns_for_notice():
 begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
 end_pattern = re.compile(

@@ -186,6 +206,8 @@ def get_patterns_for_notice():
 re.MULTILINE
 )
 return begin_pattern, end_pattern
+
+
 def get_patterns_for_notice_twice():
 begin_pattern = re.compile(
 r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE

@@ -197,6 +219,7 @@ def get_patterns_for_notice_twice():
 )
 return begin_pattern, end_pattern

+
 # def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
 # exclusion_pattern):
 # start_page = None
@@ -294,7 +317,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

 # 定义基础的 mid_pattern
 base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
-r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
+r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'

 # 合并基础模式和额外模式
 if additional_mid_pattern:

@@ -316,14 +339,14 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

 # 定义基础的 mid_pattern
 base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
-r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
+r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
 combined_mid_pattern = re.compile(
 rf'{base_mid_pattern}',
 re.MULTILINE
 )
 else:
 # 如果提供了固定的 end_pattern,则使用默认的 mid_pattern
-base_mid_pattern = r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则)\s*$' #可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
+base_mid_pattern = r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$' # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
 combined_mid_pattern = re.compile(
 rf'{base_mid_pattern}',
 re.MULTILINE

@@ -332,7 +355,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

 # 识别中间页
 if start_page is not None and mid_page is None and combined_mid_pattern:
-if (start_page+1==i) and re.search(local_begin_pattern,cleaned_text):
+if (start_page + 1 == i) and re.search(local_begin_pattern, cleaned_text):
 continue
 if re.search(combined_mid_pattern, cleaned_text):
 mid_page = i

@@ -355,7 +378,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
 if not (start_page and mid_page and end_page):
 print(f"第二次尝试 tobidders_notice!{pdf_path}")
 pdf_document = PdfReader(pdf_path)
-start_page,mid_page,end_page=extract_pages_twice_tobidders_notice(pdf_document,common_header,begin_page)
+start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
 if start_page and end_page and mid_page:
 return start_page, mid_page, end_page
 else:
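Illustrative only (the sample lines are invented): the widened mid_pattern above now also recognizes a "名词解释" section as the start of the 须知正文 body.

    import re
    base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
                       r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
    mid = re.compile(base_mid_pattern, re.MULTILINE)
    for line in ['一、说明', '一、总则', '一、名词解释']:
        print(bool(mid.search(line)))   # True for all three; '名词解释' was missed before this commit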
@@ -375,22 +398,24 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

 return start_page, mid_page, end_page

-def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
-output_suffix="tobidders_notice"
+
+def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
+output_suffix = "tobidders_notice"
 begin_pattern = re.compile(
-r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
+r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',
+re.MULTILINE
 )
 end_pattern = re.compile(
-r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)',re.MULTILINE # 捕获中文部分
+r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE # 捕获中文部分
 )
-exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
+exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词

 exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')

 # 提取第一部分
 start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
 if start_page1 is None or end_page1 is None:
-return "", "",""
+return "", "", ""

 # 提取第二部分
 start_page2 = end_page1
@@ -436,7 +461,6 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
 r'^(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$',
 re.MULTILINE
 )
-
 # 结束匹配模式 - 章节标题
 end_pattern_chapter = re.compile(
 r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',

@@ -446,7 +470,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
 print("第二次尝试:匹配附件")
 start_page = None
 end_page = None
-include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查"]
+include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"]
 exclude_keywords = ["声明函", "承诺函"]

 # 从指定的开始页开始遍历

@@ -493,9 +517,10 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
 return start_page, end_page


-def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
+def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
 try:
-exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
+exclusion_pattern = re.compile(
+r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
 pdf_document = PdfReader(pdf_path)
 patterns = None
 start_page = None

@@ -506,13 +531,13 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
 elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
 patterns = [get_patterns_for_evaluation_method()]
 elif output_suffix == "notice":
-patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
+patterns = [get_patterns_for_notice(), get_patterns_for_notice_twice()]
 elif output_suffix == "qualification1":
 start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
 if patterns:
 for pattern_pair in patterns:
 start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
-common_header,exclusion_pattern, output_suffix)
+common_header, exclusion_pattern, output_suffix)
 if start_page is not None and end_page is not None:
 break

@@ -568,10 +593,12 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
 print(f"Error in save_extracted_pages: {e}")
 return "" # 返回空字符串

+
 def get_start_and_common_header(input_path):
 common_header = extract_common_header(input_path)
 last_begin_index = 0
-begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',re.MULTILINE)
+begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
+re.MULTILINE)
 pdf_document = PdfReader(input_path)
 for i, page in enumerate(pdf_document.pages):
 if i > 10:

@@ -581,8 +608,8 @@ def get_start_and_common_header(input_path):
 cleaned_text = clean_page_content(text, common_header)
 if begin_pattern.search(cleaned_text) and not re.search(r'目\s*录', cleaned_text):
 last_begin_index = i # 更新第一个匹配的索引,页码从0开始
-return common_header,last_begin_index
-return common_header,last_begin_index
+return common_header, last_begin_index
+return common_header, last_begin_index


 def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
@@ -631,11 +658,13 @@ def process_input(input_path, output_folder, selection, output_suffix):

 # 根据选择设置对应的模式和结束模式
 if selection == 1:
-begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
+begin_pattern = re.compile(
+r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
 end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
 local_output_suffix = "notice"
 elif selection == 2:
-begin_pattern = re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
+begin_pattern = re.compile(
+r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
 end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
 local_output_suffix = "evaluation_method"
 elif selection == 3:

@@ -650,7 +679,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
 local_output_suffix = "tobidders_notice"
 elif selection == 5:
 begin_pattern = re.compile(
-r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') #包头中有一章'采购相关说明'
+r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') # 包头中有一章'采购相关说明'
 end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
 local_output_suffix = "procurement"

@@ -681,7 +710,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
 return ['']


-def truncate_pdf_multiple(pdf_path, output_folder,logger):
+def truncate_pdf_multiple(pdf_path, output_folder, logger):
 base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
 truncate_files = []

@@ -717,11 +746,12 @@ def truncate_pdf_multiple(pdf_path, output_folder,logger):
 # 合并失败,添加空字符串
 truncate_files.append("")
 logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
-logger.info("已截取文件路径"+str(truncate_files))
+logger.info("已截取文件路径" + str(truncate_files))
 return truncate_files

-#小解析,只需要前三章内容
-def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="123"):
+
+# 小解析,只需要前三章内容
+def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
 """
 处理 PDF 文件,选择指定的 selections,并合并结果。

@@ -740,7 +770,9 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
 # 使用 ThreadPoolExecutor 进行多线程处理
 with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
 # 提交所有任务并保持 selection 顺序
-future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default") for selection in selections}
+future_to_selection = {
+selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
+for selection in selections}

 # 按 selection 顺序收集结果
 for selection in selections:
@ -769,22 +801,23 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
        logger.warning(f"合并失败,没有生成merged_baseinfo for {pdf_path}")
    return truncate_files


# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章

#ztbfile.pdf少资格评审 包头少符合性评审
# ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
    logger=get_global_logger("123")
    logger = get_global_logger("123")
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2020-安徽-安徽省生态环境厅电梯采购.pdf"
    input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
    output_folder=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2"
    output_folder = r"C:\Users\Administrator\Desktop\招标文件-采购类\all"
    # files = truncate_pdf_multiple(input_path, output_folder,logger)
    files = truncate_pdf_multiple(input_path, output_folder,logger)
    # selections = [3,5]
    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
    # print(files)
    print(files)
    selection = 2# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
    # selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print(generated_files)
    # print(generated_files)
@ -134,13 +134,14 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
        start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
        start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*)$'
    else:
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        # print(text)
    else:
        raise ValueError("Unsupported file format")
    parsed_data = parse_text_by_heading(text)
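The widened start_word pattern now also accepts a leading '名词解释' heading. A quick self-contained check; the sample headings below are made up for illustration:

import re

start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
for heading in ["一、说明", "1.总则", "(一)名词解释", "第二章 投标人须知"]:
    print(heading, bool(re.match(start_word, heading)))
# Expected: the first three headings match; the chapter heading on the last line does not.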
@ -174,13 +175,14 @@ def process_folder(input_folder, output_folder):
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")

#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf
#TODO:19、竞争性磋商响应文件的加密 暂时没处理'19'缺失的情况
#TODO:2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf
#TODO: .不予受理的情形 ,‘.后面必须跟中文或者空格’
if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
    file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
    # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder,1)
@ -224,13 +224,14 @@ def combine_evaluation_standards(truncate_file):
    # user_query = "根据该文档中的评标办法前附表或者评分标准表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,外层键名分别为'技术评分','商务评分','投标报价'。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果,如果该采购活动有多个包,则最外层键名为对应的包名。请不要回答有关资格审查的内容"
    user_query = (
        """
    根据该文档中的评分标准表格中的内容,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,最外层键名分别是'技术评分','商务评分','投标报价评分',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。
    你是一个对招投标业务非常熟悉的专家。根据该文档中的评分标准表格中的内容,请你列出该文件的技术评分,商务评分,投标报价评审以及它们对应的具体评分要求,请以json格式返回结果,最外层键名分别是'技术评分','商务评分','投标报价评分',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回表格中给定的评分内容(因素)以及它们的具体评分要求。
    要求与指南:
    1.请首先定位评分细则的表格,不要回答有关资格审查、符合性审查的内容,也不要从评标办法正文中(表格外)提取回答
    2.若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如"技术评分":"本项目无技术评分项"
    2.若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如{"技术评分":"本项目无技术评分项"}
    3. 如果该招标活动有多个包,则最外层键名为对应的包名,否则不需要
    4.你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体。
    5. '评分'的键值不能是一个范围数字,如'0-5分',应该是一个具体数字,如'5分',或者是一个定性的指标如'合格制'
    6. 若表格中商务和技术评分混合一起,请你手动将它们区别,商务评分通常包含'售后服务'、'质量保证'、'业绩'、'企业人员'、'企业信用'等商务因素。

    以下为示例输出,仅供格式参考:
    {
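The example output in this hunk is cut off at the hunk boundary. Per the rules above, the nesting the prompt asks for looks roughly like the following; the keys and values are illustrative only and not taken from any real tender file:

# Illustrative only: the structure requested by the prompt, with made-up factors and scores.
expected_shape = {
    "技术评分": {
        "实施方案": [
            {"评分": "5分", "要求": "方案完整、可行的得5分"}
        ]
    },
    "商务评分": {
        "售后服务": [
            {"评分": "3分", "要求": "提供本地化售后服务承诺的得3分"}
        ],
        "备注": "商务评分合计20分"
    },
    "投标报价评分": {
        "价格分": [
            {"评分": "30分", "要求": "满足招标文件要求且报价最低的为评标基准价"}
        ]
    }
}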
@ -311,7 +312,7 @@ def combine_evaluation_standards(truncate_file):

if __name__ == "__main__":
    start_time=time.time()
    truncate_file=r"C:\Users\Administrator\Desktop\fsdownload\1ca1d27d-fc21-4697-8075-9027103df030\ztbfile_evaluation_method.pdf"
    truncate_file=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2\2024-新疆-塔城地区公安局食药环分局快检实验室项目_evaluation_method.pdf"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件(统计局智能终端二次招标)_evaluation_method.pdf"
    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新(W改)_evaluation_method.pdf"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\2d481945-1f82-45a5-8e56-7fafea4a7793\\ztbfile_evaluation_method.pdf"