12.9 修复解析bug

This commit is contained in:
zy123 2024-12-09 17:38:01 +08:00
parent e5e63e400b
commit c0c7871767
13 changed files with 258 additions and 178 deletions

View File

@@ -76,7 +76,7 @@ def clean_page_content(text, common_header):
    text = re.sub(r'^第\d+页\s*', '', text)
    # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码,仅当紧跟非数字字符时 投标人须知这块, 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
-   text = re.sub(r'^\s*\/\s*(共\s*)?\d+\s*(页)?\s*', '', text)  #删除/123 /共123 /共123页 /123页
+   text = re.sub(r'^\s*\/?\s*(共\s*)?\d+\s*(页)?\s*', '', text)  #删除/123 /共123 /共123页 /123页
    text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text)  # 删除形如 '—2—' 或 '-2-' 的页码
    return text
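The only functional change in this hunk makes the leading slash optional, so header fragments whose "/" was already stripped are still removed. A quick standalone check (not part of the commit; the sample strings are made up):

import re

old_pat = r'^\s*\/\s*(共\s*)?\d+\s*(页)?\s*'
new_pat = r'^\s*\/?\s*(共\s*)?\d+\s*(页)?\s*'

for s in ["/ 共 129 页 投标人须知", "共129页 投标人须知"]:
    # 新正则把斜杠设为可选,因此没有 '/' 的 '共…页' 片段也能被清掉
    print(repr(re.sub(old_pat, '', s)), "->", repr(re.sub(new_pat, '', s)))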

View File

@@ -4,7 +4,7 @@ import re
from functools import cmp_to_key
from flask_app.general.json_utils import clean_json_string
- from flask_app.general.通义千问long import upload_file, qianwen_long
+ from flask_app.general.通义千问long import upload_file, qianwen_long_stream
def compare_headings(a, b):
@@ -321,7 +321,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        #     }
        #     """,
        1:"""
-       该招标文件中对投标文件的要求有哪些请以json格式给我返回结果外层键名为文中提到的有关投标文件的若干要求对于各子要求你可以用嵌套键值对组织回答其中嵌套键名为你对相关子要求的总结或是原文中的标题而最内层的键值应该完全与原文内容保持一致若存在多个并列内容那么键值为一个列表列表中包含若干描述该要求的字符串要求键值与原文内容一致不得总结删减示例如下仅供格式参考
+       该招标文件中对投标文件的要求有哪些请以json格式给我返回结果外层键名为文中提到的有关投标文件的若干要求对于各子要求你可以用嵌套键值对组织回答其中嵌套键名为你对相关子要求的总结或是原文中的标题而最内层的键值应该完全与原文内容保持一致若存在多个并列内容那么键值为一个列表列表中包含若干描述该要求的字符串要求键值与原文内容一致不得总结删减
+       其他要求:
+       1.若原文中为对应内容有表格表述请将该表格用markdown语法返回每行写在键值中的一个字符串中
+       示例如下仅供格式参考
        {
            "编写要求":"编写要求xxx",
            "格式要求":{
@@ -355,7 +359,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        #     }
        #     """,
        2:"""
-       该招标文件中开评定标要求(或磋商流程内容)是什么请以json格式给我返回结果外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求对于各子要求你可以用嵌套键值对组织回答其中嵌套键名为你对相关子要求的总结或是原文中的标题而最内层的键值应该完全与原文内容保持一致若存在多个并列内容那么键值为一个列表列表中包含若干描述该要求的字符串要求键值与原文内容一致不得总结删减示例如下仅供格式参考
+       该招标文件中开评定标要求(或磋商流程内容)是什么请以json格式给我返回结果外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求对于各子要求你可以用嵌套键值对组织回答其中嵌套键名为你对相关子要求的总结或是原文中的标题而最内层的键值应该完全与原文内容保持一致若存在多个并列内容那么键值为一个列表列表中包含若干描述该要求的字符串要求键值与原文内容一致不得总结删减
+       其他要求:
+       1.若原文中为对应内容有表格表述请将该表格用markdown语法返回每行写在键值中的一个字符串中
+       示例如下仅供格式参考
        {
            "开标":"招标文件关于项目开标的要求",
            "开标异议":"招标文件中关于开标异议的项",
@@ -387,7 +395,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
    # 调用大模型并处理响应
    try:
-       res = qianwen_long(file_id, user_query)
+       res = qianwen_long_stream(file_id, user_query)
        cleaned_res = clean_json_string(res)
        return cleaned_res
    except Exception as e:
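The prompt hunks above add the same new rule: any table in the source should come back as markdown, one row per string in a list. Purely for illustration (this dict is invented, not project output), the requested shape looks roughly like:

# 假设性示例:新增要求下,表格内容以 markdown 行的列表形式出现在键值中(非项目实际输出)
example_requirements = {
    "投标文件的组成": {
        "份数要求": [
            "| 文件 | 正本 | 副本 |",
            "| ---- | ---- | ---- |",
            "| 投标文件 | 1份 | 4份 |",
        ]
    }
}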

View File

@@ -51,15 +51,25 @@ def parse_text_by_heading(text):
    # 定义所有需要的正则表达式模式
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # 一、
    pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')  # (一)
-   pattern_letter_initial = re.compile(r'^([A-Z])\.?\s*(.*)$')  # 初始扫描时匹配 A内容 或 A. 内容
-   pattern_letter = re.compile(r'^([A-Z])\.\s*(.*)$')  # 主循环中严格匹配 A. 内容
-   pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')  # 1、内容
+   pattern_letter_initial = re.compile(r'^([A-Z])[..、]\s*(.*)$')  # 初始扫描时匹配 A内容 或 A. 内容
+   pattern_letter = re.compile(r'^([A-Z])[..、]\s*(.*)$')  # 主循环中严格匹配 A. 内容
+   # pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')  # 1、内容
    initial_heading_pattern = None
    special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:','雷同认定','包括以下内容']  # 定义特殊章节关键词
    in_special_section = False  # 标志是否在特殊章节中
    lines = text.split('\n')

+   def get_current_number(key_chinese):
+       chinese_to_number = {
+           '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
+           '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
+           '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
+       }
+       if not key_chinese:
+           return -1
+       return chinese_to_number.get(key_chinese, -1)

    def check_year_pattern(line):
        # 检查是否为年份模式,如 '2014年'
        line_no_spaces = line.replace(' ', '')
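A quick sanity check of the new get_current_number helper; the standalone function below simply restates the added code so it can run on its own, and the test values are made up:

def get_current_number(key_chinese):
    # 与上方新增代码等价的独立版本,便于单独运行
    chinese_to_number = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
        '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
    }
    if not key_chinese:
        return -1
    return chinese_to_number.get(key_chinese, -1)

assert get_current_number('三') == 3
assert get_current_number('十二') == 12
assert get_current_number(None) == -1   # 无当前标题时返回 -1
assert get_current_number('甲') == -1   # 未收录的序号同样返回 -1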
@@ -78,21 +88,26 @@ def parse_text_by_heading(text):
            return True
        return False

-   # 预先扫描前5行检查是否有匹配任何标题模式
+   # 预先扫描前5行检查是否有匹配任何标题模式 '一、总则' 这种
    first_five_lines = lines[:5]
    has_initial_heading_patterns = False
    for line in first_five_lines:
        line_stripped = line.strip().replace('', '.')
+       if line_stripped.startswith("##"):
+           line_stripped = line_stripped[2:]  # Remove "##"
        if (pattern_numbered.match(line_stripped) or
                pattern_parentheses.match(line_stripped) or
-               pattern_letter_initial.match(line_stripped) or
-               pattern_arabic.match(line_stripped)):
+               pattern_letter_initial.match(line_stripped)
+               ):
            has_initial_heading_patterns = True
            break

    for i, line in enumerate(lines):
+       in_double_hash_mode = False
        line_stripped = line.strip().replace('', '.')
+       if line_stripped.startswith("##"):
+           in_double_hash_mode = True
+           line_stripped = line_stripped[2:]  # Remove "##"
        # 如果在一个章节内,跳过年份模式的行
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
@@ -115,10 +130,10 @@ def parse_text_by_heading(text):
        dot_match = re.match(r'^[.](\d+(?:[.]\d+)*)\s*(.+)$', line_stripped)
        # 匹配以点号开头但不带数字的情况,例如 '.投标文件的制作和签署'
        dot_text_match = re.match(r'^[.]\s*(\D.+)$', line_stripped)
        # 匹配不带点号的纯数字开头的情况,例如 '27xxxxx'
-       pure_number_match = re.match(r'^(\d+)([^.\d)(].*)', line_stripped)  # 不允许出现右括号
+       pure_number_match = re.match(r'^(\d+)([^.\d)(号条款].*)', line_stripped)  # 不允许出现右括号

        if match:
            new_key, line_content = match.groups()
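The widened character class means lines such as "27号…" or "19条款…" are no longer promoted to numeric headings. A small standalone comparison (not part of the commit), with both patterns copied as they appear in this hunk and made-up sample lines:

import re

old_pure = re.compile(r'^(\d+)([^.\d)(].*)')
new_pure = re.compile(r'^(\d+)([^.\d)(号条款].*)')

for s in ['27投标文件的密封', '27号令相关规定', '19条款内容']:
    print(s, bool(old_pure.match(s)), bool(new_pure.match(s)))
# 旧正则会把 '27号令…'、'19条款…' 误判为 '27'、'19' 标题;新正则将这些行当作正文内容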
@@ -132,7 +147,7 @@ def parse_text_by_heading(text):
            # 保存中文标题的内容
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
-               current_key_chinese = None
+               # current_key_chinese = None

            # 处理临时标题与新主标题的关联
            if temp_title:
@@ -156,45 +171,52 @@ def parse_text_by_heading(text):
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
        elif dot_match:
-           # 处理以点号开头并带有数字的情况
-           sub_number, line_content = dot_match.groups()
-           sub_number = sub_number.replace('', '.')
-           if last_main_number:
-               new_key = f"{last_main_number}.{sub_number}"
-               if current_key is not None:
-                   content_string = ''.join(current_content).strip()
-                   data[current_key] = data.get(current_key, '') + content_string
-               current_key = new_key
-               current_content = [line_content]
-               append_newline = True
-           else:
-               append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
+           if in_double_hash_mode:
+               # 处理以点号开头并带有数字的情况
+               sub_number, line_content = dot_match.groups()
+               sub_number = sub_number.replace('', '.')
+               if last_main_number:
+                   new_key = f"{last_main_number}.{sub_number}"
+                   if current_key is not None:
+                       content_string = ''.join(current_content).strip()
+                       data[current_key] = data.get(current_key, '') + content_string
+                   current_key = new_key
+                   current_content = [line_content]
+                   append_newline = True
+               else:
+                   append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
+           else:
+               append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
+                                                      in_special_section)
        elif dot_text_match:
-           # 处理以点号开头但不带数字的情况,自动生成下一个一级序号
-           temp_content = dot_text_match.group(1).strip()
-           if last_main_number:
-               # 生成下一个一级序号
-               try:
-                   # 尝试将 last_main_number 转为整数并加1
-                   next_main_number = str(int(last_main_number) + 1) + '.'
-               except ValueError:
-                   # 如果 last_main_number 不是纯数字,处理为中文序号或其他逻辑
-                   # 这里假设 last_main_number 是阿拉伯数字
-                   next_main_number = '未识别的序号.'
-           else:
-               # 如果没有上一个主编号,默认从 '1.' 开始
-               next_main_number = '1.'
-           # 更新 current_key 和 last_main_number
-           if current_key is not None:
-               content_string = ''.join(current_content).strip()
-               data[current_key] = data.get(current_key, '') + content_string
-           current_key = next_main_number
-           current_content = [temp_content]  # 不直接添加 '\n'
-           last_main_number = next_main_number.rstrip('.')
-           append_newline = True  # 设置标志,下一次内容添加时需要 '\n'
-           continue  # 跳过进一步处理该行
+           if in_double_hash_mode:
+               # 处理以点号开头但不带数字的情况,自动生成下一个一级序号
+               temp_content = dot_text_match.group(1).strip()
+               if last_main_number:
+                   # 生成下一个一级序号
+                   try:
+                       # 尝试将 last_main_number 转为整数并加1
+                       next_main_number = str(int(last_main_number) + 1) + '.'
+                   except ValueError:
+                       # 如果 last_main_number 不是纯数字,处理为中文序号或其他逻辑
+                       # 这里假设 last_main_number 是阿拉伯数字
+                       next_main_number = '未识别的序号.'
+               else:
+                   # 如果没有上一个主编号,默认从 '1.' 开始
+                   next_main_number = '1.'
+               # 更新 current_key 和 last_main_number
+               if current_key is not None:
+                   content_string = ''.join(current_content).strip()
+                   data[current_key] = data.get(current_key, '') + content_string
+               current_key = next_main_number
+               current_content = [temp_content]  # 不直接添加 '\n'
+               last_main_number = next_main_number.rstrip('.')
+               append_newline = True  # 设置标志,下一次内容添加时需要 '\n'
+               continue  # 跳过进一步处理该行
+           else:
+               append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
+                                                      in_special_section)
        elif pure_number_match:
            # 处理不带点号的纯数字开头的情况
            new_key_candidate, line_content = pure_number_match.groups()
@@ -217,6 +239,8 @@ def parse_text_by_heading(text):
                temp_title = None  # 重置临时标题

            # 开始新的标题
+           if current_key_chinese is not None:
+               data[current_key_chinese] = current_value_chinese
            if current_key is not None:
                content_string = ''.join(current_content).strip()
                data[current_key] = data.get(current_key, '') + content_string
@@ -231,13 +255,13 @@ def parse_text_by_heading(text):
        else:
            # 根据预先设置的标志决定是否执行这部分代码
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
-               numbered_match = pattern_numbered.match(line_stripped)
-               parentheses_match = pattern_parentheses.match(line_stripped)
-               letter_match = pattern_letter.match(line_stripped)
-               arabic_match = pattern_arabic.match(line_stripped)
+               numbered_match = pattern_numbered.match(line_stripped)  #一、
+               parentheses_match = pattern_parentheses.match(line_stripped)  #(一)
+               letter_match = pattern_letter.match(line_stripped)  #A. 内容
+               # arabic_match = pattern_arabic.match(line_stripped)  #1、内容

                # 判断当前行是否匹配了任何标题模式
-               if numbered_match or parentheses_match or letter_match or arabic_match:
+               if numbered_match or parentheses_match or letter_match:
                    # 如果初始标题模式尚未设置,则记录当前匹配的标题模式
                    if initial_heading_pattern is None:
                        if numbered_match:
@@ -246,8 +270,7 @@ def parse_text_by_heading(text):
                            initial_heading_pattern = 'parentheses'
                        elif letter_match:
                            initial_heading_pattern = 'letter'
-                       elif arabic_match:
-                           initial_heading_pattern = 'arabic'

                    # 确定当前匹配的标题模式
                    if numbered_match:
@@ -256,44 +279,52 @@ def parse_text_by_heading(text):
                        current_heading_pattern = 'parentheses'
                    elif letter_match:
                        current_heading_pattern = 'letter'
-                   elif arabic_match:
-                       current_heading_pattern = 'arabic'

                    # 如果当前标题模式与初始标题模式一致,创建新的键值对
                    if current_heading_pattern == initial_heading_pattern:
+                       new_key_chinese = None
+                       new_value = None
                        # 保存之前的 key 的内容
-                       if current_key is not None:
-                           content_string = ''.join(current_content).strip()
-                           data[current_key] = data.get(current_key, '') + content_string
-                           current_content=[]
                        if current_key_chinese is not None:
                            data[current_key_chinese] = current_value_chinese
-                           current_key_chinese = None
+                       if current_key is not None:
+                           content_string = ''.join(current_content).strip()
+                           # 如果已经有内容,添加换行符
+                           if data.get(current_key):
+                               data[current_key] = data.get(current_key) + '\n' + content_string
+                           else:
+                               data[current_key] = content_string
+                           current_content = []

                        # 处理匹配到的标题
                        if current_heading_pattern == 'numbered':
-                           current_key_chinese = numbered_match.group(1)
-                           current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
+                           new_key_chinese = numbered_match.group(1)
+                           new_value = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
                        elif current_heading_pattern == 'parentheses':
-                           current_key_chinese = parentheses_match.group(1)
-                           current_value_chinese = line_stripped[parentheses_match.end():].lstrip('..、,').replace(' ', '')
+                           new_key_chinese = parentheses_match.group(1)
+                           new_value = line_stripped[parentheses_match.end():].lstrip('..、,').replace(' ', '')
                        elif current_heading_pattern == 'letter':
+                           # 字母标题处理
                            letter_key, letter_value = letter_match.groups()
                            letter_to_chinese = {
                                'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
                                'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
                                'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                            }
-                           current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
-                           current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
-                       elif current_heading_pattern == 'arabic':
-                           arabic_key, arabic_value = arabic_match.groups()
-                           current_key = arabic_key.replace('', '.')
-                           current_content = [arabic_value.replace(' ', '')]
-                           append_newline = True
-                           last_main_number = current_key.rstrip('.')
-                           continue
+                           new_key_chinese = letter_to_chinese.get(letter_key, letter_key)
+                           new_value = letter_value.lstrip('..、,').replace(' ', '')
+                       # 比较数字大小
+                       current_number = get_current_number(current_key_chinese) if current_key_chinese else -1
+                       new_number = get_current_number(new_key_chinese)
+                       if new_number > current_number or current_key_chinese is None:
+                           # 设置新的键值对
+                           current_key_chinese = new_key_chinese
+                           current_value_chinese = new_value
+                           current_key=None
+                       else:
+                           # 如果新数字小于等于当前数字,将该行视为内容
+                           if line_stripped:
+                               append_newline = handle_content_append(current_content, line_stripped, append_newline,
+                                                                      keywords, in_special_section)
                    else:
                        # 当前标题模式与初始模式不一致,将该行视为内容
                        if line_stripped:
@@ -349,6 +380,12 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
            end_index = matches[-1].start()
            cleaned_text = cleaned_text[:end_index]

+       lines = cleaned_text.split('\n')
+       for j, line in enumerate(lines):
+           if j == 0:
+               lines[j] = '##' + line
+       cleaned_text = '\n'.join(lines)
        all_pages_text.append(cleaned_text)

    # 合并所有页面的文本
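This producer-side change tags the first line of every extracted page with "##"; parse_text_by_heading (above) strips the tag and sets in_double_hash_mode, so the dot-heading repair (dot_match / dot_text_match) only fires on page-initial lines. A tiny standalone sketch of the marking step on a fabricated page, not the project function itself:

page_text = "2.投标文件的递交\n密封要求:xxx\n3.开标"
lines = page_text.split('\n')
if lines:
    lines[0] = '##' + lines[0]      # 仅给每页第一行打上 '##' 标记
marked = '\n'.join(lines)
print(marked.split('\n')[0])        # '##2.投标文件的递交'
# 解析时遇到以 '##' 开头的行才会进入 in_double_hash_mode,按“页首被截断的标题”处理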

View File

@@ -12,9 +12,10 @@ def extract_text_by_page(file_path):
        page = reader.pages[page_num]
        text = page.extract_text()
        if text:
+           print(text)
            cleaned_text = clean_page_content(text,common_header)
            # cleaned_text=text
-           print(cleaned_text)
+           # print(cleaned_text)
            print("-----------------"+str(page_num))
            result += cleaned_text
            # print(f"Page {page_num + 1} Content:\n{cleaned_text}")
@@ -119,7 +120,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
-   file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
+   file_path = r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"

View File

@@ -13,6 +13,8 @@ from openai import OpenAI
import os

file_write_lock = threading.Lock()

+@sleep_and_retry
+@limits(calls=2, period=1)
def upload_file(file_path,output_folder=""):
    """
    Uploads a file to DashScope and returns the file ID.
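These decorators come from the ratelimit package: limits(calls=2, period=1) raises once more than two calls land in a one-second window, and sleep_and_retry turns that into a blocking wait. A minimal standalone sketch of the same combination, using a dummy function rather than the real upload_file:

import time
from ratelimit import limits, sleep_and_retry

@sleep_and_retry
@limits(calls=2, period=1)          # 每秒最多 2 次调用,超出则休眠后重试
def dummy_upload(path):
    return f"uploaded {path}"

start = time.time()
for i in range(5):
    dummy_upload(f"file_{i}.pdf")
print(f"5 次调用耗时约 {time.time() - start:.1f}s")  # 大致 2s 左右,因为被限流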

View File

@@ -10,7 +10,7 @@ from functools import wraps
def rate_limiter():
    pass  # 此函数仅用于限流控制,不执行任何操作

-# 创建一个共享的装饰器
+# 创建一个共享的装饰器,在被装饰函数 (func) 执行之前,先调用 rate_limiter()。
def shared_rate_limit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
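The updated comment spells out the mechanism: every function wrapped with shared_rate_limit first calls rate_limiter(), so all of them draw from one shared quota. A sketch of that wiring under assumed limits (the calls/period values below are placeholders, not taken from this file):

from functools import wraps
from ratelimit import limits, sleep_and_retry

@sleep_and_retry
@limits(calls=4, period=1)   # 假设的共享配额,实际数值以项目代码为准
def rate_limiter():
    pass  # 此函数仅用于限流控制,不执行任何操作

def shared_rate_limit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        rate_limiter()           # 先消耗共享配额,再执行被装饰函数
        return func(*args, **kwargs)
    return wrapper

@shared_rate_limit
def ask_model(prompt):
    return f"answer to {prompt}"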

View File

@@ -1,17 +1,12 @@
import re

-# 合并后的正则表达式
-begin_pattern = re.compile(
-   r'(?<!见)'  # 确保前面不是“见”
-   r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # 匹配“第X章”或“第X部分”
-   r'[\u4e00-\u9fff、()]*?'  # 匹配允许的字符(中文、顿号、括号)
-   r'(?=.*(?:磋商|谈判|评标|评定|评审))'  # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
-   r'(?=.*(?:办法|方法))'  # 确保包含“办法”或“方法”
-   r'[\u4e00-\u9fff、()]*\s*$'  # 继续匹配允许的字符直到行尾
-   r'|\s*评标(办法|方法)前附表\s*$',  # 或匹配“评标办法前附表”或“评标方法前附表”
-   re.MULTILINE
-)
+line_stripped="""1.采购人:陕西省某单位
+2采购代理机构陕西坤硕项目管理有限公司
+3供应商响应招标并且符合招标文件规定资格条件和参加投标竞
+争的法人其他组织或者自然人
+"""
+pure_number_match = re.match(r'^(\d+)([^.\d)()、].*)', line_stripped)  # 不允许出现右括号
+if pure_number_match:
+   print("yes")

# 测试字符串
test_strings = [
@@ -52,9 +47,9 @@ test_strings = [
    """
]

-for test_string in test_strings:
-   match = re.search(begin_pattern, test_string)
-   if match:
-       print("Matched Content:", match.group())  # 输出匹配的内容
-   else:
-       print("No match found.")
+# for test_string in test_strings:
+#     match = re.search(begin_pattern, test_string)
+#     if match:
+#         print("Matched Content:", match.group())  # 输出匹配的内容
+#     else:
+#         print("No match found.")

View File

@@ -113,14 +113,17 @@ def combine_evaluation_standards(evaluation_method):
    # user_query_2 = (
    #     "根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容因素不是这3个则返回文档中给定的评分内容因素以及它的评分要求。请以json格式返回结果不要回答有关形式、资格、响应性评审标准的内容")
    user_query_2 = (
-       """根据该文档中的评标办法表格请你列出该文件的技术评分商务评分投标报价评审以及它们对应的具体评分要求请以json格式返回结果最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,可能存在嵌套关系,但最内层键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。
+       """你是一个对招投标业务非常熟悉的专家。根据该文档中的评标办法表格请你列出该文件的技术评分商务评分投标报价评审以及它们对应的具体评分要求请以json格式返回结果最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项,外层键名为各评审因素,可能存在嵌套关系,但最内层键值为一个列表,列表中包含若干(可为一)描述该评审因素的评分及要求的字典,内层键名分别是'评分'和'要求',若无评分,可删去'评分'键值对,'要求'中说明了该评审因素的评分标准;若这三大项评分中存在其他信息,则在相应评分大块内部新增键名'备注'存放该信息,键值为具体的要求,否则不需要。如果评分内容(因素)不是这三大项,则返回文档中给定的评分内容(因素)以及它们的具体评分要求。
+       特殊情况处理:
+       如果评分内容因素不是这三大项则返回表格中给定的评分内容因素代替最外层键名'技术评分','商务评分','投标报价评分'并且返回它们的具体评分要求
        要求与指南
        1. 请首先定位评分细则的表格不要回答有关资格审查的内容也不要从评标办法正文中提取回答
        2. 若大项的'xx评分'要求未在文中说明则键名'xx评分'的键值设为'本项目无xx评分项'例如"技术评分":"本项目无技术评分项"
        3. 如果该招标活动有多个包则最外层键名为对应的包名,否则最外层键名为各大评分项
        4. 你无需将表格的单元格内的内容进行拆分需要将它视为一个整体
        5. '评分'的键值不能是一个范围数字'0-5分'应该是一个具体数字'5分'或者是一个定性的指标如'合格制'
+       6. 若表格中商务和技术评分混合一起请你手动将它们区别商务评分通常包含'售后服务''质量保证''业绩''企业人员''企业信用'等商务因素
        以下为示例输出仅供格式参考
        {

View File

@@ -210,7 +210,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
                pass
            else:
                start_page = i
-       if start_page is not None and re.search(end_pattern, cleaned_text):
+       if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern,cleaned_text):
            condition = i > start_page
            if condition:
                is_invalid_condition = output_suffix == "invalid" and i > 30  # 这边默认无效投标至少有30页
@@ -362,7 +362,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
        (
            re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
            # Alternative begin pattern
-           re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
+           re.compile(r'评标办法正文|评标方法正文|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
            # Alternative end pattern
        ),
        (
@@ -591,17 +591,15 @@ if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
-   input_path=r"C:\Users\Administrator\Desktop\fsdownload\854fb19f-96d3-4b3e-b2ba-1a095344fd92\ztbfile.pdf"
-   output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
-   files=truncate_pdf_multiple(input_path,output_folder)
+   input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
+   output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all"
+   # files=truncate_pdf_multiple(input_path,output_folder)
+   # print(files)
    # selections = [4, 1]  # 仅处理 selection 4、1
    # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
-   for i in files:
-       print(type(i))
-       print(i)
-   # selection = 1  # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
-   # generated_files = truncate_pdf_main(input_path, output_folder, selection)
-   # print(generated_files)
+   selection = 2  # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
+   generated_files = truncate_pdf_main(input_path, output_folder, selection)
+   print(generated_files)
    # print("生成的文件:", generated_files)
    end_time = time.time()
    print("耗时:" + str(end_time - start_time))

View File

@@ -8,7 +8,7 @@ def convert_clause_to_json(file_path, output_folder, type=1):
    if type == 1:
        # 对于 type=1,使用原始字符串定义 start_word 和 end_pattern
        start_word = (
-           r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
+           r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        end_pattern = (
@@ -94,7 +94,7 @@ def convert_clause_to_json(file_path, output_folder, type=1):
if __name__ == "__main__":
    # file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
-   file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest4_tobidders_notice.pdf'
+   file_path=r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all\招标02_tobidders_notice.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)

View File

@@ -8,12 +8,15 @@ from flask_app.general.clean_pdf import clean_page_content, extract_common_heade
from flask_app.general.format_change import docx2pdf
from flask_app.general.merge_pdfs import merge_and_cleanup, merge_pdfs, merge_selected_pdfs_for_goods
import concurrent.futures

def get_global_logger(unique_id):
    if unique_id is None:
        return logging.getLogger()  # 获取默认的日志器
    logger = logging.getLogger(unique_id)
    return logger

# fitz库版本
# def extract_common_header(pdf_path):
#     doc = fitz.open(pdf_path)
@@ -59,6 +62,7 @@ def convert_to_pdf(file_path):
        return docx2pdf(file_path)
    return file_path

def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    pdf_path = convert_to_pdf(file_path)
    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
@@ -91,9 +95,15 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
        if start_page is None and re.search(begin_pattern, cleaned_text):
            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                start_page = i
-       if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
-           end_page = i
-           break
+       if start_page is not None:
+           if output_suffix == "tobidders_notice":
+               if re.search(end_pattern, cleaned_text) and i > start_page:
+                   end_page = i
+                   break
+           else:
+               if re.search(end_pattern, cleaned_text) and i > start_page and not re.search(begin_pattern,cleaned_text):
+                   end_page = i
+                   break
    return start_page, end_page
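The extra not re.search(begin_pattern, ...) guard keeps a section open when the page that matches the end pattern still looks like the section's own beginning (e.g. a repeated chapter banner); only the tobidders_notice branch keeps the old behaviour. A toy re-implementation over fabricated page texts, not the project function:

import re

begin_pattern = re.compile(r'第[一二三四五]章\s*评标办法')
end_pattern = re.compile(r'第[一二三四五]章')

pages = [
    "第二章 投标人须知",
    "第三章 评标办法",              # 章节起点
    "第三章 评标办法(续) 评分细则",  # 同时命中 begin 和 end:不应在此结束
    "第四章 合同条款",              # 真正的结束页
]
start_page = end_page = None
for i, text in enumerate(pages):
    if start_page is None and begin_pattern.search(text):
        start_page = i
    if start_page is not None and end_pattern.search(text) and i > start_page \
            and not begin_pattern.search(text):
        end_page = i
        break
print(start_page, end_page)  # 1 3;去掉 not begin_pattern 条件则会提前得到 1 2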
@@ -105,26 +115,33 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
    total_pages = len(pdf_document.pages) - 1  # 获取总页数
    if output_suffix == "tobidders_notice":
-       exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
+       exclusion_pattern = re.compile(
+           r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
        start_page, mid_page, end_page = extract_pages_tobidders_notice(
            pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
        )
        if not start_page or not mid_page or not end_page:
            print(f"三次提取tobidders_notice均失败!!{pdf_path}")
-           return "",""
+           return "", ""
        # if start_page is None or end_page is None or mid_page is None:
        #     print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
        #     return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
-       path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder, "tobidders_notice_part1")
-       path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder, "tobidders_notice_part2")
+       path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder,
+                                    "tobidders_notice_part1")
+       if mid_page != end_page:
+           path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
+                                        "tobidders_notice_part2")
+       else:
+           path2=path1
        return path1, path2
    else:
        # 原有的处理逻辑保持不变
-       if output_suffix == "qualification1" or output_suffix=="procurement" or output_suffix=="evaluation_method":
-           exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
-           start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix)
+       if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
+           exclusion_pattern = re.compile(
+               r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
+           start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
+                                                        common_header, exclusion_pattern, output_suffix)
        # 针对 selection = 6 的特殊处理
        if output_suffix == "format":
            if start_page is None:
@@ -137,7 +154,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
            return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
        if start_page is None or end_page is None:
            print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
-           return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page)
+           return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page)
        elif output_suffix == "qualification1":
            truncate_pdf_main(pdf_path, output_folder, 2, "qualification3")  # 合并'资格审查'章节和'评标办法'章节
            return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
@@ -151,6 +168,7 @@ def get_patterns_for_procurement():
    #     r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*',
    #     re.MULTILINE)
    begin_pattern = re.compile(
+       r'(?<!见)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # 匹配“第X章”或“第X部分”
        r'[\u4e00-\u9fff、()]*?'  # 匹配允许的字符
        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()]*?要求[\u4e00-\u9fff、()]*?\s*$|'  # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求”
@@ -178,6 +196,8 @@ def get_patterns_for_evaluation_method():
    end_pattern = re.compile(
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
    return begin_pattern, end_pattern

def get_patterns_for_notice():
    begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
    end_pattern = re.compile(
@@ -186,6 +206,8 @@ def get_patterns_for_notice():
        re.MULTILINE
    )
    return begin_pattern, end_pattern

def get_patterns_for_notice_twice():
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE
@@ -197,6 +219,7 @@ def get_patterns_for_notice_twice():
    )
    return begin_pattern, end_pattern

# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
#                                    exclusion_pattern):
#     start_page = None
@@ -294,7 +317,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
            # 定义基础的 mid_pattern
            base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
-                              r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
+                              r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
            # 合并基础模式和额外模式
            if additional_mid_pattern:
@@ -316,14 +339,14 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
            # 定义基础的 mid_pattern
            base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
-                              r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
+                              r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
            combined_mid_pattern = re.compile(
                rf'{base_mid_pattern}',
                re.MULTILINE
            )
        else:
            # 如果提供了固定的 end_pattern,则使用默认的 mid_pattern
-           base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则)\s*$'  #可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
+           base_mid_pattern = r'.*[(]?\s*[一二12]?[)]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$'  # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
            combined_mid_pattern = re.compile(
                rf'{base_mid_pattern}',
                re.MULTILINE
@@ -332,7 +355,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
        # 识别中间页
        if start_page is not None and mid_page is None and combined_mid_pattern:
-           if (start_page+1==i) and re.search(local_begin_pattern,cleaned_text):
+           if (start_page + 1 == i) and re.search(local_begin_pattern, cleaned_text):
                continue
            if re.search(combined_mid_pattern, cleaned_text):
                mid_page = i
@@ -355,7 +378,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
    if not (start_page and mid_page and end_page):
        print(f"第二次尝试 tobidders_notice!{pdf_path}")
        pdf_document = PdfReader(pdf_path)
-       start_page,mid_page,end_page=extract_pages_twice_tobidders_notice(pdf_document,common_header,begin_page)
+       start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
        if start_page and end_page and mid_page:
            return start_page, mid_page, end_page
        else:
@@ -375,22 +398,24 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
    return start_page, mid_page, end_page

-def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
-   output_suffix="tobidders_notice"
+def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
+   output_suffix = "tobidders_notice"
    begin_pattern = re.compile(
-       r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
+       r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',
+       re.MULTILINE
    )
    end_pattern = re.compile(
-       r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)',re.MULTILINE  # 捕获中文部分
+       r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE  # 捕获中文部分
    )
-   exclusion_words = ["合同", "评标", "开标","评审","采购","资格"]  # 在这里添加需要排除的关键词
+   exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]  # 在这里添加需要排除的关键词
    exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')

    # 提取第一部分
    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
    if start_page1 is None or end_page1 is None:
-       return "", "",""
+       return "", "", ""

    # 提取第二部分
    start_page2 = end_page1
@@ -406,16 +431,16 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
    # 检查是否包含排除关键词
    if any(word in chapter_title for word in exclusion_words):
        # 如果包含排除关键词,直接返回相同的路径
-       return start_page1, end_page1,end_page1
+       return start_page1, end_page1, end_page1

    # 如果不包含排除关键词,继续提取第二部分
-   _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2-1, common_header,
-                                        exclusion_pattern,output_suffix)
+   _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
+                                        exclusion_pattern, output_suffix)

    if end_page2 is None:
-       return start_page1, end_page1,end_page1
+       return start_page1, end_page1, end_page1

-   return start_page1, end_page1,end_page2
+   return start_page1, end_page1, end_page2

def extract_pages_qualification(pdf_document, begin_page, common_header):
@@ -436,7 +461,6 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
        r'^(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$',
        re.MULTILINE
    )
-
    # 结束匹配模式 - 章节标题
    end_pattern_chapter = re.compile(
        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
@@ -446,7 +470,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
    print("第二次尝试:匹配附件")
    start_page = None
    end_page = None
-   include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查"]
+   include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"]
    exclude_keywords = ["声明函", "承诺函"]

    # 从指定的开始页开始遍历
@@ -493,9 +517,10 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
    return start_page, end_page

-def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
+def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
    try:
-       exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
+       exclusion_pattern = re.compile(
+           r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
        pdf_document = PdfReader(pdf_path)
        patterns = None
        start_page = None
@@ -506,13 +531,13 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
        elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
            patterns = [get_patterns_for_evaluation_method()]
        elif output_suffix == "notice":
-           patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
+           patterns = [get_patterns_for_notice(), get_patterns_for_notice_twice()]
        elif output_suffix == "qualification1":
            start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
        if patterns:
            for pattern_pair in patterns:
-               start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
-                                                            common_header,exclusion_pattern, output_suffix)
+               start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
+                                                            common_header, exclusion_pattern, output_suffix)
                if start_page is not None and end_page is not None:
                    break
@@ -568,10 +593,12 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
        print(f"Error in save_extracted_pages: {e}")
        return ""  # 返回空字符串

def get_start_and_common_header(input_path):
    common_header = extract_common_header(input_path)
    last_begin_index = 0
-   begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',re.MULTILINE)
+   begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
+                              re.MULTILINE)
    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > 10:
@@ -581,8 +608,8 @@ def get_start_and_common_header(input_path):
            cleaned_text = clean_page_content(text, common_header)
            if begin_pattern.search(cleaned_text) and not re.search(r'\s*录', cleaned_text):
                last_begin_index = i  # 更新第一个匹配的索引,页码从0开始
-               return common_header,last_begin_index
-   return common_header,last_begin_index
+               return common_header, last_begin_index
+   return common_header, last_begin_index

def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
@@ -631,11 +658,13 @@ def process_input(input_path, output_folder, selection, output_suffix):
    # 根据选择设置对应的模式和结束模式
    if selection == 1:
-       begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE)
+       begin_pattern = re.compile(
+           r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', re.MULTILINE)
        end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
        local_output_suffix = "notice"
    elif selection == 2:
-       begin_pattern = re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
+       begin_pattern = re.compile(
+           r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
        end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
        local_output_suffix = "evaluation_method"
    elif selection == 3:
@@ -650,7 +679,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
        local_output_suffix = "tobidders_notice"
    elif selection == 5:
        begin_pattern = re.compile(
-           r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*')  #包头中有一章'采购相关说明'
+           r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*')  # 包头中有一章'采购相关说明'
        end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
        local_output_suffix = "procurement"
@@ -681,7 +710,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
    return ['']

-def truncate_pdf_multiple(pdf_path, output_folder,logger):
+def truncate_pdf_multiple(pdf_path, output_folder, logger):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []
@@ -717,11 +746,12 @@ def truncate_pdf_multiple(pdf_path, output_folder,logger):
            # 合并失败,添加空字符串
            truncate_files.append("")
            logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
-   logger.info("已截取文件路径"+str(truncate_files))
+   logger.info("已截取文件路径" + str(truncate_files))
    return truncate_files

-#小解析,只需要前三章内容
-def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="123"):
+# 小解析,只需要前三章内容
+def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
    """
    处理 PDF 文件,选择指定的 selections,并合并结果。
@@ -740,7 +770,9 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
    # 使用 ThreadPoolExecutor 进行多线程处理
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
        # 提交所有任务并保持 selection 顺序
-       future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default") for selection in selections}
+       future_to_selection = {
+           selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
+           for selection in selections}

        # 按 selection 顺序收集结果
        for selection in selections:
@@ -769,22 +801,23 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
            logger.warning(f"合并失败,没有生成merged_baseinfo for {pdf_path}")
    return truncate_files

# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章
-#ztbfile.pdf少资格评审 包头少符合性评审
+# ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
-   logger=get_global_logger("123")
+   logger = get_global_logger("123")
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
-   input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2020-安徽-安徽省生态环境厅电梯采购.pdf"
+   input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
-   output_folder=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2"
-   # files = truncate_pdf_multiple(input_path, output_folder,logger)
+   output_folder = r"C:\Users\Administrator\Desktop\招标文件-采购类\all"
+   files = truncate_pdf_multiple(input_path, output_folder,logger)
    # selections = [3,5]
    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
-   # print(files)
-   selection = 2# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
-   generated_files = truncate_pdf_main(input_path, output_folder, selection)
-   print(generated_files)
+   print(files)
+   # selection = 4  # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
+   # generated_files = truncate_pdf_main(input_path, output_folder, selection)
+   # print(generated_files)

View File

@@ -134,13 +134,14 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
-       start_word = r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
+       start_word = r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*)$'
    else:
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
+       # print(text)
    else:
        raise ValueError("Unsupported file format")
    parsed_data = parse_text_by_heading(text)
@@ -174,13 +175,14 @@ def process_folder(input_folder, output_folder):
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")

-#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
-#TODO19、竞争性磋商响应文件的加密 暂时没处理'19'缺失的情况
+#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf
+#TODO:2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf
#TODO: .不予受理的情形 .后面必须跟中文或者空格’
if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-   file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
-   # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
-   # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
+   file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
+   # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder,1)

View File

@@ -224,13 +224,14 @@ def combine_evaluation_standards(truncate_file):
    # user_query = "根据该文档中的评标办法前附表或者评分标准表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,外层键名分别为'技术评分','商务评分','投标报价'。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果如果该采购活动有多个包则最外层键名为对应的包名。请不要回答有关资格审查的内容"
    user_query = (
        """
-       根据该文档中的评分标准表格中的内容请你列出该文件的技术评分商务评分投标报价评审以及它们对应的具体评分要求请以json格式返回结果最外层键名分别是'技术评分','商务评分','投标报价评分',请在这三大项评分中分别用若干键值对表示具体评分项外层键名为各评审因素键值为一个列表列表中包含若干可为一描述该评审因素的评分及要求的字典内层键名分别是'评分''要求'若无评分可删去'评分'键值对'要求'中说明了该评审因素的评分标准若这三大项评分中存在其他信息则在相应评分大块内部新增键名'备注'存放该信息键值为具体的要求否则不需要如果评分内容因素不是这三大项则返回文档中给定的评分内容因素以及它们的具体评分要求
+       你是一个对招投标业务非常熟悉的专家。根据该文档中的评分标准表格中的内容请你列出该文件的技术评分商务评分投标报价评审以及它们对应的具体评分要求请以json格式返回结果最外层键名分别是'技术评分','商务评分','投标报价评分',请在这三大项评分中分别用若干键值对表示具体评分项外层键名为各评审因素键值为一个列表列表中包含若干可为一描述该评审因素的评分及要求的字典内层键名分别是'评分''要求'若无评分可删去'评分'键值对'要求'中说明了该评审因素的评分标准若这三大项评分中存在其他信息则在相应评分大块内部新增键名'备注'存放该信息键值为具体的要求否则不需要如果评分内容因素不是这三大项则返回表格中给定的评分内容因素以及它们的具体评分要求
        要求与指南
        1.请首先定位评分细则的表格不要回答有关资格审查符合性审查的内容也不要从评标办法正文中表格外提取回答
-       2.若大项的'xx评分'要求未在文中说明则键名'xx评分'的键值设为'本项目无xx评分项'例如"技术评分":"本项目无技术评分项"
+       2.若大项的'xx评分'要求未在文中说明则键名'xx评分'的键值设为'本项目无xx评分项'例如{"技术评分":"本项目无技术评分项"}
        3. 如果该招标活动有多个包则最外层键名为对应的包名,否则不需要
        4.你无需将表格的单元格内的内容进行拆分需要将它视为一个整体
        5. '评分'的键值不能是一个范围数字'0-5分'应该是一个具体数字'5分'或者是一个定性的指标如'合格制'
+       6. 若表格中商务和技术评分混合一起请你手动将它们区别商务评分通常包含'售后服务''质量保证''业绩''企业人员''企业信用'等商务因素

        以下为示例输出仅供格式参考
        {
@@ -311,7 +312,7 @@ def combine_evaluation_standards(truncate_file):
if __name__ == "__main__":
    start_time=time.time()
-   truncate_file=r"C:\Users\Administrator\Desktop\fsdownload\1ca1d27d-fc21-4697-8075-9027103df030\ztbfile_evaluation_method.pdf"
+   truncate_file=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2\2024-新疆-塔城地区公安局食药环分局快检实验室项目_evaluation_method.pdf"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件统计局智能终端二次招标_evaluation_method.pdf"
    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新W改_evaluation_method.pdf"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\2d481945-1f82-45a5-8e56-7fafea4a7793\\ztbfile_evaluation_method.pdf"