1.23 修复了开评定标提示词;废标/无效标部分增加了对"十、废标条件"这类标题的提取

This commit is contained in:
zy123 2025-01-23 14:01:21 +08:00
parent e4279e9f2d
commit ab74e4d03e
6 changed files with 93 additions and 96 deletions

View File

@ -46,7 +46,7 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
invalid_endpage=invalid_result[1]
end_page = invalid_endpage if invalid_endpage > 120 else -1 #一般情况下Invalid_path的end_page之后都是不需要的信息无需进行正则匹配
# 检查 PDF 页数逻辑
skip, empty_return = check_pdf_pages(invalid_path, mode, logger)
skip, empty_return = check_pdf_pages(invalid_path, mode, logger) #页数少于30不切分
if skip:
return empty_return

View File

@ -273,8 +273,9 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
# }
# """,
2:"""该招标文件中有关开标、评标、定标、中标要求、内容(或磋商流程内容)是什么?请以 JSON 格式返回结果。
格式要求
要求与指南
- 外层键名应为文中提到的有关开标评标定标中标要求或磋商流程内容的大项概览性标题
- 提取的内容应该是招标文件中有关开评定标等环节的流程性说明及要求无需提取具体的评分细则以及资格审查细则
- 对于每个大项下的子要求使用嵌套的键值对进行组织
-其中嵌套键名应为原文中的具体标题或对相关子要求的简明总结
-最内层的键值应与原文内容保持一致不得进行任何总结删减或改写默认键值是单独的字符串如果一个子要求包含多个并列内容键值应为一个字符串列表数组其中每个元素都是子要求内容
@ -430,10 +431,10 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
if __name__ == "__main__":
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf"
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\2025-湖北-通羊镇港口村人饮项目谈判文件.docx"
clause_path=r"D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp\clause1.json"
try:
res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景
res = extract_from_notice(merged_baseinfo_path,"", 2) # 可以改变此处的 type 参数测试不同的场景 #1: ["投标", "投标文件", "响应文件"], 2:开评定标
res2 = json.dumps(res, ensure_ascii=False, indent=4)
print(res2)
except ValueError as e:

View File

@ -75,8 +75,7 @@ def parse_text_by_heading(text):
# pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$') # 1、内容
initial_heading_pattern = None
special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:', '雷同认定',
'以下','情形之一','情况之一','下列'] # 定义特殊章节关键词
special_section_keywords = [r'文件.*(组成|构成|包括|包含)'] #注意,只考虑对文件组成/构成换行,因为一般一行能展示的下,如“投标函”;暂不考虑'下列情形',因为添加换行会导致语义的不连贯。
in_special_section = False # 标志是否在特殊章节中
lines = text.split('\n')
@ -138,7 +137,7 @@ def parse_text_by_heading(text):
# 检查是否进入或退出特殊章节
if is_heading(line_stripped):
if any(keyword in line_stripped for keyword in special_section_keywords):
if any(re.search(pattern, line_stripped) for pattern in special_section_keywords):
in_special_section = True
elif in_special_section:
in_special_section = False
@ -183,7 +182,7 @@ def parse_text_by_heading(text):
data[current_key] = data.get(current_key, '') + content_string
current_key = new_key
current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(keyword in line_content for keyword in special_section_keywords)
append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(re.search(pattern, line_content) for pattern in special_section_keywords)
last_main_number = new_key.split('.')[0]
else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
@ -195,7 +194,7 @@ def parse_text_by_heading(text):
sub_number, line_content = dot_match.groups()
sub_number = sub_number.replace('', '.')
# 检查是否进入或退出特殊章节
if any(keyword in line_content for keyword in special_section_keywords):
if any(re.search(pattern, line_stripped) for pattern in special_section_keywords):
in_special_section = True
elif in_special_section:
in_special_section = False
@ -218,7 +217,7 @@ def parse_text_by_heading(text):
# 处理以点号开头但不带数字的情况,自动生成下一个一级序号
temp_content = dot_text_match.group(1).strip()
# 检查是否进入或退出特殊章节
if any(keyword in temp_content for keyword in special_section_keywords):
if any(re.search(pattern, line_stripped) for pattern in special_section_keywords):
in_special_section = True
elif in_special_section:
in_special_section = False

View File

@ -5,6 +5,7 @@ import regex
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.general.doubao import generate_full_user_query
from flask_app.general.format_change import pdf2docx
from flask_app.general.insert_del_pagemark import insert_mark
from flask_app.general.通义千问long import qianwen_plus
from flask_app.general.通用功能函数 import process_string_list
@ -74,11 +75,6 @@ def preprocess_paragraphs(elements):
processed = [] # 初始化处理后的段落列表
index = 0
flag = False # 初始化标志位
is_combine_table = False
# 定义两个新的正则表达式模式
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
# 定义列表项的模式
list_item_pattern = re.compile(
@ -134,9 +130,6 @@ def preprocess_paragraphs(elements):
# 跳过空白段落和页面标记
if not text or pattern_marker.search(text):
continue
# 跳过匹配排除模式的段落
if (pattern_numbered.match(text) or pattern_parentheses.match(text)) and len(text) < 8:
continue
return text, i
return '', -1
@ -196,12 +189,6 @@ def preprocess_paragraphs(elements):
index = skip_index
continue # 继续下一个循环
# 检查当前段落是否匹配任一排除模式
if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
# 如果匹配则跳过当前段落不添加到processed列表中
index += 1
continue
# 检查是否为以数字序号开头的段落
match = pattern_numeric_header.match(current_text)
if not match:
@ -255,7 +242,9 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
continue_collecting = False
current_section_pattern = None
active_key = None
# 新增的条件:检查是否匹配 pattern_numbered 或 pattern_parentheses
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
pattern_parentheses = re.compile(r'^\s*[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*')
def match_keywords(text, patterns):
# 首先检查关键词是否匹配
for pattern in patterns:
@ -268,14 +257,55 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
if text == "":
return current_index
def process_matching_section():
nonlocal current_index, continue_collecting, active_key, current_section_pattern
found_next_number = False
current_section_pattern = None
while current_index < len(processed_paragraphs) - 1:
current_index += 1
next_text = processed_paragraphs[current_index].strip()
if not next_text:
continue # Skip empty lines
if not found_next_number:
# Modify the regular expression to support the required format
number_pattern = (
r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|'
r'([(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[)])|'
r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)'
)
next_section_number = re.match(number_pattern, next_text)
if next_section_number:
found_next_number = True
# Determine dynamic pattern based on section number format
if next_section_number.group(1):
section_parts = next_section_number.group(1).split('.')
dynamic_pattern = r'^' + r'[.]'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
elif next_section_number.group(2):
dynamic_pattern = r'^[\(\]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\]'
elif next_section_number.group(3):
dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
current_section_pattern = re.compile(dynamic_pattern)
if current_section_pattern and re.match(current_section_pattern, next_text):
extracted_paragraphs[active_key].append(next_text)
else:
continue_collecting = False
active_key = None
break
if continue_collecting:
# 如果是收集状态并且下面有表格则把表格内容全部追加到active_key中去
if text == '[$$table_start$$]':
current_index += 1
while (processed_paragraphs[current_index] != '[$$table_over$$]'):
while processed_paragraphs[current_index] != '[$$table_over$$]':
extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
current_index += 1
return current_index
if current_section_pattern and re.match(current_section_pattern, text):
continue_collecting = False
active_key = None
@ -287,86 +317,41 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
if match_keywords(text, keywords):
active_key = text
extracted_paragraphs[active_key] = [text]
if match_keywords(text, follow_up_keywords):
if (re.match(pattern_numbered, text) or re.match(pattern_parentheses, text)) and len(text) < 10:
continue_collecting = True
section_number = re.match(r'^(\d+([.]\d+)*)\s*[.]?', text) # 修改后的正则,支持 '数字 、' 和 '数字.'
if section_number: #当前匹配的行前有序号,那么就匹配到下个相似序号为止停止收集
process_matching_section()
elif match_keywords(text, follow_up_keywords):
continue_collecting = True
section_number = re.match(r'^(\d+([.]\d+)*)\s*[.]?', text)
if section_number:
current_section_number = section_number.group(1)
level_count = current_section_number.count('.')
# 获取章节的各级部分
parts = current_section_number.split('.')
# Pattern to match current level, e.g., 3.4.5 添加负向前瞻以防止匹配四级或更高层级
pattern = r'^' + (r'\d+\s*[.]\s*') * level_count + r'\d+' + r'(?!\s*[.]\s*\d+)'
matched_patterns = [pattern] # start with the full pattern
# for i in range(1, 6): #同级与matched_patterns = [pattern]重复了,故注释
# # 复制 parts 列表以避免修改原列表
# new_parts = parts. copy()
# new_parts[-1] = str(int(new_parts[-1]) + i)
# # 使用不同的分隔符
# next_pattern = r'^' + r'\s*[.]\s*'.join(new_parts)
# matched_patterns.append(next_pattern)
# Parent section (if applicable)
if len(parts) > 1:
for i in range(1, 6): #考虑原文档的书写不规范,跳序号的情况,目前设置了范围<5
matched_patterns = []
for i in range(1, 6):
parent_section_parts = parts[:-1].copy()
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
parent_pattern = r'^' + r'\s*[.]\s*'.join(parent_section_parts) + r'(?!\s*[.]\s*\d+)'
matched_patterns.append(parent_pattern)
# 添加对 '数字 、' 格式的支持
digit_comma_pattern = r'^\d+\s*、'
matched_patterns.append(digit_comma_pattern)
# 获取当前顶级章节编号
current_top_level_num = int(current_section_number.split('.')[0])
for i in range(1, 6):
next_top_level_num = current_top_level_num + i
next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[.]'
# 检查是否已经包含了该模式,避免重复
if next_top_level_pattern not in matched_patterns:
matched_patterns.append(next_top_level_pattern)
# Combine the patterns
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
current_section_pattern = re.compile(combined_pattern)
else:
found_next_number = False
current_section_pattern = None
while current_index < len(processed_paragraphs) - 1:
current_index += 1
next_text = processed_paragraphs[current_index].strip()
# 添加对空白行的处理
if not next_text:
continue # 跳过空白行,进入下一个循环
if not found_next_number:
# 修改后的正则,支持 '数字 、' 格式
number_pattern = (
r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|' # 第一部分:匹配 A1, 1.1, abc.123 等
r'([(]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[)])|' # 第二部分:匹配 (一), 12其中中文数字1到2位或任意位数字
r'((?:[一二三四五六七八九十]{1,2}|\d+)\s*、)' # 第三部分:匹配 一、12、等其中中文数字1到2位或任意位数字
)
next_section_number = re.match(number_pattern,next_text) #re.match总是从字符串开头匹配即使正则模式内部没有 ^ 锚点。
if next_section_number:
found_next_number = True
if next_section_number.group(1):
section_parts = next_section_number.group(1).split('.')
dynamic_pattern = r'^' + r'[.]'.join(
[r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
elif next_section_number.group(2):
dynamic_pattern = r'^[\(\]\s*(?:[一二三四五六七八九十]{1,2}|\d+)\s*[\)\]'
elif next_section_number.group(3):
dynamic_pattern = r'^(?:[一二三四五六七八九十]{1,2}|\d+)\s*、'
current_section_pattern = re.compile(dynamic_pattern)
if current_section_pattern and re.match(current_section_pattern, next_text):
extracted_paragraphs[active_key].append(next_text)
else:
continue_collecting = False
active_key = None
break
process_matching_section(None) # Reusing the common function for matching sections
return current_index
@ -677,12 +662,12 @@ if __name__ == '__main__':
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件N510113202400010820250102001.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\2025-湖北-通羊镇港口村人饮项目谈判文件.pdf'
output_dir = r"D:\flask_project\flask_app\static\output\output1\63504625-9178-47df-b513-17709434fa68\tmp"
output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
# invalid_added = insert_mark(pdf_path)
# # invalid_added_docx = pdf2docx(invalid_added)
invalid_added_docx=r'D:\flask_project\flask_app\static\output\output1\63504625-9178-47df-b513-17709434fa68\invalid_added.docx'
# invalid_added_docx = pdf2docx(invalid_added)
invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
results = combine_find_invalid(invalid_added_docx, output_dir)
end_time = time.time()
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))

View File

@ -379,12 +379,23 @@ def qianwen_plus(user_query, need_extra=False):
if __name__ == "__main__":
# Example file path - replace with your actual file path
file_path = r"C:\Users\Administrator\Desktop\fsdownload\adfcf61a-23fc-4e5f-8084-3eb478c45e21\invalid_del.docx"
file_path = r"C:\Users\Administrator\Desktop\fsdownload\29457826-1e99-4e98-9c90-1cfb5d175579\invalid_del.docx"
file_id = upload_file(file_path)
# print(file_id)
user_query1 ="""请从提供的招标文件中提取与“信息公示媒介”相关的信息(如补充说明、文件澄清、评标结果等)公示媒介(如网址、官网)。按以下 JSON 格式输出信息,其中键名为“信息公示媒介”,键值为一个字符串列表。如果存在多个信息公示媒介,请分别将原文中相关表述逐字添加至字符串中。注意,若只有一个信息公示媒介,字符串列表中只包含一个字符串。示例输出格式如下,仅供参考:
user_query1 ="""该招标文件对响应文件投标文件偏离项的要求或内容是怎样的请不要回答具体的技术参数也不要回答具体的评分要求。请以json格式给我提供信息外层键名为'偏离',若存在嵌套信息,嵌套内容键名为文件中对应字段或是你的总结,键值为原文对应内容。若文中没有关于偏离项的相关内容,在键值中填'未知'
禁止内容
确保键值内容均基于提供的实际招标文件内容禁止使用任何预设的示例作为回答
禁止返回markdown格式请提取具体的偏离相关内容
示例1嵌套键值对情况
{
"信息公示媒介":["招标公告在政府采购网www.test.gov.cn发布。","中标结果将在采购网(www.test.bid.cn)予以公告。"]
"偏离":{
"技术要求":"以★标示的内容不允许负偏离",
"商务要求":"以★标示的内容不允许负偏离"
}
}
示例2无嵌套键值对情况
{
"偏离":"所有参数需在技术响应偏离表内响应,如应答有缺项,且无有效证明材料的,评标委员会有权不予认可,视同负偏离处理"
}
"""
# # res1,res2=qianwen_long_stream(file_id,user_query1,2,1,True)

View File

@ -63,9 +63,10 @@
}
}
9.该招标文件对响应文件投标文件偏离项的要求或内容是怎样的请不要回答具体的技术参数也不要回答具体的评分要求。请以json格式给我提供信息外层键名为'偏离',若存在嵌套信息,嵌套内容键名为文件中对应字段或是你的总结,而嵌套键值必须与原文保持一致。若文中没有关于偏离项的相关内容,在键值中填'未知'。
9.该招标文件对响应文件投标文件偏离项的要求或内容是怎样的请不要回答具体的技术参数也不要回答具体的评分要求。请以json格式给我提供信息外层键名为'偏离',若存在嵌套信息,嵌套内容键名为文件中对应字段或是你的总结,键值为原文对应内容。若文中没有关于偏离项的相关内容,在键值中填'未知'。
禁止内容:
确保键值内容均基于提供的实际招标文件内容,禁止使用任何预设的示例作为回答。
禁止返回markdown格式请提取具体的偏离相关内容。
示例1嵌套键值对情况
{
"偏离":{