This commit is contained in:
zy123 2024-10-12 18:01:59 +08:00
parent 2226d27a3c
commit 93751f7e74
19 changed files with 1117 additions and 697 deletions

View File

@@ -2,45 +2,67 @@ import requests
import mimetypes
import os  # assumed missing here: os.path.splitext is used below
def download_file(url, local_filename):
"""
下载文件并保存到本地基于Content-Type设置文件扩展名
参数:
- url (str): 文件的URL地址
- local_filename (str): 本地保存的文件名不含扩展名
返回:
- tuple: (完整文件名, 文件类型代码)
文件类型代码:
1 - .docx
2 - .pdf
3 - .doc
4 - 其他
- None: 下载失败
"""
try:
with requests.get(url, stream=True) as response:
response.raise_for_status() # raise an exception if the request failed
# determine the file type and set an appropriate file extension
content_type = response.headers.get('Content-Type')
extension = mimetypes.guess_extension(content_type, strict=False)
if not extension:
# fall back to .docx if the extension cannot be guessed
extension = '.docx'
full_filename = local_filename + extension # append the extension
# read the Content-Type and guess the file extension
content_type = response.headers.get('Content-Type', '')
extension = mimetypes.guess_extension(content_type, strict=False) or '.docx'
# split the filename from any existing extension
base, ext = os.path.splitext(local_filename)
if ext.lower() != extension:
full_filename = base + extension
else:
full_filename = local_filename
# download and save the file
with open(full_filename, 'wb') as file:
for chunk in response.iter_content(chunk_size=8192):
if chunk: # skip empty chunks
file.write(chunk)
# return the value matching the extension
if extension == '.docx':
return full_filename,1
elif extension == '.pdf':
return full_filename,2
else:
return full_filename,3
# map extensions to file type codes
extension_mapping = {
'.docx': 1,
'.pdf': 2,
'.doc': 3
}
file_code = extension_mapping.get(extension.lower(), 4)
return full_filename, file_code
except requests.HTTPError as e:
print(f"download: HTTP Error: {e}")
return None
print(f"download: HTTP 错误: {e}")
except requests.RequestException as e:
print(f"download: Error downloading the file: {e}")
return None
print(f"download: 下载文件时出错: {e}")
except Exception as e:
print(f"download: An error occurred: {e}")
print(f"download: 发生错误: {e}")
return None
if __name__ == '__main__':
# test download URL
test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1726661197&OSSAccessKeyId=TMP.3KdMWkHL1nqHypaY6LmhnzqJTRkTzuYLNNPhW9KZruLGLaM3YL7F45NfuF3JT4CszSe2FD7ZH6WZFUTumokmJsSEW6pPh6&Signature=7BIGVSb9YGYLKFBXeTQZm7QnTI8%3D"
local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\downloaded_file'
file_path = download_file(test_url, local_file_name)
test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1728751377&OSSAccessKeyId=TMP.3KeYhAGeJr2LGiNctSPvSmdSwxhrU8pbaDYRJcNCCgv8ijyWN613QahKb3nhXydfAvHaqpw4nTHXMzq7hmTHmNnPA77DgL&Signature=mwPHW8v7dVmHP1udTDL%2ByzllwCE%3D"
local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload'
file_path,file_type = download_file(test_url, local_file_name)
if file_path:
print(f"Downloaded file path: {file_path}")
print(file_type)

View File

@@ -41,10 +41,10 @@ def get_filename_and_folder(file_path):
# Arguments: the path of the file to convert and the output filename; the file extension is filled in automatically.
def pdf2docx(local_path_in):
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
receive_download_url = upload_file(local_path_in, remote_url)
receive_download_url = upload_file(local_path_in, remote_url) # conversion done; returns a download link
filename, folder = get_filename_and_folder(local_path_in) # input and output share the same folder
local_path_out=os.path.join(folder,filename) # output filename
downloaded_filepath,file_type=download_file(receive_download_url, local_path_out)
local_filename=os.path.join(folder,filename) # output filename, e.g. C:\Users\Administrator\Desktop\货物标\zbfiles\6.2定版视频会议磋商文件 (no extension)
downloaded_filepath,file_type=download_file(receive_download_url, local_filename)
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath
@@ -52,14 +52,14 @@ def docx2pdf(local_path_in):
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
receive_download_url = upload_file(local_path_in, remote_url)
filename, folder = get_filename_and_folder(local_path_in) # input and output share the same folder
local_path_out = os.path.join(folder, filename) # output filename
downloaded_filepath,file_type = download_file(receive_download_url, local_path_out)
local_filename = os.path.join(folder, filename) # output filename
downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath
if __name__ == '__main__':
# replace with your own file path and API URL
local_path_in="C:\\Users\\Administrator\\Desktop\\招标文件\\output\\test111.pdf"
local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).doc"
# pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)

View File

@@ -8,7 +8,8 @@ from flask import Flask, request, jsonify, Response, stream_with_context, g
import json
import os
from flask_app.main.download import download_file
from flask_app.main.招标文件解析 import main_processing
from flask_app.main.招标文件解析 import engineering_bid_main
from flask_app.货物标.货物标解析main import goods_bid_main
app = Flask(__name__)
@@ -74,6 +75,51 @@ def create_logger():
# logger.error('Exception occurred: ' + str(e)) # log via the global logger
# return jsonify({'error': str(e)}), 500
# def download_and_process_file(file_url):
# logger = g.logger
# unique_id = g.unique_id
# output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径
# filename = "ztbfile"
# downloaded_filename = os.path.join(output_folder, filename)
#
# # download the file; assume download_file handles exceptions and returns the file path
# downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
#
# if downloaded_filepath is None or file_type == 3:
# logger.error("Unsupported file type or failed to download file")
# return None, output_folder
#
# logger.info("Local file path: " + downloaded_filepath)
# processed_file_path = engineering_bid_main(output_folder, downloaded_filepath, file_type, unique_id)
# return processed_file_path, output_folder
# def generate_response(final_json_path):
# logger = g.logger
# # check whether final_json_path is empty or None
# if not final_json_path:
# logger.error('Empty or None path provided for final_json.')
# return jsonify({'error': 'No path provided for final_json.'}), 400
# if not os.path.exists(final_json_path):
# logger.error('final_json not found at path: ' + final_json_path)
# return jsonify({'error': 'final_json not found'}), 404
# with open(final_json_path, 'r', encoding='utf-8') as f:
# logger.info('final_json_path:' + final_json_path)
# zbparse_data = json.load(f)
# json_str = json.dumps(zbparse_data, ensure_ascii=False)
# return jsonify({
# 'message': 'File uploaded and processed successfully',
# 'filename': os.path.basename(final_json_path),
# 'data': json_str
# })
def validate_request(default_zb_type=1):
if not request.is_json:
return jsonify({'error': 'Missing JSON in request'}), 400
file_url = request.json.get('file_url')
zb_type = request.json.get('zb_type', default_zb_type)
if not file_url:
return jsonify({'error': 'No file URL provided'}), 400
return file_url,zb_type
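# Illustrative request body that passes this validation (example values, not from the repo):
# {"file_url": "https://example.com/ztbfile.pdf", "zb_type": 2}
# zb_type falls back to default_zb_type when the client omits it.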
# streaming
@app.route('/upload', methods=['POST'])
@@ -83,12 +129,12 @@ def zbparse():
# fetch and log the received JSON data
received_data = request.get_json()
logger.info("Received JSON data: " + str(received_data))
file_url = validate_request()
file_url,zb_type = validate_request()
if isinstance(file_url, tuple): # Check if the returned value is an error response
return file_url
try:
logger.info("starting parsing url:" + file_url)
return Response(stream_with_context(process_and_stream(file_url)), content_type='text/event-stream')
return Response(stream_with_context(process_and_stream(file_url,zb_type)), content_type='text/event-stream')
except Exception as e:
logger.error('Exception occurred: ' + str(e))
return jsonify({'error': str(e)}), 500
@@ -112,17 +158,27 @@ def post_processing(data,includes):
return result
# return results in segments
def process_and_stream(file_url):
def process_and_stream(file_url,zb_type):
"""
下载文件并进行处理支持工程标和货物标的处理
参数
- file_url (str): 文件的URL地址
- zb_type (int): 标的类型1表示工程标2表示货物标
返回
- generator: 生成处理过程中的流式响应
"""
logger = g.logger
unique_id = g.unique_id
output_folder = f"flask_app/static/output/{unique_id}"
filename = "ztbfile"
downloaded_filename = os.path.join(output_folder, filename)
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
if downloaded_filepath is None or file_type == 3:
logger.error("Unsupported file type or failed to download file")
# download the file
downloaded = download_file(file_url, downloaded_filename)
if not downloaded:
logger.error("下载文件失败或不支持的文件类型")
error_response = {
'message': 'File processing failed',
'filename': None,
@@ -131,12 +187,31 @@ def process_and_stream(file_url):
yield f"data: {json.dumps(error_response)}\n\n"
return
logger.info("Local file path: " + downloaded_filepath)
downloaded_filepath, file_type = downloaded
# check the file type
if file_type == 4:
logger.error("不支持的文件类型")
error_response = {
'message': 'Unsupported file type',
'filename': None,
'data': json.dumps({'error': 'Unsupported file type'})
}
yield f"data: {json.dumps(error_response)}\n\n"
return
logger.info("本地文件路径: " + downloaded_filepath)
combined_data = {}
# select the processing function based on zb_type
processing_functions = {
1: engineering_bid_main,
2: goods_bid_main
}
processing_func = processing_functions.get(zb_type, engineering_bid_main)
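# Dispatch-table pattern: an unrecognized zb_type silently falls back to the
# engineering flow (engineering_bid_main) instead of raising an error.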
# stream data from the selected processing function
for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id):
for data in processing_func(output_folder, downloaded_filepath, file_type, unique_id):
if not data.strip():
logger.error("Received empty data, skipping JSON parsing.")
continue # Skip processing empty data
@@ -160,7 +235,7 @@ def process_and_stream(file_url):
}
yield f"data: {json.dumps(response, ensure_ascii=False)}\n\n"
# log the merged data
logger.info(f"Updated combined data: {json.dumps(combined_data, ensure_ascii=False, indent=4)}")
logger.info(f"合并后的数据: {json.dumps(combined_data, ensure_ascii=False, indent=4)}")
# **save combined_data to 'final_result.json' under output_folder**
output_json_path = os.path.join(output_folder, 'final_result.json')
includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
@@ -168,9 +243,9 @@ def process_and_stream(file_url):
try:
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(result, json_file, ensure_ascii=False, indent=4)
logger.info(f"Combined data saved to '{output_json_path}'")
logger.info(f"合并后的数据已保存到 '{output_json_path}'")
except IOError as e:
logger.error(f"Error saving JSON file: {e}")
logger.error(f"保存JSON文件时出错: {e}")
# finally, send the complete merged data
complete_response = {
'message': 'Combined data',
@@ -186,34 +261,6 @@ def process_and_stream(file_url):
}
yield f"data: {json.dumps(final_response)}\n\n"
def validate_request():
if not request.is_json:
return jsonify({'error': 'Missing JSON in request'}), 400
file_url = request.json.get('file_url')
if not file_url:
return jsonify({'error': 'No file URL provided'}), 400
return file_url
def download_and_process_file(file_url):
logger = g.logger
unique_id = g.unique_id
output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径
filename = "ztbfile"
downloaded_filename = os.path.join(output_folder, filename)
# download the file; assume download_file handles exceptions and returns the file path
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
if downloaded_filepath is None or file_type == 3:
logger.error("Unsupported file type or failed to download file")
return None, output_folder
logger.info("Local file path: " + downloaded_filepath)
processed_file_path = main_processing(output_folder, downloaded_filepath, file_type, unique_id)
return processed_file_path, output_folder
@app.route('/api/test_zbparse', methods=['POST'])
def test_zbparse():
try:
@@ -346,26 +393,6 @@ def test_process_and_stream():
yield f"data: {json.dumps(final_response, ensure_ascii=False)}\n\n"
def generate_response(final_json_path):
logger = g.logger
# check whether final_json_path is empty or None
if not final_json_path:
logger.error('Empty or None path provided for final_json.')
return jsonify({'error': 'No path provided for final_json.'}), 400
if not os.path.exists(final_json_path):
logger.error('final_json not found at path: ' + final_json_path)
return jsonify({'error': 'final_json not found'}), 404
with open(final_json_path, 'r', encoding='utf-8') as f:
logger.info('final_json_path:' + final_json_path)
zbparse_data = json.load(f)
json_str = json.dumps(zbparse_data, ensure_ascii=False)
return jsonify({
'message': 'File uploaded and processed successfully',
'filename': os.path.basename(final_json_path),
'data': json_str
})
# @app.route('/get_json', methods=['POST'])
# def testjson():
# final_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp4\\fd55f067-2cf6-475c-b7ce-4498f6606bf6\\final_result.json"

View File

@@ -1,368 +1,92 @@
# -*- coding: utf-8 -*-
import json
import os.path
import time
import re
from flask_app.main.json_utils import combine_json_results, nest_json_under_key
from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
from collections import defaultdict
# If the current paragraph is numbered, scan downward until the same numbering style recurs.
# If it is not numbered, scan downward for numbered items and collect the run of same-style ones.
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
from collections import OrderedDict
from docx import Document
import re
if isinstance(keywords, str):
keywords = [keywords]
doc = Document(doc_path)
extracted_paragraphs = OrderedDict()
continue_collecting = False
current_section_pattern = None
active_key = None
def match_keywords(text, patterns):
# first check whether any keyword pattern matches
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
# if the current pattern is '不\s*得', additionally check that the match is not part of '不得分'
if pattern == r'不\s*得':
post_match_text = text[match.end():].strip()
if post_match_text.startswith("分"):
continue # "不得分" is scoring language, not a prohibition; skip it
return True
return False
def extract_from_text(text, current_index):
nonlocal continue_collecting, current_section_pattern, active_key
if text == "":
return current_index
if continue_collecting:
if current_section_pattern and re.match(current_section_pattern, text):
continue_collecting = False
active_key = None
else:
if active_key is not None:
extracted_paragraphs[active_key].append(text)
return current_index
if match_keywords(text, keywords):
active_key = text
extracted_paragraphs[active_key] = [text]
if match_keywords(text, follow_up_keywords):
continue_collecting = True
section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
if section_number:
current_section_number = section_number.group(1)
level_count = current_section_number.count('.')
# Pattern to match current level, e.g., 3.4.5
pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+'
# Generate patterns for next section at same level and parent level
parts = current_section_number.split('.')
matched_patterns = [pattern] # start with the full pattern
# Next section at same level
parts[-1] = str(int(parts[-1]) + 1)
next_pattern = r'^' + r'\s*\.\s*'.join(parts)
matched_patterns.append(next_pattern)
# Parent section (if applicable)
if len(parts) > 1:
parent_section_parts = parts[:-1]
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts)
matched_patterns.append(parent_pattern)
# Combine the patterns
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
current_section_pattern = re.compile(combined_pattern)
else:
found_next_number = False
current_section_pattern = None
while current_index < len(doc.paragraphs) - 1:
current_index += 1
next_text = doc.paragraphs[current_index].text.strip()
if not found_next_number:
next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text)
if next_section_number:
found_next_number = True
if next_section_number.group(1):
section_parts = next_section_number.group(1).split('.')
dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
elif next_section_number.group(2):
dynamic_pattern = r'^[\(（]\d+[\)）]'
current_section_pattern = re.compile(dynamic_pattern)
if current_section_pattern and re.match(current_section_pattern, next_text):
extracted_paragraphs[active_key].append(next_text)
else:
continue_collecting = False
active_key=None
break
return current_index
index = 0
while index < len(doc.paragraphs):
index = extract_from_text(doc.paragraphs[index].text.strip(), index)
index += 1
return extracted_paragraphs
def preprocess_text_list(text_list):
new_text_list = []
# regex: split at whitespace that follows a CJK character or sentence punctuation and precedes a letter, a digit, or a parenthesized number
split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。？！；])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+（[1-9]\d*）)')
for text in text_list:
# check each element against the pattern and split it
parts = split_pattern.split(text)
new_text_list.extend(part.strip() for part in parts if part.strip()) # keep only non-empty parts
return new_text_list
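# Illustrative example (hypothetical input): "投标人须知。 2、供应商资格要求" splits at the
# space after "。" into ["投标人须知。", "2、供应商资格要求"], since the lookbehind sees a
# CJK sentence stop and the lookahead sees whitespace followed by a digit.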
def clean_dict_datas(extracted_contents, keywords,excludes): # normalize what the regex extraction produced
all_texts1 = []
all_texts2=[]
# sentence-splitting regex covering Chinese and Western terminal punctuation
split_pattern = r'(?<=[。！？\!\?])'
for key, text_list in extracted_contents.items():
if len(text_list) == 1:
for data in text_list:
# check whether the entry contains any excluded string
if any(exclude in data for exclude in excludes):
continue # skip the entry if it contains an excluded string
# strip the leading numbering, covering letter+digit and parenthesized-digit forms
pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
data = re.sub(pattern, '', data).strip()
keyword_match = re.search(keywords, data)
if keyword_match:
# look for terminal punctuation starting from the keyword position
start_pos = keyword_match.start()
# take the substring from the keyword onward
substring = data[start_pos:]
# split at the defined terminal punctuation
sentences = re.split(split_pattern, substring, 1)
if len(sentences) > 0 and sentences[0]:
# keep only the first sentence, punctuation included
cleaned_text = data[:start_pos] + sentences[0] # e.g. input: 经采购人允许，潜在投标人可进入项目现场进行考察，但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。
# result: 经采购人允许，潜在投标人可进入项目现场进行考察，但潜在投标人不得因此使采购人承担有关责任和蒙受损失。
else:
cleaned_text = data # no punctuation found: use the whole string
else:
# keyword not found: keep the original text
cleaned_text = data
all_texts1.append(cleaned_text) # collect the processed text
else:
print(text_list)
new_text_list=preprocess_text_list(text_list)
print(new_text_list)
pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
data = re.sub(pattern, '', new_text_list[0]).strip() # strip the leading numbering
# rejoin the modified first element with the remaining elements
new_text_list[0] = data # update the first element of the list
joined_text = "\n".join(new_text_list) # join multiple elements with newlines
all_texts2.append(joined_text) # collect each list's content into all_texts2
return all_texts1,all_texts2 # all_texts1 still needs an LLM pass; all_texts2 is returned as-is
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
"""递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。"""
sentences1 = [] # 保存没有后续关键词的情况
sentences2 = [] # 保存有后续关键词的情况
if isinstance(data, dict):
for value in data.values():
result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords)
sentences1.extend(result1)
sentences2.extend(result2)
elif isinstance(data, list):
for item in data:
result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords)
sentences1.extend(result1)
sentences2.extend(result2)
elif isinstance(data, str):
# split into sentences at punctuation and at numbering, keeping each sentence whole
split_sentences = re.split(r'(?<=[。！？\!\?])|(?=\d+[\\.])|(?=[(（]\d+[)）])', data) # also split before numbering
i = 0
while i < len(split_sentences):
sentence = split_sentences[i].strip()
if re.search(keywords, sentence, re.IGNORECASE):
follow_up_present = any(
re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords)
if follow_up_present:
# a follow-up keyword is present: capture from this sentence onward
start_index = i
end_index = start_index
found_next_section = False
for j in range(start_index + 1, len(split_sentences)):
if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()):
end_index = j
found_next_section = True
break
if found_next_section:
full_text = ' '.join(split_sentences[start_index:end_index]).strip()
else:
full_text = ' '.join(split_sentences[start_index:]).strip()
pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
data=re.sub(pattern,'',full_text)
sentences2.append(data) # store the follow-up case
i = end_index if found_next_section else len(split_sentences)
else:
pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
data = re.sub(pattern, '', sentence).replace('\n','').strip()
sentences1.append(data) # store the simple case
i += 1
else:
i += 1
return sentences1, sentences2 # return both lists
def extract_sentences_from_json(json_path, keywords,follow_up_keywords):
with open(json_path, 'r', encoding='utf-8') as file:
data = json.load(file)
"""从JSON数据中提取包含关键词的句子。"""
return find_sentences_with_keywords(data, keywords,follow_up_keywords)
#处理无效投标
def extract_values_if_contains(data, includes):
def parse_json_with_duplicates(json_string):
"""
Recursively check whether values in the dict contain any entry from the 'includes' list,
and if so, collect those values into a list and return it.
Parse a JSON string that may contain duplicate keys, storing all duplicated key-value pairs as lists.
Parameters:
data (dict): a dict, or data parsed from JSON.
includes (list): the keywords to check for.
Args:
json_string (str): the JSON string to parse.
Return:
list: the values that satisfy the condition.
Returns:
dict: the parsed dict; duplicated keys map to lists of values.
"""
included_values = [] # result list
# recursive helper for nested dicts
def recursive_search(current_data):
if isinstance(current_data, dict):
for key, value in current_data.items():
def custom_object_pairs_hook(pairs):
d = defaultdict(list)
for key, value in pairs:
# recurse into dict and list values
if isinstance(value, dict):
# dict value: recurse into it
recursive_search(value)
elif isinstance(value, str):
# string value: check whether it contains any keyword from includes
if any(include in value for include in includes):
included_values.append(value)
elif isinstance(current_data, list):
for item in current_data:
# list: recurse into every element
recursive_search(item)
value = process_dict(value)
elif isinstance(value, list):
value = process_list(value)
d[key].append(value)
# keys with multiple values become lists; single values stay scalar
return {key: (values if len(values) > 1 else values[0]) for key, values in d.items()}
# start the recursive search
recursive_search(data)
def process_dict(d):
"""
Recursively process a dict, ensuring values of duplicated keys become lists.
return included_values
Args:
d (dict): the dict to process.
Returns:
dict: the processed dict.
"""
return custom_object_pairs_hook(d.items())
#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。
#以上是原文内容,文本内的信息以'...............'分割请你根据该信息回答否决投标或拒绝投标或无效投标或使投标失效的情况有哪些文本中可能存在无关的信息请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。",
def process_list(l):
"""
Recursively process a list, ensuring every dict inside it is processed too.
def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path):
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'如\s*下']
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # dict result
# print(extracted_contents)
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # lists
all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
qianwen_txt = all_texts1 + all_tables1
# Proceed only if there is content to write
if qianwen_txt:
with open(output_file, 'w', encoding='utf-8') as file:
# initialize a counter
counter = 1
for content in qianwen_txt:
file.write("..............."+'\n')
# prefix each entry with its number, then a dot and a space, then the content
file.write(f"{counter}. {content}\n")
# increment the counter each iteration
counter += 1
file_id = upload_file(output_file)
qianwen_ans = qianwen_long(file_id, user_query)
selected_contents = set() # use a set to deduplicate
num_list = process_string_list(qianwen_ans)
print(num_list)
Args:
l (list): the list to process.
for index in num_list:
if index - 1 < len(qianwen_txt):
content = qianwen_txt[index - 1] # numbers are 1-based; convert to a 0-based index
selected_contents.add(content)
# also add the contents of all_texts2 and all_tables2 to the set
selected_contents.update(all_texts2)
selected_contents.update(all_tables2)
# convert the set back to a list for the result
res = {result_key: list(selected_contents)}
# serialize the result to a JSON string
# os.remove(output_file) # Remove the file after use
# print(f"Deleted temporary file: {output_file}")
else:
res = {result_key: ""} # Set the response to empty if no contents were extracted
return res
Returns:
list: the processed list.
"""
return [process_dict(item) if isinstance(item, dict) else item for item in l]
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path):
queries = [
(r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号若情况不存在返回[]。",
os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
(r'废\s*标',
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号若情况不存在返回[]。",
os.path.join(output_dir, "temp2.txt"), "废标项"),
(r'不\s*得|禁\s*止\s*投\s*标',"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,每条信息规定了各方不得存在的情形,请回答:在这些信息中,主语是投标人或中标人或供应商或联合体投标各方或磋商小组的信息有哪些?不要返回主语是招标人或采购人或评标委员会的信息,请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,示例返回为[1,4,6],若情况不存在,返回[]。",
os.path.join(output_dir,"temp3.txt"),"不得存在的情形")
]
results = []
return json.loads(json_string, object_pairs_hook=custom_object_pairs_hook)
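# Note: json.loads invokes object_pairs_hook with the ordered (key, value) pairs of every
# decoded object, which is what lets duplicate keys survive here instead of the usual
# last-one-wins behavior of a plain dict.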
# process the queries in parallel with a thread pool
with ThreadPoolExecutor() as executor:
futures = []
for keywords, user_query, output_file, result_key in queries:
future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords,
truncate_json_path)
futures.append(future)
time.sleep(1) # pause one second before submitting the next task
# example usage
input_string = '''
{
"商务评分": {
"综合实力": {
"评分": "5分",
"要求": "投标人具备有效期内的 ISO9001质量体系认证证书、具备有效期内的OHSA18001职业健康安全体系认证证书、具备有效期内的 IS014001环境管理体系认证证书、具备有效期内的 ISO20000信息技术服务体系认证证书、具备有效期内的 ISO27001信息安全体系认证证书全部满足得 5分每缺一项扣 1分开标时需提供原件"
},
"综合实力": {
"评分": "2分",
"要求": "投标人具备电子与智能化工程专业承包二级资质及以上证书得 2分不能够提供不得分开标时需提供原件"
},
"综合实力": {
"评分": "2分",
"要求": "投标人具有建筑机电安装工程专业承包三级资质或以上资质得 2分否则不得分。证书开标原件备查"
},
"综合实力": {
"评分": "3分",
"要求": "投标人需具有健全的信息技术运维服务能力通过ITSS信息技术服务运维标准符合性认证得 3分投标时需提供相关证书原件予以证明否则不得分。"
},
"综合实力": {
"评分": "2分",
"要求": "投标人具备 CCRC信息安全服务安全集成三级及以上证书得 2分不能够提供不得分开标时需提供原件"
},
"类似业绩": {
"评分": "4分",
"要求": "近三年(自投标截止时间前推 36个月以合同签订日期为准中标人作为独立承包人有已完成的类似业绩项目建设内容含应包含会议系统设备采购项目及改造每提供一份业绩得 2分最多可得 4分。业绩证明材料须提供中标公示截图、中标通知书、合同复印件,开标时备查,否则不得分。)"
},
"质量保证": {
"评分": "2分",
"要求": "投标人所投的MCU及视频会议终端设备产品如果不是自己生产的需提供制造商出具的授权及满足招标质保要求的售后服务承诺函提供得 2分开标时提供授权书及售后服务承诺函原件予以证明否则不得分。"
}
}
}
'''
for future in futures:
results.append(future.result())
# parse the JSON string that contains duplicate keys
parsed_data = parse_json_with_duplicates(input_string)
# # forbidden-to-bid cases
# print("starting不得存在的情形...")
# forbidden_res = find_forbidden(truncate_json_path, clause_path)
# results.append(forbidden_res)
combined_dict = {}
for d in results:
combined_dict.update(d)
print("无效标与废标done...")
# return nest_json_under_key(combined_dict, "无效标与废标项")
return {"无效标与废标项":combined_dict}
if __name__ == '__main__':
start_time = time.time()
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\磋商文件_tobidders_notice_part1\\truncate_output.json"
clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause磋商文件_tobidders_notice_part2.json"
output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid"
# doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件.docx'
results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path)
end_time = time.time()
print("Elapsed time:", str(end_time - start_time))
print("Results:", json.dumps(results,ensure_ascii=False,indent=4))
# print the result
print(json.dumps(parsed_data, ensure_ascii=False, indent=4))

View File

@@ -115,11 +115,35 @@ def combine_evaluation_standards(truncate2):
user_query_2 = (
"""
根据该文档中的评标办法前附表请你列出该文件的技术评分商务评分投标报价评审标准以及它们对应的具体评分要求请以json格式返回结果请在这三大块评分中分别用若干键值对表示具体要求其内层的键名为'评分''要求'若这三大块评分中存在其他信息则在相应评分大块中新增键名'备注'存放该信息键值为具体的要求否则不需要如果评分内容因素不是这3个则返回文档中给定的评分内容因素以及它们的具体评分要求不要回答有关形式资格响应性评审标准的内容若存在未知信息填充'未知'以下为示例输出
{
"技术评分": {
"主要监理岗位的职责": {
"评分": "4分",
"要求": "1、总监理工程师的职责全面、清晰、合理得 1.2-2分一般的1.2分。2、其他主要监理人员及岗位的职责全面、清晰、合理得 1.2-2分一般的 1.2分。"
}
},
"商务评分": {
"控制系统内主板": {
"评分": "10分",
"要求": "所投电梯控制系统内主板为制造商原厂原品牌制造生产且为进口部件得 10分。提供进口部件报关单及原产地证明扫描件加盖公章否则不得分"
},
"制造商技术实力": [
{
"评分": "3分",
"要求": "一级证书得3分二级证书得1分其他不得分"
},
{
"评分": "2分",
"要求": "行业销量排名连续前 2 名,得 2 分,第 4-6 名得 0.5 分,其他不得分"
}
]
},
"投标报价评审": {
"投标报价是否出现违反计价规范": {
"评分": "合格/不合格",
"要求": "A:投标报价未违反计价规范的评审意见为“合格”B投标报价违反计价规范的评审意见为“不合格”"
}
}
}
"""
)
@@ -133,7 +157,7 @@ def combine_evaluation_standards(truncate2):
# return evaluation_combined_res
return update_json # merged business and technical scoring
if __name__ == "__main__":
truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1_evaluation_method.pdf"
truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标01_evaluation_method.pdf"
evaluation_standards_res=combine_evaluation_standards(truncate2)
# extract "商务标" and "技术标" from the result
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}

View File

@@ -160,7 +160,9 @@ def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
while not result_queue.empty():
index, result = result_queue.get()
results[index] = result
# check whether every result is None
if all(result is None for result in results):
return []
# guarantee a list is returned
return results
@@ -194,9 +196,9 @@ if __name__ == "__main__":
# for question, response in results:
# print(f"Question: {question}")
# print(f"Response: {response}")
ques=["该招标文件的工程名称项目名称招标编号是招标人是招标代理机构是请按json格式给我提供信息键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'","该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'"]
results = multi_threading(ques, "招标解析5word")
ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的请按json格式给我提供信息键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"]
# ques=["该招标文件的工程名称项目名称招标编号是招标人是招标代理机构是请按json格式给我提供信息键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
results = multi_threading(ques, "6.2视频会议docx")
if not results:
print("errror!")
else:

View File

@@ -265,8 +265,8 @@ def truncate_pdf_multiple(input_path, output_folder):
if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标02.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\test4"
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件"
truncate_pdf_multiple(input_path,output_folder)
# selection = 3 # e.g. 1 - bidder-notice pre-table, 2 - evaluation method, 3 - bidder-notice body, 4 - qualification review conditions, 5 - tender announcement, 6 - invalid bids
# generated_files = truncate_pdf_main(input_path, output_folder, selection)

View File

@@ -256,7 +256,7 @@ def extract_from_notice(clause_path, type):
data = json.load(file)
extracted_data = extract_json(data, target_values) # read the json
# print(json.dumps(extracted_data,ensure_ascii=False,indent=4))
sorted_data = sort_clean_data_keys(extracted_data) # sort the keys
sorted_data = sort_clean_data_keys(extracted_data) # preprocess and sort the keys of the input dict data
transformed_data = transform_json(sorted_data)
final_result = process_nested_data(transformed_data)
return final_result

View File

@@ -27,7 +27,6 @@ def get_global_logger(unique_id):
logger=None
# potential issue: PDF-to-docx conversion can drop check-mark symbols
# create the global thread pool
executor = ThreadPoolExecutor()
def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
@@ -64,7 +63,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
# return early without waiting for future_knowledge to finish; the result contains Future objects
return {
'input_file_path': downloaded_file_path,
'file_path': downloaded_file_path,
'output_folder': output_folder,
'truncate0': truncate0,
'truncate1': truncate1,
@@ -181,7 +180,7 @@ def fetch_bid_opening(clause_path):
# 'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
# processed_data['truncate3'],
# processed_data['knowledge_name'], processed_data['truncate0_jsonpath'],
# processed_data['clause_path'],processed_data['input_file_path'],processed_data['output_folder']),
# processed_data['clause_path'],processed_data['file_path'],processed_data['output_folder']),
# 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
# output_folder, processed_data['truncate0_jsonpath'],
@@ -216,7 +215,7 @@ def fetch_bid_opening(clause_path):
# return in segments
def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id):
global logger
logger = get_global_logger(unique_id)
@@ -271,7 +270,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
processed_data['truncate3'], knowledge_name,
processed_data['truncate0_jsonpath'],
processed_data['clause_path'], processed_data['input_file_path'],
processed_data['clause_path'], processed_data['file_path'],
processed_data['output_folder']),
}

View File

@@ -1,14 +1,59 @@
def test_append_newline():
def check_append_newline(key):
append_newline = len(key.split('.')) == 2
return append_newline
import re
# test cases
test_cases = ["1.1", "1."]
for case in test_cases:
result = check_append_newline(case)
print(f"序号 '{case}': append_newline = {result}")
def generate_questions(input_list):
template = (
"关于'{key}',{value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
)
# run the test
test_append_newline()
questions = []
for input_dict in input_list:
for key, value in input_dict.items():
processed_value = preprocess_value(value)
question = template.format(key=key, value=processed_value)
questions.append(question)
return questions
def preprocess_value(value):
# use a regex to find "第X章" (chapter X) or "第X款" (clause X)
chapter_match = re.search(r'第(.+?)章', value)
clause_match = re.search(r'第(.+?)款', value)
if chapter_match or clause_match:
# split on commas, periods, question marks and exclamation marks (full- and half-width)
separators = r'[，。？！,\?!]'
# a character only acts as a separator when all parentheses seen so far are balanced
def is_separator(ch, count):
return count['('] == count[')'] and count['（'] == count['）'] and re.match(separators, ch)
parts = []
current_part = []
count = {'(': 0, ')': 0, '（': 0, '）': 0}
for ch in value:
if ch in count:
count[ch] += 1
if is_separator(ch, count):
parts.append("".join(current_part).strip())
current_part = []
else:
current_part.append(ch)
if current_part:
parts.append("".join(current_part).strip())
# find the part that mentions the chapter or clause
target_part = next((part for part in parts if '章' in part or '款' in part), None)
if target_part:
# strip a leading "符合" or "应满足"
target_part = re.sub(r'^(符合|应满足)\s*', '', target_part.strip())
return target_part
# no specific chapter or clause found: return the original value
return value
input_list=[{'资格性审查标准.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}]
res=generate_questions(input_list)
print(res)

View File

@@ -0,0 +1,2 @@
def combine_basic_info():
return True

View File

@@ -2,7 +2,6 @@ import json
import re
# extract the content between two top-level headings
def extract_between_sections(data, target_values):
target_found = False
extracted_data = {}
@@ -49,25 +48,6 @@ def process_with_outer_key(data):
processed_data[outer_key] = processed_inner_data
return processed_data
def sort_clean_data_keys(data):
# preprocessing: strip whitespace from key names
def preprocess_key(key):
return re.sub(r'\s+', '', key)
# turn each key into a tuple of integers to use as the sort key
def key_func(key):
return tuple(int(part) for part in re.split(r'\D+', key) if part)
# build a new dict whose keys have been preprocessed
preprocessed_data = {preprocess_key(key): value for key, value in data.items()}
# sort the preprocessed keys
sorted_keys = sorted(preprocessed_data.keys(), key=key_func)
# rebuild the dict in sorted-key order
sorted_data = {key: preprocessed_data[key] for key in sorted_keys}
return sorted_data
# transform the structured JSON data
def transform_json(data):
@@ -230,8 +210,6 @@ def extract_from_notice(clause_path, type):
data = json.load(file)
extracted_data = extract_between_sections(data, target_values) # read the json
transformed_data=process_with_outer_key(extracted_data)
# sorted_data=sort_clean_data_keys(extracted_data) # sort the keys
# transformed_data = transform_json(extracted_data)
final_result=process_nested_data(transformed_data)
return final_result

View File

@@ -32,18 +32,15 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
# find the end position on the last page
if i == len(pdf_document.pages) - 1:
for pattern in end_pattern:
matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
break
all_pages_text.append(cleaned_text)
# merge the text of all pages
full_text = "\n".join(all_pages_text)
# print(full_text)
return full_text
# fitz (PyMuPDF) version
@@ -246,6 +243,84 @@ def parse_text_by_heading(text):
return data
# when type=2, use this logic to extract Chapter 1 (the tender announcement) of a goods tender
def parse_text_to_dict(text):
"""
解析文本根据大标题划分内容生成字典
参数:
text (str): 要解析的文本
返回:
dict: 大标题作为键内容作为值的字典
"""
# 正则表达式模式:匹配以一至十的汉字数字开头,后跟顿号和任意字符,且位于行首
pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE)
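# e.g. this matches headings such as "一、项目基本情况" or "二 、供应商资格要求"
# (illustrative titles; any line starting with a Chinese numeral plus 、 qualifies)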
# locate every top-level heading with re.finditer
matches = list(pattern.finditer(text))
result = {}
for i, match in enumerate(matches):
title = match.group(1).strip() # the heading text
start = match.end() # content starts here
if i + 1 < len(matches):
end = matches[i + 1].start() # up to the next heading
else:
end = len(text) # last heading: content runs to the end of the text
content = text[start:end].strip() # slice the content and trim it
# normalize newlines and trim each line
content = content.replace('\r\n', '\n') # unify newlines
content = re.sub(r'[ \t]+\n', '\n', content) # strip trailing whitespace on each line
content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE) # strip leading and trailing whitespace per line
content = clean_content(content) # handle newlines inside the content
result[title] = content
return result
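# Sketch of the resulting shape (hypothetical content):
# {"一、项目基本情况": "项目编号:...", "二、供应商资格要求": "..."}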
def clean_content(content):
"""
处理内容中的换行符
- 保留在子项编号前的换行符
- 保留在冒号 ':' 或全角冒号 '' 前的第一个换行符
- 移除其他位置的换行符不留下额外的空格
参数:
content (str): 要处理的内容字符串
返回:
str: 处理后的内容字符串
"""
# sub-item numbering patterns, covering:
# - digit + dot + digit (e.g. 1.1 or 1．1)
# - digit + 、 (e.g. 2、)
# - dot + digit (e.g. .3 or ．3)
# - digit + right parenthesis (e.g. 1) or 1）)
# - digits wrapped in parentheses (e.g. (5))
# - digits wrapped in full-width parentheses (e.g. （5）)
# - digit + dot (e.g. 1. or 1．)
numbering_pattern = r'(?:\d+[.．]\d+(?:[.．]\d+)*|\d+、|[.．]\d+|\d+[)）]|\(\d+\)|（\d+）|\d+[.．])'
# newlines to keep:
# 1. the newline is followed by a sub-item number
# 2. the newline is followed by non-whitespace text ending in ':' or '：'
pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^：:\n\r\f\v]+[：:]))'
# placeholder used to temporarily protect the newlines we keep
placeholder = "___PLACEHOLDER___"
# Step 1: replace the protected newlines with the placeholder
content_with_placeholder = re.sub(pattern_keep, placeholder, content)
# Step 2: drop every remaining newline
content_no_newlines = content_with_placeholder.replace('\n', '')
# Step 3: turn the placeholder back into newlines
cleaned_content = content_no_newlines.replace(placeholder, '\n')
return cleaned_content
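# Illustrative before/after (hypothetical): "服务要求:\n1、按时交付\n所有货物\n2、质保一年"
# becomes "服务要求:\n1、按时交付所有货物\n2、质保一年": the newlines before the sub-item
# numbers survive, while the newline inside the sentence is removed.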
# def convert_to_json(file_path, start_word, end_phrases):
# if file_path.endswith('.docx'):
# text = extract_text_from_docx(file_path)
@@ -263,19 +338,21 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
return ""
if type == 1:
start_word = r'^\s*[(]?\s*[一1]\s*[)]?\s*[、.]*\s*(说\s*明|总\s*则)'
end_pattern = [r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+']
else:
start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'
end_pattern=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
text = extract_text_from_pdf(file_path, start_word, end_pattern)
result = parse_text_by_heading(text)
else:
start_word = r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*'
end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
text = extract_text_from_pdf(file_path, start_word, end_pattern)
result=parse_text_to_dict(text)
# result = convert_to_json(input_path, start_word, end_pattern)
# create the output folder if it does not exist
if not os.path.exists(output_folder):
os.makedirs(output_folder)
print(f"Created output folder: {output_folder}")
# file_name = "clause1.json" if type == 1 else "clause2.json"
file_name = f"clause{suffix_counter}.json"
file_name = "clause1.json" if type == 1 else "clause2.json"
# file_name = f"clause{suffix_counter}.json"
output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False)
@@ -303,18 +380,18 @@ def process_folder(input_folder, output_folder):
# TODO: top-level '一' headings should rank above '1.1' numbering; separate the two logics
# TODO: folder 911904c3-4d6e-47e0-acd8-b05e582209cb fails at runtime; Bailian (百炼) errored
if __name__ == "__main__":
# # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\竞争性谈判文件(3)_tobidders_notice_part2.pdf'
# # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2'
# try:
# output_path = convert_clause_to_json(file_path,output_folder)
# print(f"Final JSON result saved to: {output_path}")
# except ValueError as e:
# print("Error:", e)
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output5\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件_notice.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output5\\tmp2'
try:
output_path = convert_clause_to_json(file_path,output_folder,2)
print(f"Final JSON result saved to: {output_path}")
except ValueError as e:
print("Error:", e)
input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
# call process_folder to handle every file in the folder
process_folder(input_folder, output_folder)
# input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
#
# # call process_folder to handle every file in the folder
# process_folder(input_folder, output_folder)

View File

@@ -31,7 +31,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
match = re.search(pattern, text, re.IGNORECASE)
if match:
# if the current pattern is '不\s*得', additionally check that the match is not part of '不得分'
if pattern == r'不\s*得':
if '不\s*得' in pattern: # check the pattern by substring match instead of equality
post_match_text = text[match.end():].strip()
if post_match_text.startswith("分"):
continue # "不得分" is scoring language, not a prohibition; skip it
@@ -259,6 +259,12 @@ def extract_table_with_keywords(data, keywords, follow_up_keywords):
while i < len(split_sentences):
sentence = split_sentences[i].strip()
if re.search(keywords, sentence, re.IGNORECASE):
# special-case "不\s*得": skip sentences that actually read "不得分"
if re.search(r'不\s*得', sentence, re.IGNORECASE):
post_match_text = sentence[re.search(r'不\s*得', sentence).end():].strip()
if post_match_text.startswith("分"):
i += 1 # skip this sentence
continue
follow_up_present = any(
re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords
)
@@ -334,13 +340,13 @@ def extract_values_if_contains(data, includes):
#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。
#以上是原文内容,文本内的信息以'...............'分割请你根据该信息回答否决投标或拒绝投标或无效投标或使投标失效的情况有哪些文本中可能存在无关的信息请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。",
def handle_query(file_path, user_query, output_file, result_key, keywords,truncate_file):
def handle_query(file_path, user_query, output_file, result_key, keywords):
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:","我方"]
follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'如\s*下']
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # dict result
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # lists
table_data_list=read_docx_last_column(truncate_file) # extract rows from the bidder-notice pre-table; each list element is one row
# table_data_list=read_tables_from_docx(file_path)
# table_data_list=read_docx_last_column(truncate_file) # extract rows from the bidder-notice pre-table; each list element is one row
table_data_list=read_tables_from_docx(file_path)
all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords,follow_up_keywords)
qianwen_txt = all_texts1 + all_tables1
# Proceed only if there is content to write
@@ -376,7 +382,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords,trunca
return res
def combine_find_invalid(file_path, output_dir,truncate_file):
def combine_find_invalid(file_path, output_dir):
queries = [
(r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号若情况不存在返回[]。",
@@ -393,7 +399,7 @@ def combine_find_invalid(file_path, output_dir,truncate_file):
with ThreadPoolExecutor() as executor:
futures = []
for keywords, user_query, output_file, result_key in queries:
future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords,truncate_file)
future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords)
futures.append(future)
time.sleep(1) # pause one second before submitting the next task
@@ -416,12 +422,12 @@ def combine_find_invalid(file_path, output_dir,truncate_file):
if __name__ == '__main__':
start_time = time.time()
# truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件实高电子显示屏_tobidders_notice_part1.docx"
clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause招标文件实高电子显示屏_tobidders_notice_part2.json"
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件实高电子显示屏_tobidders_notice_part1.docx"
clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause招标文件广水市教育局封闭管理项目二次_tobidders_notice_part2.json"
output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid"
# doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(实高电子显示屏.docx'
results = combine_find_invalid(doc_path, output_dir,truncate_file)
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(广水市教育局封闭管理项目二次.docx'
results = combine_find_invalid(doc_path, output_dir)
end_time = time.time()
print("Elapsed time:", str(end_time - start_time))
print("Results:", json.dumps(results,ensure_ascii=False,indent=4))

View File

@@ -1,89 +0,0 @@
# -*- encoding:utf-8 -*-
import json
from flask_app.main.通义千问long import upload_file, qianwen_long
from flask_app.main.json_utils import clean_json_string
def combine_technical_and_business(data, target_values):
extracted_data = {} # the root level stores all the data
technical_found = False
business_found = False
def extract_nested(data, parent_key='', is_technical=False, is_business=False):
nonlocal technical_found, business_found
if isinstance(data, dict):
for key, value in data.items():
current_key = f"{parent_key}.{key}" if parent_key else key
# check whether this is technical-bid content
if any(target in key for target in target_values):
if not is_technical:
extracted_data[key] = value
technical_found = True
continue
# everything else defaults to the business bid
else:
if not is_business:
if '商务标' not in extracted_data:
extracted_data['商务标'] = {}
extracted_data['商务标'][key] = value
business_found = True
continue
if isinstance(value, dict) or isinstance(value, list):
extract_nested(value, current_key, is_technical, is_business)
elif isinstance(data, list):
for index, item in enumerate(data):
extract_nested(item, f"{parent_key}[{index}]", is_technical, is_business)
extract_nested(data)
if not technical_found:
extracted_data['技术标'] = ''
if not business_found:
extracted_data['商务标'] = ''
return extracted_data
# if the outer key is simply '评分因素', this function extracts the content inside it
def process_data_based_on_key(data, word):
# list the dict keys
keys = list(data.keys())
# check for exactly one key that contains the given word
if len(keys) == 1 and word in keys[0]:
# return the inner dict
return data[keys[0]]
# otherwise return the original dict
return data
def get_evaluation_standards(truncate_file):
file_id = upload_file(truncate_file)
user_query = "根据该文档中的评标办法前附表或者评分标准表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,外层键名分别为'技术标','商务标','投标报价'。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果如果该采购活动有多个包则最外层键名为对应的包名。请不要回答有关资格审查的内容"
evaluation_res = qianwen_long(file_id, user_query)
cleaned_evaluation_res = clean_json_string(evaluation_res)
result_data = process_data_based_on_key(cleaned_evaluation_res, '评分')
include = ['一包', '二包', '三包', '四包', '五包']
target_values = ['技术', '设计', '实施']
updated_jsons = {}
# check whether any outer key matches the include list
if any(key for key in result_data if any(included in key for included in include)):
# matching packages found: process them
for key in result_data:
if any(item in key for item in include):
inner_dict = result_data[key]
updated_jsons[key] = combine_technical_and_business(inner_dict, target_values)
else:
# no match: run over the whole dict
updated_jsons = combine_technical_and_business(result_data, target_values)
# serialize updated_jsons to JSON
evaluation_combined_res = json.dumps(updated_jsons, ensure_ascii=False, indent=4)
return evaluation_combined_res
if __name__ == "__main__":
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\竞争性谈判文件(3)_evaluation_method.pdf"
res=get_evaluation_standards(truncate_file)
print(res)

View File

@@ -0,0 +1,229 @@
# -*- encoding:utf-8 -*-
import json
import re
from collections import defaultdict
from flask_app.main.通义千问long import upload_file, qianwen_long
def combine_technical_and_business(data, target_values):
extracted_data = {} # the root level stores all the data
technical_found = False
business_found = False
def extract_nested(data, parent_key='', is_technical=False, is_business=False):
nonlocal technical_found, business_found
if isinstance(data, dict):
for key, value in data.items():
current_key = f"{parent_key}.{key}" if parent_key else key
# check whether this is technical-bid content
if any(target in key for target in target_values):
if not is_technical:
extracted_data[key] = value
technical_found = True
continue
# everything else defaults to the business bid
else:
if not is_business:
if '商务标' not in extracted_data:
extracted_data['商务标'] = {}
extracted_data['商务标'][key] = value
business_found = True
continue
if isinstance(value, dict) or isinstance(value, list):
extract_nested(value, current_key, is_technical, is_business)
elif isinstance(data, list):
for index, item in enumerate(data):
extract_nested(item, f"{parent_key}[{index}]", is_technical, is_business)
extract_nested(data)
if not technical_found:
extracted_data['技术标'] = ''
if not business_found:
extracted_data['商务标'] = ''
return extracted_data
# guard against the outer level being a single key such as '一包'
def process_data_based_on_key(data):
exclude_word = ["包", "未知", "评分因素"]
# list the dict keys
keys = list(data.keys())
# check for exactly one key containing any word from exclude_word
if len(keys) == 1 and any(word in keys[0] for word in exclude_word):
# return the inner dict
return data[keys[0]]
# otherwise return the original dict
return data
def parse_json_with_duplicates(raw_string):
"""
Parse a JSON string that may contain duplicate keys, storing all duplicated key-value pairs as lists.
Args:
json_string (str): the JSON string to parse.
Returns:
dict: the parsed dict; duplicated keys map to lists of values.
e.g. input: "综合实力": {
"评分": "2分",
"要求": "投标人具备电子与智能化工程专业承包二级资质及以上证书得 2分不能够提供不得分开标时需提供原件"
},
"综合实力": {
"评分": "2分",
"要求": "投标人具有建筑机电安装工程专业承包三级资质或以上资质得 2分否则不得分。证书开标原件备查"
}
output: "综合实力": [
{
"评分": "2分",
"要求": "投标人具备电子与智能化工程专业承包二级资质及以上证书得 2分不能够提供不得分开标时需提供原件"
},
{
"评分": "2分",
"要求": "投标人具有建筑机电安装工程专业承包三级资质或以上资质得 2分否则不得分。证书开标原件备查"
}]
"""
def custom_object_pairs_hook(pairs):
d = defaultdict(list)
for key, value in pairs:
# recurse into dict and list values
if isinstance(value, dict):
value = process_dict(value)
elif isinstance(value, list):
value = process_list(value)
d[key].append(value)
# keys with multiple values become lists; single values stay scalar
return {key: (values if len(values) > 1 else values[0]) for key, values in d.items()}
def process_dict(d):
"""
Recursively process a dict, ensuring values of duplicated keys become lists.
Args:
d (dict): the dict to process.
Returns:
dict: the processed dict.
"""
return custom_object_pairs_hook(d.items())
def process_list(l):
"""
Recursively process a list, ensuring every dict inside it is processed too.
Args:
l (list): the list to process.
Returns:
list: the processed list.
"""
return [process_dict(item) if isinstance(item, dict) else item for item in l]
"""输入字符串,提取 { 和 } 之间的内容,并将其解析为字典"""
if not raw_string.strip():
return {}
match = re.search(r'\{[\s\S]*\}', raw_string)
if match:
try:
json_string = match.group(0)
return json.loads(json_string, object_pairs_hook=custom_object_pairs_hook)
except json.JSONDecodeError as e:
print(f"json_utils: extract_content_from_json: JSON decode error: {e}")
return {}
else:
print("json_utils: extract_content_from_json: No valid JSON content found.")
return {}
def combine_evaluation_standards(truncate_file):
file_id = upload_file(truncate_file)
user_query1 = "根据该文档,你判断它是否有具体的关于技术评分或商务评分或投标报价的评分要求,如果有,返回'',否则返回''" # 应对竞争性谈判这种无评分要求的情况
judge_res = qianwen_long(file_id, user_query1)
# 默认 judge 为 True
judge = True
# 检查 judge_res 的内容
if '' in judge_res:
judge = False
if judge:
# run the main user_query logic
# user_query = "根据该文档中的评标办法前附表或者评分标准表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,外层键名分别为'技术评分','商务评分','投标报价'。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果如果该采购活动有多个包则最外层键名为对应的包名。请不要回答有关资格审查的内容"
user_query = (
"""
根据该文档中的评标办法前附表请你列出该文件的技术评分商务评分投标报价评审以及它们对应的具体评分要求请以json格式返回结果请在这三大块评分中分别用若干键值对表示具体要求请精确到具体的评审项内层的键名为'评分''要求'若这三大块评分中存在其他信息则在相应评分大块中新增键名'备注'存放该信息键值为具体的要求否则不需要如果评分内容因素不是这3个则返回文档中给定的评分内容因素以及它们的具体评分要求如果该采购活动有多个包则最外层键名为对应的包名,否则不需要不要回答有关资格审查的内容若存在未知信息填充'未知'以下为示例输出
{
"一包": {
"技术评分": {
"主要监理岗位的职责": {
"评分": "4分",
"要求": "1、总监理工程师的职责全面、清晰、合理得 1.2-2分一般的1.2分。2、其他主要监理人员及岗位的职责全面、清晰、合理得 1.2-2分一般的 1.2分。"
}
},
"商务评分": {
"控制系统内主板": {
"评分": "10分",
"要求": "所投电梯控制系统内主板为制造商原厂原品牌制造生产且为进口部件得 10分。提供进口部件报关单及原产地证明扫描件加盖公章否则不得分"
},
"制造商技术实力": [
{
"评分": "3分",
"要求": "一级证书得3分二级证书得1分其他不得分"
},
{
"评分": "2分",
"要求": "行业销量排名连续前 2 名,得 2 分,第 4-6 名得 0.5 分,其他不得分"
}
]
},
"投标报价评审": {
"投标报价是否出现违反计价规范": {
"评分": "合格/不合格",
"要求": "A:投标报价未违反计价规范的评审意见为“合格”B投标报价违反计价规范的评审意见为“不合格”"
}
}
}
}
"""
)
# run the second query
evaluation_res = qianwen_long(file_id, user_query)
cleaned_evaluation_res = parse_json_with_duplicates(evaluation_res)
result_data = process_data_based_on_key(cleaned_evaluation_res)
include = ['一包', '二包', '三包', '四包', '五包']
target_values = ['技术', '设计', '实施']
updated_jsons = {}
# check whether any outer key matches the include list
if any(key for key in result_data if any(included in key for included in include)):
# matching packages found: process them
for key in result_data:
if any(item in key for item in include):
inner_dict = result_data[key]
updated_jsons[key] = combine_technical_and_business(inner_dict, target_values)
else:
# no match: run over the whole dict
updated_jsons = combine_technical_and_business(result_data, target_values)
return updated_jsons
else:
# judge is False: return the default 技术标/商务标 structure directly
result_data = {}
result_data['技术标'] = ''
result_data['商务标'] = ''
return result_data
if __name__ == "__main__":
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件_evaluation_method.pdf"
# truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件统计局智能终端二次招标_evaluation_method.pdf"
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新W改_evaluation_method.pdf"
truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\622二次视频会议磋商文件_evaluation_method.pdf"
res = combine_evaluation_standards(truncate_file)
print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@@ -196,7 +196,9 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
return generated_files
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,output_suffix="normal"):
# Default logic: once start_page matches it is never reset. begin_page=5 usually keeps the table of contents from matching, but when matching '第一章 招标公告', start_page can still wrongly hit the TOC.
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
output_suffix="normal"):
start_page = None
end_page = None
for i, page in enumerate(pdf_document.pages):
@@ -208,8 +210,13 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
else:
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
continue
if output_suffix == "notice":
if re.search(begin_pattern, cleaned_text) and i > begin_page:
start_page = i
else:
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
start_page = i
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
end_page = i
break
@@ -237,7 +244,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
# the original handling stays unchanged
if output_suffix == "qualification1":
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern)
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern,output_suffix)
if start_page is None or end_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
@@ -260,7 +267,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, beg
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
start_page = i
if start_page is not None and mid_page is None and re.search(
r'^\s*[(]?\s*[一1]\s*[)]?\s*[、.]*\s*(说\s*明|总\s*则)', cleaned_text, re.MULTILINE):
r'^\s*[(]?\s*[一1]\s*[)]?\s*[、.]*\s*(说\s*明|总\s*则)', cleaned_text, re.MULTILINE):
mid_page = i
if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
end_page = i
@@ -300,6 +307,15 @@ def get_patterns_for_qualification():
return begin_pattern_new, end_pattern_new
def get_patterns_for_notice():
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE
)
end_pattern = re.compile(
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', re.MULTILINE
)
return begin_pattern, end_pattern
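# A quick sanity check for get_patterns_for_notice (the sample headings are
# invented but follow the heading styles the patterns above target):
def _notice_patterns_demo():
    begin_pattern, end_pattern = get_patterns_for_notice()
    assert begin_pattern.search("第一章 招标公告")
    assert begin_pattern.search("第一部分 投标邀请书")
    assert end_pattern.search("第二章 投标人须知")
    assert end_pattern.search("一、供应商须知前附表")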
def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header): # second-pass extraction of the bidder-notice front table / main body
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知前附表)+'
@@ -331,17 +347,23 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
pdf_document = PdfReader(pdf_path)
patterns = None
begin_page=0
if output_suffix == "procurement":
patterns = [get_patterns_for_procurement()]
begin_page=5
elif output_suffix == "evaluation_method" or output_suffix=="qualification2" or output_suffix=="qualification3":
patterns = [get_patterns_for_evaluation_method()]
begin_page = 5
elif output_suffix == "qualification1":
patterns = [get_patterns_for_qualification()] # returns one (begin, end) pattern pair
begin_page = 5
elif output_suffix == "notice":
patterns=[get_patterns_for_notice()]
begin_page = 0
# Try each set of patterns until a valid range is found
for pattern_pair in patterns:
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], 5, common_header,
exclusion_pattern)
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page, common_header,
exclusion_pattern,output_suffix)
if start_page is not None and end_page is not None:
break
if start_page is None or end_page is None:
@@ -404,14 +426,23 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
elif selection ==4: # bidder-notice front table and main body
begin_page=1
begin_pattern = re.compile(
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', re.MULTILINE
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)', re.MULTILINE
)
end_pattern=re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
)
local_output_suffix = "tobidders_notice"
elif selection==5:
begin_page=0
begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*'
)
end_pattern = re.compile(
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人|磋商|供应商)须知前附表)'
)
local_output_suffix = "notice"
else:
print("无效的选择")
print("无效的选择:请选择1-6")
return None
# If the incoming output_suffix is 'default', use the locally generated output_suffix
@@ -425,7 +456,7 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
def truncate_pdf_multiple(input_path, output_folder):
truncate_files = []
for selection in range(1, 5):
for selection in range(1, 6):
files = truncate_pdf_main(input_path, output_folder, selection)
truncate_files.extend(files)
return truncate_files
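# A hedged usage sketch (the paths are hypothetical). Selection 4 yields two files
# (front table part1 + main body part2), so the loop above produces six paths in
# total, indexed the way preprocess_files consumes them in the next file:
def _truncate_multiple_demo():
    files = truncate_pdf_multiple("C:\\tenders\\sample.pdf", "C:\\tenders\\out")
    labels = ["procurement", "evaluation_method", "qualification",
              "tobidders_notice_part1", "tobidders_notice_part2", "notice"]
    for idx, label in enumerate(labels):
        print(idx, label, files[idx])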
@@ -434,7 +465,8 @@ def truncate_pdf_multiple(input_path, output_folder):
# TODO: the 交通智能系统 and 招标(1)(1) files are problematic; the exclusion pattern misbehaves when selection=4
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
# truncate_pdf_multiple(input_path,output_folder)
selection = 4 # e.g. 1 - business/technical service requirements, 2 - evaluation method, 3 - qualification review (suffixes qualification1 and qualification2), 4 - bidder-notice front table
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\outputtest"
# files=truncate_pdf_multiple(input_path,output_folder)
# print(files)
selection = 1 # e.g. 1 - business/technical service requirements, 2 - evaluation method, 3 - qualification review (suffix qualification1 or qualification2, consistent with the evaluation method), 4 - bidder-notice front table part1 / main body part2, 5 - tender notice
generated_files = truncate_pdf_main(input_path, output_folder, selection)

View File

@@ -1,13 +1,182 @@
# Procurement methods covered: competitive consultation (竞磋), competitive negotiation (竞谈), consultation (磋商), price inquiry (询价), invitation (邀请), single source (单一来源)
import json
from flask_app.main.format_change import docx2pdf, pdf2docx
from flask_app.货物标.货物标截取pdf import truncate_pdf_main
def main_processing(output_folder,file_path,file_type, unique_id):
if file_type==1:
from flask_app.main.json_utils import transform_json_values
from flask_app.货物标.基础信息解析main import combine_basic_info
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
from flask_app.货物标.货物标截取pdf import truncate_pdf_multiple
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
from flask_app.货物标.无效标和废标和禁止投标整合货物标版 import combine_find_invalid
from flask_app.货物标.资格审查main import combine_qualification_review
from flask_app.货物标.评分标准提取main import combine_evaluation_standards
import logging
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # fall back to the default logger
logger = logging.getLogger(unique_id)
return logger
logger=None
# Create a global thread pool
executor = ThreadPoolExecutor()
def preprocess_files(output_folder, file_path, file_type, unique_id):
logger.info("starting 文件预处理...")
logger.info("output_folder..." + output_folder)
# Normalize the file paths according to the file type
if file_type == 1: # docx
docx_path = file_path
pdf_path = docx2pdf(file_path)
elif file_type==2:
pdf_path = docx2pdf(docx_path) # convert the docx to pdf for downstream processing
elif file_type == 2: # pdf
pdf_path = file_path
docx_path=pdf2docx(file_path)
docx_path = pdf2docx(pdf_path) # convert the pdf to docx for the knowledge-base upload
else:
print("未传入指定格式的文件!")
logger.error("Unsupported file type provided. Preprocessing halted.")
return None
truncate_file=truncate_pdf_main(pdf_path,output_folder,1)
# Upload the docx to the knowledge base asynchronously
future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id)
# Run the PDF truncation passes
truncate_files = truncate_pdf_multiple(pdf_path, output_folder) # index: 0->procurement requirements 1->evaluation method 2->qualification review 3->bidder-notice front table 4->bidder-notice main body 5->tender notice
# Individual sections
invalid_docpath = docx_path # docx used for invalid-bid extraction
procurement_path = truncate_files[0] # procurement / technical service requirements
evaluation_method_path = truncate_files[1] # evaluation method
qualification_path=truncate_files[2] # qualification review
tobidders_notice_path = truncate_files[4] # bidder-notice main body
notice_path=truncate_files[5] # tender notice
clause_path = convert_clause_to_json(tobidders_notice_path, output_folder) # bidder-notice clauses: pdf -> json
logger.info("文件预处理done")
# Return early without waiting for future_knowledge to finish; the dict carries the Future itself
return {
'file_path': file_path,
'output_folder': output_folder,
'procurement_path': procurement_path,
'evaluation_method_path': evaluation_method_path,
'qualification_path': qualification_path,
'notice_path':notice_path,
'knowledge_future': future_knowledge, # 返回 Future 对象
'clause_path': clause_path,
'invalid_docpath': invalid_docpath
}
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # bidder-notice front table
logger.info("starting基础信息...")
basic_res = combine_basic_info() # NOTE: the parameters above are not yet passed through to combine_basic_info
logger.info("基础信息done")
return basic_res
def fetch_qualification_review(output_folder,qualification_path,notice_path,knowledge_name): # qualification review
logger.info("starting资格审查...")
review_standards_res = combine_qualification_review(output_folder,qualification_path,notice_path, knowledge_name)
logger.info("资格审查done")
return review_standards_res
def fetch_evaluation_standards(evaluation_method_path): # evaluation criteria
logger.info("starting 商务评分和技术评分...")
# Dict result parsed from the evaluation-method front table
evaluation_standards_res = combine_evaluation_standards(evaluation_method_path)
# Split out the technical and commercial scoring
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
logger.info("商务评分和技术评分done")
# Return the technical and commercial scores wrapped under new keys
return {
"technical_standards": technical_standards,
"commercial_standards": commercial_standards
}
# TODO: support .doc conversion
def fetch_invalid_requirements(invalid_docpath, output_folder):
# Invalid/void-bid requirements, extracted via Qianwen
logger.info("starting无效标与废标...")
find_invalid_res = combine_find_invalid(invalid_docpath, output_folder)
logger.info("无效标与废标done...")
return find_invalid_res
# Bid-document requirements
def fetch_bidding_documents_requirements(clause_path):
logger.info("starting投标文件要求...")
fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
logger.info("投标文件要求done...")
return {"投标文件要求":fetch_bidding_documents_requirements_json}
# Bid opening / evaluation / award workflow
def fetch_bid_opening(clause_path):
logger.info("starting开评定标流程...")
fetch_bid_opening_json = extract_from_notice(clause_path, 2)
logger.info("开评定标流程done...")
return {"开评定标流程":fetch_bid_opening_json}
def goods_bid_main(output_folder,file_path, file_type, unique_id):
global logger
logger = get_global_logger(unique_id)
# Preprocess the files and collect the derived paths
processed_data = preprocess_files(output_folder, file_path, file_type, unique_id)
if not processed_data:
yield json.dumps({}) # preprocessing failed: emit empty JSON
return # stop the generator here; the code below would otherwise fail on None
with concurrent.futures.ThreadPoolExecutor() as executor:
# Immediately launch the tasks that do not depend on knowledge_name / index
futures = {
'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['evaluation_method_path']),
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],output_folder),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
}
# Handle these independent tasks first, yielding results in completion order
for future in concurrent.futures.as_completed(futures.values()):
key = next(k for k, v in futures.items() if v == future)
try:
result = future.result()
# evaluation_standards is split into technical and commercial parts
if key == 'evaluation_standards':
technical_standards = result["technical_standards"]
commercial_standards = result["commercial_standards"]
# Yield the technical and commercial scores separately
yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False)
yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False)
else:
# Results from the other tasks
yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
except Exception as exc:
logger.error(f"Error processing {key}: {exc}")
yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
# Only block on future_knowledge once knowledge_name / index are actually needed
try:
knowledge_name = "招标解析" + unique_id
index = processed_data['knowledge_future'].result() # block until the knowledge-base upload finishes
# Submit the tasks that depend on knowledge_name / index
future_dependencies = {
'base_info': executor.submit(fetch_project_basic_info, knowledge_name, processed_data['truncate0'],
output_folder, processed_data['clause_path']), # NOTE: preprocess_files returns no 'truncate0' key, so this lookup raises KeyError
'qualification_review': executor.submit(fetch_qualification_review,output_folder, processed_data['qualification_path'],processed_data['notice_path'],knowledge_name),
}
# Yield the dependent-task results in completion order
for future in concurrent.futures.as_completed(future_dependencies.values()):
key = next(k for k, v in future_dependencies.items() if v == future)
try:
result = future.result()
yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
except Exception as exc:
logger.error(f"Error processing {key}: {exc}")
yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
except Exception as e:
logger.error(f"Error uploading to knowledge base: {e}")
yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False)
# Delete the knowledge index (NOTE: 'index' is undefined if the upload above failed)
deleteKnowledge(index)
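# A minimal consumption sketch (the function below is illustrative, not part of
# this module): goods_bid_main is a generator of JSON strings, so a caller such
# as a streaming HTTP endpoint can forward each chunk as soon as it is ready.
def _stream_demo(output_folder, file_path, file_type, unique_id):
    for chunk in goods_bid_main(output_folder, file_path, file_type, unique_id):
        print(chunk)  # e.g. {"technical_standards": ...}, then {"commercial_standards": ...}, ...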

View File

@@ -1,13 +1,14 @@
# -*- encoding:utf-8 -*-
import copy
import json
import re
from flask_app.main.基础信息整合 import combine_basic_info
from flask_app.main.通义千问long import qianwen_long, upload_file
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.json_utils import combine_json_results,clean_json_string
from flask_app.main.形式响应评审 import update_json_data,extract_matching_keys
from flask_app.main.json_utils import extract_content_from_json
from flask_app.main.json_utils import extract_content_from_json, clean_json_string
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
from flask_app.货物标.货物标截取pdf import truncate_pdf_main
# The dict may be nested: walk and judge the key names, not the values. Concretely: if a level has more than one sibling key and the names consist only of digits/dots, drop those index keys and reorganize the data into a dict, keeping the formerly sibling items in a list.
# For sibling keys: if there is more than one and the names are uniform, drop the names and keep their values in a list.
@@ -16,6 +17,8 @@ def is_numeric_key(key):
# A letter followed by digits, or digits followed by a letter, or a single letter with a dot; letter-digit-letter combinations are excluded
pattern = r'^[\d.]+$|^\(\d+\)$|^\d+$|^[a-zA-Z]$|^[a-zA-Z]\d+$|^\d+[a-zA-Z]$|^[a-zA-Z]\.$'
return re.match(pattern, key) is not None
# TODO: breaks when the value itself contains digits
# zbtest20 is also problematic
def contains_number_or_index(key, value):
@@ -33,6 +36,7 @@ def contains_number_or_index(key, value):
# Return True if the value is numeric or contains digits (without Chinese characters), or the key contains "序号"
return is_number or contains_index or contains_digit
# Within one dict there may be several key-value pairs whose values are all "" or "/"; drop those values and keep the key names in a string list
# If the key is "序号" or the value is all digits, drop the index entry
def preprocess_dict(data):
@@ -56,7 +60,20 @@ def preprocess_dict(data):
else:
return data
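# A minimal sketch of the rule stated above (only the tail of preprocess_dict is
# visible in this hunk, so the merged output shape is an assumption and the
# '可选项' key below is invented for illustration):
def _preprocess_rule_sketch(d):
    blank_keys = [k for k, v in d.items() if v in ("", "/")]   # values "" or "/" are dropped
    kept = {k: v for k, v in d.items() if v not in ("", "/")}
    # e.g. {'营业执照': '', '资质等级': '/', '备注': '有效'}
    #   -> {'备注': '有效', '可选项': ['营业执照', '资质等级']}
    return {**kept, "可选项": blank_keys} if blank_keys else kept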
def process_dict(data):
"""
递归处理字典将符合条件的键值对进行转换
如果键是数字或特定格式的字符串则将其值放入 'items' 列表中并排序
对于非数字键如果对应的值是列表且列表中只有一个元素则将其展平为单个元素
Args:
data (dict): 输入的字典数据
Returns:
dict list 原始数据类型: 处理后的数据结构
"""
if not isinstance(data, dict):
return data
@@ -64,15 +81,20 @@ def process_dict(data):
numeric_keys = []
non_numeric_keys = {}
# Partition keys into numeric and non-numeric
for key, value in data.items():
if is_numeric_key(key):
numeric_keys.append((key, value))
else:
non_numeric_keys[key] = value
# Numeric keys: recursively process their values into an 'items' list
if numeric_keys:
result['items'] = [process_dict(item[1]) for item in sorted(numeric_keys)]
# Sort by key to keep a stable order
numeric_keys_sorted = sorted(numeric_keys, key=lambda x: x[0])
result['items'] = [process_dict(item[1]) for item in numeric_keys_sorted]
# Non-numeric keys
for key, value in non_numeric_keys.items():
if isinstance(value, list):
processed_list = []
@@ -92,25 +114,35 @@ def process_dict(data):
processed_list.append(processed_item)
# New logic: if processed_list has exactly one element, flatten it to that element
if len(processed_list) == 1:
result[key] = processed_list[0]
else:
result[key] = processed_list
else:
# Non-list values are processed recursively
result[key] = process_dict(value)
# If 'items' is the only key in the result, return the list itself
if len(result) == 1 and 'items' in result:
return result['items']
# If every value is an empty list, return the key names as a list
if all(isinstance(v, list) and not v for v in result.values()):
return list(result.keys())
return result
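# Worked examples derived from the branches above:
def _process_dict_demo():
    nested = {"资格性审查": {"1": {"标准": "a"}, "2": {"标准": "b"}}}
    # the numeric keys "1"/"2" collapse into a sorted 'items' list under the parent key
    assert process_dict(nested) == {"资格性审查": [{"标准": "a"}, {"标准": "b"}]}
    # all-empty-list values collapse to a list of the key names
    assert process_dict({"营业执照": [], "授权书": []}) == ["营业执照", "授权书"]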
# Find cited chapter/clause numbers
def find_chapter_clause_references(data, parent_key=""):
exclude_list = ["格式要求"]
result = []
# Regex for "第x章" (chapter) or "第x款" (clause)
chapter_clause_pattern = re.compile(r'第[一二三四五六七八九十\d]+[章款]')
# Non-dict input: return the empty list
if not isinstance(data, dict):
return result
# Walk the key-value pairs
for key, value in data.items():
# Build the full dotted key name
@@ -124,6 +156,13 @@ def find_chapter_clause_references(data, parent_key=""):
if isinstance(value, dict):
# Recurse into nested dicts
result.extend(find_chapter_clause_references(value, full_key))
elif isinstance(value, list):
# Walk the elements of a list value
for index, item in enumerate(value):
if isinstance(item, dict):
# Extend the key path with the list index
new_parent_key = f"{full_key}[{index}]"
result.extend(find_chapter_clause_references(item, new_parent_key))
elif isinstance(value, str):
# For string values, test the "第x章"/"第x款" pattern
if chapter_clause_pattern.search(value):
@@ -131,6 +170,7 @@ def find_chapter_clause_references(data, parent_key=""):
return result
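# A usage sketch; the expected output shape follows the example quoted in
# process_additional_queries further below (the input dict here is invented):
def _find_references_demo():
    data = {"资格性审查": {"资格要求": "符合本采购文件第一章第二款要求,并提供有效证明材料。",
                           "其他": "无"}}
    refs = find_chapter_clause_references(data)
    # -> [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供有效证明材料。'}]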
def preprocess_value(value):
# Use a regex to look for "第X章" / "第X款"
chapter_match = re.search(r'第(.+?)章', value)
@@ -185,6 +225,23 @@ def generate_questions(input_list):
questions.append(question)
return questions
"""
eg:
response_list = [
{
"person.name": "Bob",
"person.address.city": "Los Angeles"
},
{
"company.location": "Austin",
"person.age": 35
}
]
"""
# Merge new data into the original dict
def update_json_data(original_data, response_list):
def recursive_update(data, key, value):
# Split the dot-separated key and recursively locate/update the nested dict
@@ -195,42 +252,158 @@ def update_json_data(original_data,response_list):
data[keys[-1]] = {**data.get(keys[-1], {}), **value}
else:
data[keys[-1]] = value
for response_dict in response_list:
for key, value in response_dict.items():
recursive_update(original_data, key, value)
return original_data
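# Applying update_json_data to the example response_list shown above (this assumes
# the part of recursive_update elided by the hunk walks the dotted path down to
# the parent dict before the assignment lines shown here):
def _update_json_demo():
    original = {"person": {"name": "Alice", "age": 30, "address": {"city": "NYC"}},
                "company": {"location": "SF"}}
    response_list = [{"person.name": "Bob", "person.address.city": "Los Angeles"},
                     {"company.location": "Austin", "person.age": 35}]
    assert update_json_data(original, response_list) == {
        "person": {"name": "Bob", "age": 35, "address": {"city": "Los Angeles"}},
        "company": {"location": "Austin"},
    }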
def qualification_review(truncate_file,knowledge_name):
file_id=upload_file(truncate_file)
user_query=["该招标文件中规定的资格性审查标准是怎样的请以json格式给出外层为'资格性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。","该招标文件中规定的符合性审查标准是怎样的请以json格式给出外层为'符合性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"]
def process_match_keys(match_keys, clause_path_file):
"""
处理 match_keys根据其中的数字或中文数字提取相应的条款内容并附加到原始值后面
参数
- match_keys (list): 包含键值对的列表
- clause_path_file (str): clause_path的JSON文件路径
返回
- list: 更新后的match_keys列表
"""
# Map Arabic digits to Chinese numerals, up to '十'
digit_map = {'1': '一', '2': '二', '3': '三', '4': '四', '5': '五', '6': '六', '7': '七', '8': '八', '9': '九', '10': '十'}
# List of Chinese numerals
chinese_numerals = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']
# Regex that captures a Chinese numeral plus the character after it,
# provided the numeral is not immediately followed by '章' or '部分'
pattern = re.compile(r'([一二三四五六七八九十]+)(?!章|部分)(.)')
# Load the clause JSON
try:
with open(clause_path_file, 'r', encoding='utf-8') as file:
clause_path = json.load(file)
except FileNotFoundError:
print(f"文件未找到: {clause_path_file}")
return match_keys
except json.JSONDecodeError:
print(f"文件内容不是有效的JSON格式: {clause_path_file}")
return match_keys
for item in match_keys:
for key, value in item.items():
# Convert digits 1-10 in the value to the matching Chinese numerals ('10' is replaced first so '1' cannot clobber it)
for digit, chinese in sorted(digit_map.items(), key=lambda kv: -len(kv[0])):
value = re.sub(r'{}'.format(digit), chinese, value)
# Find every Chinese-numeral match in the value
matches = pattern.findall(value)
# Clause texts to append, deduplicated
clauses_to_append = []
for match in matches:
numeral = match[0]
# Only handle numerals from the defined list
if numeral in chinese_numerals:
# Look for clause keys containing that Chinese numeral
for clause_key in clause_path.keys():
if numeral in clause_key:
clause_value = clause_path[clause_key]
if clause_value not in clauses_to_append:
clauses_to_append.append(clause_value)
if clauses_to_append:
# Join the found clause texts with newlines
appended_text = '\n'.join(clauses_to_append)
# Append the clause text to the current value after a newline
item[key] = value + '\n' + appended_text
return match_keys
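# A worked sketch (the clause JSON written here is invented): a value citing
# "第2款" first has its digit converted to "第二款"; "二" is then followed by
# "款" (not '章'/'部分'), so the clause whose key contains "二" is appended.
def _process_match_keys_demo(tmp_path="clause_demo.json"):
    clauses = {"二、资格要求": "供应商须具备有效营业执照。"}
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(clauses, f, ensure_ascii=False)
    keys = [{"资格性审查.资格要求": "符合第2款要求"}]
    updated = process_match_keys(keys, tmp_path)
    # -> [{'资格性审查.资格要求': '符合第二款要求\n供应商须具备有效营业执照。'}]
    return updated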
# Handle values like '符合本采购文件第一章第二款要求': jump to the referenced section and pull its content
def process_additional_queries(combined_res, match_keys,output_folder, notice_path,knowledge_name):
"""
处理额外的查询并更新结果
Args:
combined_res: 初始的组合结果
match_keys: 匹配的章节或条款引用 [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料。'}]
knowledge_name: 知识库的名称
Returns:
dict: 更新后的最终结果
"""
clause2_path=convert_clause_to_json(notice_path,output_folder,2)
# Enrich match_keys by matching and appending the referenced clause text
original_keys = copy.deepcopy(match_keys)
updated_match_keys = process_match_keys(match_keys, clause2_path)
# If the regex lookup already resolved the references, return immediately
if updated_match_keys != original_keys:
return update_json_data(combined_res, updated_match_keys)
# Otherwise generate follow-up questions
ques = generate_questions(original_keys)
results = multi_threading(ques, knowledge_name)
for _, response in results:
if response and len(response) > 1:
try:
temp = extract_content_from_json(response[1])
updated_match_keys.append(temp)
except Exception as e:
print(f"形式响应评审Error processing response: {e}")
else:
print(f"形式响应评审Warning: Missing or incomplete response data.")
return update_json_data(combined_res, updated_match_keys)
def combine_qualification_review(output_folder,qualification_path, notice_path,knowledge_name):
"""
组合资格性审查和符合性审查的评审结果
Args:
truncate_file: 要上传和处理的文件
knowledge_name: 知识库的名称用于后续查询
Returns:
dict: 最终组合的评审结果
"""
# Upload the file and get its ID
file_id = upload_file(qualification_path)
# User queries
user_query = [
"该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性审查的内容。",
"该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
]
# Run the queries in parallel
results = multi_threading(user_query, "", file_id, 2)
combined_res = {}
for question, response in results:
cleaned_data = clean_json_string(response)
if response:
cleaned_data = clean_json_string(response) # clean the LLM response
processed1 = preprocess_dict(cleaned_data)
processed2 = process_dict(processed1)
combined_res.update(processed2)
match_keys=find_chapter_clause_references(combined_res)
ques=generate_questions(match_keys)
results = multi_threading(ques, knowledge_name) # references without clause numbers are asked to the LLM directly
first_response_list = []
for _, response in results:
try:
if response and len(response) > 1: # check the response exists and has at least two elements
temp = extract_content_from_json(response[1])
first_response_list.append(temp)
else:
print(f"形式响应评审Warning: Missing or incomplete response data for query index {_}.")
except Exception as e:
print(f"形式响应评审Error processing response for query index {_}: {e}")
# print(first_response_list)
final_result=update_json_data(combined_res,first_response_list)
return final_result
print(f"Warning: No response for question '{question}'.")
# Find chapter/clause references
match_keys = find_chapter_clause_references(combined_res) # e.g. [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料。'}]
# No references found: return combined_res as-is
if not match_keys:
return combined_res
# Delegate the follow-up handling to the new function
return process_additional_queries(combined_res, match_keys, output_folder,notice_path,knowledge_name)
# Core code for integrating the basic info
# [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}, {'资格性审查.没有重大违法记录的书面声明': '是否提交参加政府采购活动前三年内在经营活动中没有重大违法记录的书面承诺或声明(格式要求详见本项目采购文件第六章相关格式要求)'}]
# TODO: serious issue: for {'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'} the Bailian RAG call tends to return the same text verbatim instead of jumping to the referenced section. Ideas: 1. structure the content of chapter one 2. improve the prompt 3. build the question from the key only, without the value
if __name__ == "__main__":
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
# qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub"
qualification_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output3\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件_qualification2.pdf"
notice_path="C:\\Users\\Administrator\\Desktop\\货物标\\output5\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件_notice.pdf"
knowledge_name = "6.2视频会议docx"
res=qualification_review(truncate_file,knowledge_name)
res = combine_qualification_review(output_folder,qualification_path, notice_path,knowledge_name)
print(json.dumps(res, ensure_ascii=False, indent=4))