9.11 Switched to a global (per-request) logger; fixed the invalid-bid (无效标) handling bug
parent ad8fd46cf7
commit a256ceceb4
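The substance of this commit: logging moves from a create_logger(unique_id) helper that returned (logger, output_folder) and had to be threaded through every call, to a per-request logger created in a Flask before_request hook and stored on flask.g, so any module in the request can log through g.logger without extra arguments. A minimal sketch of that pattern, condensed from the hunks below (the real create_logger in the diff also attaches a CSTFormatter-based file handler plus a stream handler; app and the output path are the names the diff itself uses):

import logging
import os
import uuid

from flask import Flask, g

app = Flask(__name__)


@app.before_request
def before_request():
    # 每个请求开始前初始化 logger: one id and one logger per request, kept on flask.g
    create_logger()


def create_logger():
    unique_id = str(uuid.uuid4())
    g.unique_id = unique_id
    output_folder = f"flask_app/static/output/{unique_id}"
    os.makedirs(output_folder, exist_ok=True)

    logger = logging.getLogger(unique_id)
    if not logger.handlers:  # avoid stacking handlers if the same logger name recurs
        handler = logging.FileHandler(os.path.join(output_folder, "log.txt"))
        handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    g.logger = logger  # downstream modules do `from flask import g` and call g.logger.info(...) / g.logger.error(...)

The rest of the diff is the mechanical consequence of that switch: print(...) calls across the processing modules become g.logger.info / g.logger.error, and functions stop passing logger and output_folder around explicitly.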
@@ -2,6 +2,8 @@ from docx import Document
 import re
 import os
 
+from flask import g
+
 
 def copy_docx(source_path):
     doc = Document(source_path) # 打开源文档
@@ -10,7 +12,7 @@ def copy_docx(source_path):
     # 获取原文件名并添加后缀
     original_file_name = os.path.basename(source_path)
     file_name_without_ext, file_ext = os.path.splitext(original_file_name)
-    modified_file_name = file_name_without_ext + "_invalid" + file_ext
+    modified_file_name = file_name_without_ext + "_invalid111" + file_ext
     destination_path = os.path.join(output_folder, modified_file_name)
 
     new_doc = Document() # 创建新文档
@@ -22,6 +24,7 @@ def copy_docx(source_path):
     # 寻找最后一个begin_pattern的位置
     last_begin_index = -1
     for i, paragraph in enumerate(doc.paragraphs):
+
         if begin_pattern.search(paragraph.text):
             last_begin_index = i
 
@@ -42,9 +45,10 @@ def copy_docx(source_path):
             break
 
     new_doc.save(destination_path) # 保存新文档
+    g.logger.info("docx截取docx成功!")
 
 
 # 调用函数
 if __name__ == '__main__':
-    source_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest13.docx"
+    source_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
     copy_docx(source_path)
@@ -1,6 +1,8 @@
 import requests
 import mimetypes
 
+from flask import g
+
 
 def download_file(url, local_filename):
     try:
@@ -27,13 +29,13 @@ def download_file(url, local_filename):
         else:
             return full_filename,3
     except requests.HTTPError as e:
-        print(f"HTTP Error: {e}")
+        g.logger.error(f"download: HTTP Error: {e}")
         return None
     except requests.RequestException as e:
-        print(f"Error downloading the file: {e}")
+        g.logger.error(f"download: Error downloading the file: {e}")
         return None
     except Exception as e:
-        print(f"An error occurred: {e}")
+        g.logger.error(f"download: An error occurred: {e}")
         return None
 
 if __name__ == '__main__':
@@ -1,6 +1,8 @@
 import json
 import os
 import requests
+from flask import g
+
 
 from flask_app.main.download import download_file
 
@@ -19,14 +21,14 @@ def upload_file(file_path, url):
 
     # 检查响应状态码
     if response.status_code == 200:
-        print("文件上传成功")
+        g.logger.info("format_change 文件上传成功")
         receive_file_response = response.content.decode('utf-8')
         receive_file_json = json.loads(receive_file_response)
         receive_file_url = receive_file_json["data"]
 
     else:
-        print(f"文件上传失败,状态码: {response.status_code}")
-        print(response.text)
+        g.logger.info(f"format_change 文件上传失败,状态码: {response.status_code}")
+        g.logger.info(f"format_change {response.text}")
 
     return receive_file_url
 
@@ -44,7 +46,7 @@ def pdf2docx(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in) #输入输出在同一个文件夹
     local_path_out=os.path.join(folder,filename) #输出文件名
     downloaded_filepath,file_type=download_file(receive_download_url, local_path_out)
-    print("have downloaded file to:",downloaded_filepath)
+    g.logger.info("format_change p2d:have downloaded file to:",downloaded_filepath)
     return downloaded_filepath
 
 def docx2pdf(local_path_in):
@@ -53,7 +55,7 @@ def docx2pdf(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
     local_path_out = os.path.join(folder, filename)  # 输出文件名
     downloaded_filepath,file_type = download_file(receive_download_url, local_path_out)
-    print("have downloaded file to:", downloaded_filepath)
+    g.logger.info("format_change d2p:have downloaded file to:", downloaded_filepath)
     return downloaded_filepath
 
 if __name__ == '__main__':
@@ -1,6 +1,9 @@
 import json
 import re
 
+from flask import g
+
+
 def extract_content_from_json(json_data):
     """提取 { 和 } 之间的内容,并将其解析为字典"""
     if not json_data.strip():
@@ -11,10 +14,10 @@ def extract_content_from_json(json_data):
             json_data = match.group(0)
             return json.loads(json_data) #返回字典
         except json.JSONDecodeError as e:
-            print(f"JSON decode error: {e}")
+            g.logger.info(f"json_utils: extract_content_from_json: JSON decode error: {e}")
             return {}
     else:
-        print("No valid JSON content found.")
+        g.logger.info("json_utils: extract_content_from_json: No valid JSON content found.")
         return {}
 
 def clean_json_string(json_string):
@@ -63,18 +66,18 @@ def add_keys_to_json(target_dict, source_dict):
         dict: 更新后的字典。
     """
     if not target_dict:
-        print("Error: Target dictionary is empty.")
+        g.logger.error("json_utils: Error: Target dictionary is empty.")
         return {}
 
     if len(target_dict) != 1:
-        print("Error: Target dictionary must contain exactly one top-level key.")
+        g.logger.error("json_utils: Error: Target dictionary must contain exactly one top-level key.")
         return target_dict
 
     # 获取唯一的外层键
     target_key, existing_dict = next(iter(target_dict.items()))
 
     if not isinstance(existing_dict, dict):
-        print(f"Error: The value under the key '{target_key}' is not a dictionary.")
+        g.logger.error(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
         return target_dict
 
     # 合并字典
@@ -92,7 +95,8 @@ def rename_outer_key(original_data,new_key):
 
     # 提取原始数据中的唯一外层值(假设只有一个外层键)
     if not original_data or not isinstance(original_data, dict):
-        return {}  # 如果输入无效或不是字典,则返回空字典
+        g.logger.error("json_utils: Error: Invalid input or input is not a dictionary.") # 如果输入无效或不是字典,则返回空字典
+        return {}
 
     # 使用 next(iter(...)) 提取第一个键的值
     original_value = next(iter(original_data.values()), {})
@@ -5,7 +5,7 @@ import time
 import uuid
 from datetime import datetime, timedelta
 
-from flask import Flask, request, jsonify, Response, stream_with_context
+from flask import Flask, request, jsonify, Response, stream_with_context, g
 import json
 import os
 from flask_app.main.download import download_file
@@ -31,30 +31,30 @@ class CSTFormatter(logging.Formatter):
         return s
 
 
-def create_logger(unique_id):
-    """为每个请求创建一个新的日志器,日志器的日志文件存放在指定的输出文件夹中"""
+@app.before_request
+def before_request():
+    # 每个请求开始前初始化 logger
+    create_logger()  # 确保这个函数中设置了 g.logger
+
+
+def create_logger():
+    unique_id = str(uuid.uuid4())
+    g.unique_id = unique_id
     output_folder = f"flask_app/static/output/{unique_id}"
-    # output_folder =f"C:/Users/Administrator/Desktop/招标文件/test/{unique_id}"
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder, exist_ok=True)
+    os.makedirs(output_folder, exist_ok=True)
     log_filename = "log.txt"
     log_path = os.path.join(output_folder, log_filename)
-    logger = logging.getLogger(unique_id)  # 使用 unique_id 作为日志器名字
-    if not logger.handlers:  # 避免重复添加处理器
-        # 文件处理器
+    logger = logging.getLogger(unique_id)
+    if not logger.handlers:
         file_handler = logging.FileHandler(log_path)
         file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
         file_handler.setFormatter(file_formatter)
         logger.addHandler(file_handler)
-        # 流处理器(控制台输出)
-        stream_handler = logging.StreamHandler(sys.stdout)
-        stream_formatter = logging.Formatter('%(message)s')  # 简化的格式,只输出消息
-        stream_handler.setFormatter(stream_formatter)
+        stream_handler = logging.StreamHandler()
+        stream_handler.setFormatter(logging.Formatter('%(message)s'))
         logger.addHandler(stream_handler)
 
     logger.setLevel(logging.INFO)
-    return logger, output_folder
+    g.logger = logger
 
 
 @app.route('/upload', methods=['POST'])
@@ -64,10 +64,10 @@ def zbparse():
         return file_url
     try:
         app.logger.info("starting parsing url:" + file_url)
-        final_json_path, output_folder, logger = download_and_process_file(file_url)
+        final_json_path, output_folder = download_and_process_file(file_url)
         if not final_json_path:
             return jsonify({'error': 'File processing failed'}), 500
-        response = generate_response(final_json_path, logger)  # 先获取响应内容
+        response = generate_response(final_json_path)  # 先获取响应内容
         # remove_directory(output_folder)  # 然后删除文件夹
         return response  # 最后返回获取的响应
     except Exception as e:
@@ -75,6 +75,7 @@ def zbparse():
         return jsonify({'error': str(e)}), 500
 
 
+# 流式
 # def zbparse():
 #     file_url = validate_request()
 #     if isinstance(file_url, tuple):  # Check if the returned value is an error response
@@ -87,40 +88,42 @@ def zbparse():
 #         return jsonify({'error': str(e)}), 500
 
 
-def process_and_stream(file_url):
-    unique_id = str(uuid.uuid4())
-    logger, output_folder = create_logger(unique_id)
-    filename = "ztbfile"
-    downloaded_filename = os.path.join(output_folder, filename)
-
-    downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
-
-    if downloaded_filepath is None or file_type == 3:
-        logger.error("Unsupported file type or failed to download file")
-        error_response = {
-            'message': 'File processing failed',
-            'filename': None,
-            'data': json.dumps({'error': 'File processing failed'})
-        }
-        yield f"data: {json.dumps(error_response)}\n\n"
-        return
-
-    logger.info("Local file path: " + downloaded_filepath)
-
-    for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id):
-        response = {
-            'message': 'Processing',
-            'filename': os.path.basename(downloaded_filepath),
-            'data': data
-        }
-        yield f"data: {json.dumps(response)}\n\n"
-
-    final_response = {
-        'message': 'File uploaded and processed successfully',
-        'filename': os.path.basename(downloaded_filepath),
-        'data': 'END'
-    }
-    yield f"data: {json.dumps(final_response)}\n\n"
+# 分段返回
+# def process_and_stream(file_url):
+#     logger = g.logger
+#     unique_id = g.unique_id
+#     output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
+#     filename = "ztbfile"
+#     downloaded_filename = os.path.join(output_folder, filename)
+#
+#     downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
+#
+#     if downloaded_filepath is None or file_type == 3:
+#         logger.error("Unsupported file type or failed to download file")
+#         error_response = {
+#             'message': 'File processing failed',
+#             'filename': None,
+#             'data': json.dumps({'error': 'File processing failed'})
+#         }
+#         yield f"data: {json.dumps(error_response)}\n\n"
+#         return
+#
+#     logger.info("Local file path: " + downloaded_filepath)
+#
+#     for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id):
+#         response = {
+#             'message': 'Processing',
+#             'filename': os.path.basename(downloaded_filepath),
+#             'data': data
+#         }
+#         yield f"data: {json.dumps(response)}\n\n"
+#
+#     final_response = {
+#         'message': 'File uploaded and processed successfully',
+#         'filename': os.path.basename(downloaded_filepath),
+#         'data': 'END'
+#     }
+#     yield f"data: {json.dumps(final_response)}\n\n"
 
 
 def validate_request():
@@ -133,8 +136,9 @@ def validate_request():
 
 
 def download_and_process_file(file_url):
-    unique_id = str(uuid.uuid4())  # 生成一个唯一的 UUID
-    logger, output_folder = create_logger(unique_id)
+    logger = g.logger
+    unique_id = g.unique_id
+    output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
     filename = "ztbfile"
     downloaded_filename = os.path.join(output_folder, filename)
 
@@ -147,7 +151,7 @@ def download_and_process_file(file_url):
 
     logger.info("Local file path: " + downloaded_filepath)
     processed_file_path = main_processing(output_folder, downloaded_filepath, file_type, unique_id)
-    return processed_file_path, output_folder, logger
+    return processed_file_path, output_folder
 
 
 @app.route('/api/test_zbparse', methods=['POST'])
@@ -189,10 +193,15 @@ def test_process_and_stream():
     yield f"data: {json.dumps(final_response)}\n\n"
 
 
-def generate_response(final_json_path, logger):
+def generate_response(final_json_path):
+    logger = g.logger
+    # 检查final_json_path是否为空或None
+    if not final_json_path:
+        logger.error('Empty or None path provided for final_json.')
+        return jsonify({'error': 'No path provided for final_json.'}), 400
     if not os.path.exists(final_json_path):
-        logger.error('JSON file not found at path: ' + final_json_path)
-        return jsonify({'error': 'JSON file not found'}), 404
+        logger.error('final_json not found at path: ' + final_json_path)
+        return jsonify({'error': 'final_json not found'}), 404
     with open(final_json_path, 'r', encoding='utf-8') as f:
         logger.info('final_json_path:' + final_json_path)
         zbparse_data = json.load(f)
@@ -3,6 +3,8 @@ import os
 from docx import Document
 import json
 
+from flask import g
+
 
 def read_tables_from_docx(file_path):
     """读取DOCX文件中的表格数据,并以嵌套字典的形式返回."""
@@ -87,13 +89,13 @@ def save_data_to_json(data, output_folder):
     """将数据保存到JSON文件中."""
     with open(output_filepath, 'w', encoding='utf-8') as file:
         json.dump(data, file, ensure_ascii=False, indent=4)
-    print(f"The data has been processed and saved to '{output_filepath}'.")
+    g.logger.info(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
     return output_filepath
 
 
 def extract_tables_main(path, output_folder):
     if not os.path.exists(path):
-        print(f"The specified file does not exist: {path}")
+        g.logger.error(f"table_content_extraction: The specified file does not exist: {path}")
         return ""
     # 读取文档表格数据
     table_data = read_tables_from_docx(path)
@@ -1,28 +1,19 @@
 import re
 
-from PyPDF2 import PdfReader
-
-def extract_common_header(pdf_path):
-    pdf_document = PdfReader(pdf_path)
-    headers = []
-    num_pages_to_read = 3  # 预读页数
-
-    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
-        page = pdf_document.pages[i]
-        text = page.extract_text()
-        if text:  # 确保页面有文本内容
-            first_line = text.strip().split('\n')[0]
-            headers.append(first_line)
-
-    if len(headers) < 2:
-        return ""  # 如果没有足够的页来比较,返回空字符串
-
-    # 使用set交集来找出公共部分
-    common_header = set(headers[0].split()).intersection(*[set(header.split()) for header in headers[1:]])
-    common_header = ' '.join(common_header)
-    return common_header
-
-input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
-res=extract_common_header(input_path)
-print(res)
+pattern = re.compile(r'(\b\d+\s*\.\s*\d+\s*\.\s*\d+\b)|(\b3\s*\.\s*2\b)')
+text = '3.1.3已标价工程量清单中漏报了某个工程子目的单价、合价或总额价则漏报的工程 子目单价、合价和总额价视为已含入其他工程子目的单价、合价和总额价之中。'
+
+match = pattern.search(text)
+if match:
+    print("匹配成功:", match.group())
+else:
+    print("未找到匹配")
+
+# 使用 findall 查看所有匹配
+all_matches = pattern.findall(text)
+print("所有匹配:", all_matches)
+
+# 打印文本的前10个字符的ASCII值,检查是否有不可见字符
+print("文本前10个字符的ASCII值:")
+for char in text[:10]:
+    print(f"{char}: {ord(char)}")
@@ -122,14 +122,11 @@ def judge_whether_main(file_path,output_folder): #传入招标文件中‘投
     output_json_path = os.path.join(output_folder,'judge_exist.json')
     read_pdf_and_judge_main(file_path, output_json_path) #提取打勾符号
     qianwen_answer = qianwen_ask(output_json_path, user_query1)  # 调用普通千问判断是、否、未知
-    print("qianwen_answer:" + qianwen_answer)
     user_query2 = construct_judge_questions(qianwen_answer)  # 提取回答为”未知“的键
     # 判断user_query是否为空
     if user_query2:
-        print("user_query:" + user_query2)
         file_id = upload_file(file_path)
         res = qianwen_long(file_id, user_query2) #整个前附表一起传问千问long
-        print(res)
         return process_judge_content(qianwen_answer, res)
 
     else:
@@ -143,7 +140,6 @@ def process_judge_content(original_json, update_json): #用新的数据合并
     original = extract_content_from_json(original_json)
     updates = extract_content_from_json(update_json)
     original.update(updates)
-    print(original)
     return merge_json_to_list(original)
 
 
@@ -55,7 +55,6 @@ def combine_technical_and_business(data, target_values1, target_values2):
 
 def combine_evaluation_standards(truncate2):
     # 商务标、技术标评分项:千问
-    print("starting商务标技术标...")
     file_id = upload_file(truncate2)
     user_query_2 = (
         "根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
@@ -64,7 +63,6 @@ def combine_evaluation_standards(truncate2):
     target_values2=['投标报价','商务标','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
     update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2)
     evaluation_combined_res = json.dumps(update_json,ensure_ascii=False,indent=4)
-    print("商务标技术标done")
     return evaluation_combined_res
 
 if __name__ == "__main__":
@@ -1,5 +1,7 @@
 import json
 
+from flask import g
+
 from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results
 from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
 from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
@@ -31,8 +33,8 @@ def combine_basic_info(baseinfo_list):
     dynamic_key_handling(key_groups, relevant_keys_detected)
 
     # 打印 key_groups 的内容检查它们是否被正确更新
-    print("Updated key_groups after dynamic handling:")
-    print(key_groups)
+    # g.logger.info("Updated key_groups after dynamic handling:")
 
     # 使用合并后的字典创建最终输出
     for group_name, keys in key_groups.items():
@@ -79,7 +81,6 @@ def judge_consortium_bidding(baseinfo_list):
     return accept_bidding
 def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表
     # 调用大模型回答项目基础信息
-    print("starting基础信息...")
     baseinfo_list = []
     # baseinfo_file_path='../static/提示词/前两章提问总结.txt'
     baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'  # 替换为你的txt文件路径
@@ -90,10 +91,9 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 baseinfo_list.append(response[1])
             else:
-                print(f"Warning: Missing or incomplete response data for query index {_}.")
+                g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            print(f"Error processing response for query index {_}: {e}")
-    print("basic信息done...")
+            g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}")
     # 判断是否分包、是否需要递交投标保证金等
     chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
     baseinfo_list.append(merged)
@@ -109,7 +109,7 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
     file_id=upload_file(truncate0)
     res2 = multi_threading(judge_questions, "",file_id,2) #调用千问-long
     if not res2:
-        print("errror!")
+        g.logger.error("基础信息整合: multi_threading errror!")
     else:
         # 打印结果
         for question, response in res2:
@@ -119,9 +119,9 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
 #     if response and len(response) > 1:  # 检查response存在且有至少两个元素
 #         baseinfo_list.append(response[1])
 #     else:
-#         print(f"Warning: Missing or incomplete response data for query index {_}.")
+#         g.error.info(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
 #     except Exception as e:
-#         print(f"Error processing response for query index {_}: {e}")
+#         g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}")
 
     rebidding_situation = extract_from_notice(clause_path, 3)  #"重新招标, 不再招标和终止招标"需从投标人须知正文提取
 
@@ -131,7 +131,6 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
     update_baseinfo_list=combine_basic_info(baseinfo_list)  #整合基础信息核心代码
 
     baseinfo_combined_res = combine_json_results(update_baseinfo_list)  # 返回值是字典
-    print("基础信息done")
     return nest_json_under_key(baseinfo_combined_res, "基础信息")  #返回值是json字符串
 
 
@@ -6,9 +6,9 @@ import concurrent.futures
 import time
 
 from dashscope import Assistants, Messages, Runs, Threads
+from flask import g
 from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
 from flask_app.main.通义千问long import qianwen_long, upload_file
 
 prompt = """
 # 角色
 你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
@@ -31,7 +31,6 @@ prompt = """
 """
 prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}'
 
-
 def read_questions_from_file(file_path):
     questions = []
     with open(file_path, 'r', encoding='utf-8') as file:
@@ -119,8 +118,10 @@ def pure_assistant():
 
 def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type):
     if llm_type==1:
+        g.logger.info(f"rag_assistant! question:{question}")
         assistant = rag_assistant(knowledge_name)
     elif llm_type==2:
+        g.logger.info(f"qianwen_long! question:{question}")
         qianwen_res = qianwen_long(file_id,question)
         result_queue.put((ans_index,(question,qianwen_res)))
         return
@@ -130,6 +131,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
     result_queue.put((ans_index, (question, ans)))  # 在队列中添加索引 (question, ans)
 
 def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
+    g.logger.info("多线程提问:starting multi_threading...")
     result_queue = queue.Queue()
 
     # 使用 ThreadPoolExecutor 管理线程
@@ -148,7 +150,7 @@ def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
         try:
             future.result()  # 可以用来捕获异常或确认任务完成
         except Exception as exc:
-            print(f"Query {index} generated an exception: {exc}")
+            g.logger.error(f"Query {index} generated an exception: {exc}")
 
     # 从队列中获取所有结果并按索引排序
     results = [None] * len(queries)
|
|||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from flask import g
|
||||||
|
|
||||||
from flask_app.main.多线程提问 import multi_threading
|
from flask_app.main.多线程提问 import multi_threading
|
||||||
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||||
from flask_app.main.json_utils import extract_content_from_json
|
from flask_app.main.json_utils import extract_content_from_json
|
||||||
@ -187,9 +189,9 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
|
|||||||
temp = extract_content_from_json(response[1])
|
temp = extract_content_from_json(response[1])
|
||||||
first_response_list.append(temp)
|
first_response_list.append(temp)
|
||||||
else:
|
else:
|
||||||
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
g.logger.error(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing response for query index {_}: {e}")
|
g.logger.error(f"形式响应评审:Error processing response for query index {_}: {e}")
|
||||||
|
|
||||||
# Assume JSON file paths are defined or configured correctly
|
# Assume JSON file paths are defined or configured correctly
|
||||||
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}]
|
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}]
|
||||||
|
@ -2,6 +2,9 @@ from PyPDF2 import PdfReader, PdfWriter
|
|||||||
import re # 导入正则表达式库
|
import re # 导入正则表达式库
|
||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
|
|
||||||
|
from flask import g
|
||||||
|
|
||||||
|
|
||||||
def extract_common_header(pdf_path):
|
def extract_common_header(pdf_path):
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
headers = []
|
headers = []
|
||||||
@ -59,9 +62,9 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
|
|||||||
with open(output_pdf_path, 'wb') as f:
|
with open(output_pdf_path, 'wb') as f:
|
||||||
output_doc.write(f)
|
output_doc.write(f)
|
||||||
|
|
||||||
print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
|
g.logger.info(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
|
||||||
else:
|
else:
|
||||||
print("提供的页码范围无效。")
|
g.logger.error("提供的页码范围无效。")
|
||||||
return output_pdf_path
|
return output_pdf_path
|
||||||
|
|
||||||
|
|
||||||
@ -105,7 +108,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
|||||||
break # 找到结束页后退出循环
|
break # 找到结束页后退出循环
|
||||||
|
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
g.logger.error(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
||||||
@ -154,7 +157,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
if output_suffix == "qualification" or output_suffix =="invalid":
|
if output_suffix == "qualification" or output_suffix =="invalid":
|
||||||
return extract_pages_twice(pdf_path, output_folder, output_suffix)
|
return extract_pages_twice(pdf_path, output_folder, output_suffix)
|
||||||
else:
|
else:
|
||||||
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
g.logger.error(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
||||||
@ -182,7 +185,7 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
|||||||
if output_pdf_path and os.path.isfile(output_pdf_path):
|
if output_pdf_path and os.path.isfile(output_pdf_path):
|
||||||
return [output_pdf_path] # 以列表形式返回,以保持一致性
|
return [output_pdf_path] # 以列表形式返回,以保持一致性
|
||||||
else:
|
else:
|
||||||
print("提供的路径既不是文件夹也不是PDF文件。")
|
g.logger.error("提供的路径既不是文件夹也不是PDF文件。")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
@ -232,7 +235,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE)
|
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE)
|
||||||
output_suffix = "invalid"
|
output_suffix = "invalid"
|
||||||
else:
|
else:
|
||||||
print("无效的选择")
|
g.logger.error("无效的选择:请选择1-6")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Process the selected input
|
# Process the selected input
|
||||||
|
@ -4,6 +4,9 @@ import fitz
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
from flask import g
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_docx(file_path):
|
def extract_text_from_docx(file_path):
|
||||||
doc = docx.Document(file_path)
|
doc = docx.Document(file_path)
|
||||||
return '\n'.join([para.text for para in doc.paragraphs])
|
return '\n'.join([para.text for para in doc.paragraphs])
|
||||||
@ -126,7 +129,7 @@ def convert_to_json(file_path, start_word, end_phrases):
|
|||||||
|
|
||||||
def convert_clause_to_json(input_path,output_folder,type=1):
|
def convert_clause_to_json(input_path,output_folder,type=1):
|
||||||
if not os.path.exists(input_path):
|
if not os.path.exists(input_path):
|
||||||
print(f"The specified file does not exist: {input_path}")
|
g.logger.error(f"The specified file does not exist: {input_path}")
|
||||||
return ""
|
return ""
|
||||||
if type==1:
|
if type==1:
|
||||||
start_word = "投标人须知正文"
|
start_word = "投标人须知正文"
|
||||||
|
@@ -3,6 +3,9 @@ import json
 import logging
 import os
 import time
 
+from flask import g
+
+
 from flask_app.main.截取pdf import truncate_pdf_multiple
 from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
@@ -17,26 +20,27 @@ from flask_app.main.商务标技术标整合 import combine_evaluation_standards
 from flask_app.main.format_change import pdf2docx, docx2pdf
 from flask_app.main.docx截取docx import copy_docx
 
-global_logger = None
-
-def get_global_logger(unique_id):
-    if unique_id is None:
-        return logging.getLogger()  # 获取默认的日志器
-    logger = logging.getLogger(unique_id)
-    return logger
+# def get_global_logger(unique_id):
+#     if unique_id is None:
+#         return logging.getLogger()  # 获取默认的日志器
+#     logger = logging.getLogger(unique_id)
+#     return logger
 
 
 # 可能有问题:pdf转docx导致打勾符号消失
 def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
+    g.logger.info("starting 文件预处理...")
     # 根据文件类型处理文件路径
-    global docx_path, pdf_path
     if file_type == 1:  # docx
         docx_path = downloaded_file_path
         pdf_path = docx2pdf(docx_path)  # 将docx转换为pdf以供后续处理
     elif file_type == 2:  # pdf
         pdf_path = downloaded_file_path
         docx_path = pdf2docx(pdf_path)  # 将pdf转换为docx以供上传到知识库
+    else:
+        # 如果文件类型不是预期中的1或2,记录错误并返回None
+        g.logger.error("Unsupported file type provided. Preprocessing halted.")
+        return None
 
     # 上传知识库
     knowledge_name = "招标解析" + unique_id
@@ -44,7 +48,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
 
     # 调用截取PDF多次
     truncate_files = truncate_pdf_multiple(pdf_path, output_folder)  # [前附表, 评标办法, 须知正文, 资格审查条件]
-    print(truncate_files)
 
     # 处理各个部分
     truncate0_docpath = pdf2docx(truncate_files[0])  # 投标人须知前附表转docx
@@ -56,6 +59,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
     truncate1 = truncate_files[1] #评标办法前附表
     truncate3 = truncate_files[3] #资格审查表
     clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # 投标人须知正文条款pdf->json
+    g.logger.info("文件预处理done")
 
     return {
         'input_file_path':downloaded_file_path,
@@ -90,61 +94,63 @@ def post_processing(data,includes):
     return result
 # 基本信息
 def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # 投标人须知前附表
-    global_logger.info("starting基础信息...")
+    g.logger.info("starting基础信息...")
     basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path)
-    global_logger.info("基础信息done")
+    g.logger.info("基础信息done")
     return basic_res
 
 
 # 形式、响应、资格评审
 def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder):
-    global_logger.info("starting资格审查...")
+    g.logger.info("starting资格审查...")
     review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath,
                                                     clause_path,input_file,output_folder)
-    global_logger.info("资格审查done")
+    g.logger.info("资格审查done")
     return review_standards_res
 
 
 # 评分细则
 def fetch_evaluation_standards(truncate1):  # 评标办法前附表
-    global_logger.info("starting商务标技术标...")
+    g.logger.info("starting商务标技术标...")
     evaluation_standards_res = combine_evaluation_standards(truncate1)
-    global_logger.info("商务标技术标done")
+    g.logger.info("商务标技术标done")
     return evaluation_standards_res
 
 
 # 无效、废标项解析
 def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
     # 废标项要求:千问
-    global_logger.info("starting无效标与废标...")
+    g.logger.info("starting无效标与废标...")
     find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
-    global_logger.info("无效标与废标done...")
+    g.logger.info("无效标与废标done...")
     return find_invalid_res
 
 
 # 投标文件要求
 def fetch_bidding_documents_requirements(clause_path):
-    global_logger.info("starting投标文件要求...")
+    g.logger.info("starting投标文件要求...")
     fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
     qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求")
-    global_logger.info("投标文件要求done...")
+    g.logger.info("投标文件要求done...")
     return qualify_nested_res
 
 
 # 开评定标流程
 def fetch_bid_opening(clause_path):
-    global_logger.info("starting开评定标流程...")
+    g.logger.info("starting开评定标流程...")
     fetch_bid_opening_json = extract_from_notice(clause_path, 2)
     qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程")
-    global_logger.info("开评定标流程done...")
+    g.logger.info("开评定标流程done...")
     return qualify_nested_res
 
 
 def main_processing(output_folder, downloaded_file_path, file_type, unique_id):  # file_type=1->docx file_type=2->pdf
-    global global_logger
-    global_logger = get_global_logger(unique_id)
+    # global global_logger
+    # global_logger = get_global_logger(unique_id)
     # Preprocess files and get necessary data paths and knowledge index
     processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
+    if not processed_data:
+        return ""
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         # Submit all tasks to the executor
@@ -174,7 +180,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
             result = futures[key].result()
             comprehensive_responses.append(result)
         except Exception as exc:
-            global_logger.info(f"Error processing {key}: {exc}")
+            g.logger.error(f"Error processing {key}: {exc}")
     # 合并 JSON 结果
     combined_final_result = combine_json_results(comprehensive_responses)
     includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
@@ -184,7 +190,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
     final_result_path = os.path.join(output_folder, "final_result.json")
     with open(final_result_path, 'w', encoding='utf-8') as file:
         json.dump(modified_json, file, ensure_ascii=False, indent=2)
-    global_logger.info("final_result.json has been saved")
+    g.logger.info("final_result.json has been saved")
     deleteKnowledge(processed_data['knowledge_index'])
     return final_result_path
 
|
|||||||
import PyPDF2
|
import PyPDF2
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
from flask import g
|
||||||
|
|
||||||
|
|
||||||
def extract_key_value_pairs(text):
|
def extract_key_value_pairs(text):
|
||||||
# 更新正则表达式来包括对"团"的处理和行尾斜线
|
# 更新正则表达式来包括对"团"的处理和行尾斜线
|
||||||
pattern = r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
|
pattern = r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
|
||||||
@ -57,15 +60,13 @@ def read_pdf_and_judge_main(file_path, output_json_path):
|
|||||||
with open(file_path, 'rb') as file:
|
with open(file_path, 'rb') as file:
|
||||||
reader = PyPDF2.PdfReader(file)
|
reader = PyPDF2.PdfReader(file)
|
||||||
num_pages = len(reader.pages)
|
num_pages = len(reader.pages)
|
||||||
print(f"Total pages: {num_pages}")
|
|
||||||
|
|
||||||
all_data = {}
|
all_data = {}
|
||||||
for page_num in range(num_pages):
|
for page_num in range(num_pages):
|
||||||
page = reader.pages[page_num]
|
page = reader.pages[page_num]
|
||||||
text = page.extract_text() if page.extract_text() else ""
|
text = page.extract_text() if page.extract_text() else ""
|
||||||
# 使用正则表达式删除每页开头的页码,格式通常为“数字+空格”
|
# 使用正则表达式删除每页开头的页码,格式通常为“数字+空格”
|
||||||
cleaned_text = re.sub(r'^\d+\s+', '', text)
|
cleaned_text = re.sub(r'^\d+\s+', '', text)
|
||||||
print(cleaned_text)
|
# print(cleaned_text)
|
||||||
|
|
||||||
key_value_pairs = extract_key_value_pairs(cleaned_text)
|
key_value_pairs = extract_key_value_pairs(cleaned_text)
|
||||||
all_data.update(key_value_pairs)
|
all_data.update(key_value_pairs)
|
||||||
@ -74,7 +75,7 @@ def read_pdf_and_judge_main(file_path, output_json_path):
|
|||||||
with open(output_json_path, "w", encoding="utf-8") as json_file:
|
with open(output_json_path, "w", encoding="utf-8") as json_file:
|
||||||
json.dump(all_data, json_file, ensure_ascii=False, indent=4)
|
json.dump(all_data, json_file, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
print(f"Data extraction complete and saved to '{output_json_path}'.")
|
g.logger.info(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -3,75 +3,125 @@ import json
|
|||||||
import os.path
|
import os.path
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from flask import g
|
||||||
|
|
||||||
from flask_app.main.json_utils import combine_json_results, nest_json_under_key
|
from flask_app.main.json_utils import combine_json_results, nest_json_under_key
|
||||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from flask_app.main.禁止投标情形 import find_forbidden
|
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
|
||||||
from 禁止投标情形 import process_string_list
|
|
||||||
|
|
||||||
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
|
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
|
||||||
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
|
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
|
||||||
|
|
||||||
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from docx import Document
|
from docx import Document
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
if isinstance(keywords, str):
|
||||||
|
keywords = [keywords]
|
||||||
|
|
||||||
doc = Document(doc_path)
|
doc = Document(doc_path)
|
||||||
extracted_paragraphs = OrderedDict() # 使用OrderedDict保持顺序
|
extracted_paragraphs = OrderedDict()
|
||||||
continue_collecting = False
|
continue_collecting = False
|
||||||
current_section_pattern = None
|
current_section_pattern = None
|
||||||
active_key = None # 用来标记当前正在收集的文本块的键
|
active_key = None
|
||||||
|
|
||||||
def match_keywords(text, patterns):
|
def match_keywords(text, patterns):
|
||||||
"""使用正则表达式匹配关键词。"""
|
|
||||||
return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
|
return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
|
||||||
|
|
||||||
def extract_from_text(text, index):
|
def extract_from_text(text, current_index):
|
||||||
nonlocal continue_collecting, current_section_pattern, active_key
|
nonlocal continue_collecting, current_section_pattern, active_key
|
||||||
if text == "": # Skip empty lines
|
if text == "":
|
||||||
return
|
return current_index
|
||||||
|
|
||||||
if continue_collecting:
|
if continue_collecting:
|
||||||
if current_section_pattern and re.match(current_section_pattern, text):
|
if current_section_pattern and re.match(current_section_pattern, text):
|
||||||
continue_collecting = False
|
continue_collecting = False
|
||||||
active_key = None # 结束当前的收集
|
active_key = None
|
||||||
else:
|
else:
|
||||||
if active_key is not None:
|
if active_key is not None:
|
||||||
extracted_paragraphs[active_key].append(text)
|
extracted_paragraphs[active_key].append(text)
|
||||||
|
return current_index
|
||||||
|
|
||||||
if match_keywords(text, keywords):
|
if match_keywords(text, keywords):
|
||||||
active_key = text # 设置当前的关键词块
|
active_key = text
|
||||||
extracted_paragraphs[active_key] = [text] # 初始化列表以收集文本,包括当前匹配的文本
|
extracted_paragraphs[active_key] = [text]
|
||||||
# 检查是否也匹配后续关键词
|
|
||||||
if match_keywords(text, follow_up_keywords):
|
if match_keywords(text, follow_up_keywords):
|
||||||
continue_collecting = True
|
continue_collecting = True
|
||||||
# 设置跟踪模式
|
|
||||||
section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
|
section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
|
||||||
if section_number:
|
if section_number:
|
||||||
current_section_number = section_number.group(1)
|
current_section_number = section_number.group(1)
|
||||||
level_count = current_section_number.count('.')
|
level_count = current_section_number.count('.')
|
||||||
pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b'
|
|
||||||
current_section_pattern = re.compile(pattern)
|
# Pattern to match current level, e.g., 3.4.5
|
||||||
|
pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+'
|
||||||
|
|
||||||
|
# Generate patterns for next section at same level and parent level
|
||||||
|
parts = current_section_number.split('.')
|
||||||
|
matched_patterns = [pattern] # start with the full pattern
|
||||||
|
|
||||||
|
# Next section at same level
|
||||||
|
parts[-1] = str(int(parts[-1]) + 1)
|
||||||
|
next_pattern = r'^' + r'\s*\.\s*'.join(parts)
|
||||||
|
matched_patterns.append(next_pattern)
|
||||||
|
|
||||||
|
# Parent section (if applicable)
|
||||||
|
if len(parts) > 1:
|
||||||
|
parent_section_parts = parts[:-1]
|
||||||
|
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
|
||||||
|
parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts)
|
||||||
|
matched_patterns.append(parent_pattern)
|
||||||
|
|
||||||
|
# Combine the patterns
|
||||||
|
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
|
||||||
|
current_section_pattern = re.compile(combined_pattern)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
found_next_number = False
|
found_next_number = False
|
||||||
current_section_pattern = None
|
current_section_pattern = None
|
||||||
|
|
||||||
for next_index in range(index + 1, len(doc.paragraphs)):
|
while current_index < len(doc.paragraphs) - 1:
|
||||||
next_text = doc.paragraphs[next_index].text.strip()
|
current_index += 1
|
||||||
|
next_text = doc.paragraphs[current_index].text.strip()
|
||||||
if not found_next_number:
|
if not found_next_number:
|
||||||
next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
|
next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text)
|
||||||
if next_section_number:
|
if next_section_number:
|
||||||
found_next_number = True
|
found_next_number = True
|
||||||
section_parts = next_section_number.group(1).split('.')
|
if next_section_number.group(1):
|
||||||
dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
|
section_parts = next_section_number.group(1).split('.')
|
||||||
|
dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
|
||||||
|
elif next_section_number.group(2):
|
||||||
|
dynamic_pattern = r'^[\(\(]\d+[\)\)]'
|
||||||
current_section_pattern = re.compile(dynamic_pattern)
|
current_section_pattern = re.compile(dynamic_pattern)
|
||||||
|
|
||||||
if current_section_pattern and re.match(current_section_pattern, next_text):
|
if current_section_pattern and re.match(current_section_pattern, next_text):
|
||||||
extracted_paragraphs[active_key].append(next_text) # 持续收集
|
extracted_paragraphs[active_key].append(next_text)
|
||||||
|
else:
|
||||||
|
continue_collecting = False
|
||||||
|
active_key = None
|
||||||
|
break
|
||||||
|
|
||||||
for index, para in enumerate(doc.paragraphs):
|
return current_index
|
||||||
extract_from_text(para.text.strip(), index)
|
|
||||||
|
|
||||||
return extracted_paragraphs # 返回字典,键是关键词,值是相关文本列表
|
index = 0
|
||||||
|
while index < len(doc.paragraphs):
|
||||||
|
index = extract_from_text(doc.paragraphs[index].text.strip(), index)
|
||||||
|
index += 1
|
||||||
|
|
||||||
|
return extracted_paragraphs
|
||||||
|
|
||||||
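As an illustration of the stop-pattern logic above (a minimal standalone sketch, not code from this commit; build_stop_pattern is a stand-in name): once a clause number such as "3.4.5" matches a follow-up keyword, collection stops at the next clause of the same level or at the next parent-level clause.

import re

def build_stop_pattern(section_number: str) -> re.Pattern:
    # Shape of the current level, e.g. ^\d+\s*\.\s*\d+\s*\.\s*\d+ for "3.4.5"
    level_count = section_number.count('.')
    patterns = [r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+']
    parts = section_number.split('.')
    # Next clause at the same level, e.g. "3.4.6"
    nxt = parts[:]
    nxt[-1] = str(int(nxt[-1]) + 1)
    patterns.append(r'^' + r'\s*\.\s*'.join(nxt))
    # Next clause at the parent level, e.g. "3.5"
    if len(parts) > 1:
        parent = parts[:-1]
        parent[-1] = str(int(parent[-1]) + 1)
        patterns.append(r'^' + r'\s*\.\s*'.join(parent))
    return re.compile(r'(' + r')|('.join(patterns) + r')')

stop = build_stop_pattern("3.4.5")
print(bool(stop.match("3.4.6 其他情形")))    # True  -> stop collecting
print(bool(stop.match("3.5 评标")))          # True  -> stop collecting
print(bool(stop.match("下列情形之一的:")))  # False -> keep collecting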
|
def preprocess_text_list(text_list):
|
||||||
|
new_text_list = []
|
||||||
|
# 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字
|
||||||
|
split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))')
|
||||||
|
for text in text_list:
|
||||||
|
# 使用正则表达式检查并拆分元素
|
||||||
|
parts = split_pattern.split(text)
|
||||||
|
new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查
|
||||||
|
|
||||||
|
return new_text_list
|
||||||
|
|
||||||
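As an illustration of what preprocess_text_list above is for (a minimal sketch using a simplified variant of its split pattern, not code from this commit): a single extracted line that actually contains several numbered items is broken apart before the leading numbers are stripped.

import re

# Simplified variant of split_pattern above: split where Chinese text or punctuation
# is immediately followed by whitespace and a letter or digit (the start of a new item).
split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!?])(?=\s+[a-zA-Z\d])')

line = "投标文件未按要求密封的。 2 投标文件逾期送达的。 3 未提交投标保证金的"
parts = [p.strip() for p in split_pattern.split(line) if p.strip()]
print(parts)
# ['投标文件未按要求密封的。', '2 投标文件逾期送达的。', '3 未提交投标保证金的']
# (zero-width splits like this require Python 3.7 or newer)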
def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化
|
def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化
|
||||||
all_texts1 = []
|
all_texts1 = []
|
||||||
@ -107,11 +157,14 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表
|
all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
print(text_list)
|
||||||
|
new_text_list=preprocess_text_list(text_list)
|
||||||
|
print(new_text_list)
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
||||||
data = re.sub(pattern, '', text_list[0]).strip()
|
data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号
|
||||||
# 将修改后的第一个元素和剩余的元素连接起来
|
# 将修改后的第一个元素和剩余的元素连接起来
|
||||||
text_list[0] = data # 更新列表中的第一个元素
|
new_text_list[0] = data # 更新列表中的第一个元素
|
||||||
joined_text = "\n".join(text_list) # 如果列表中有多个元素,则连接它们
|
joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们
|
||||||
all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中
|
all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中
|
||||||
|
|
||||||
return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果
|
return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果
|
||||||
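As an illustration of the serial-number stripping done above on the first collected line (a minimal sketch reusing the same regex as clean_dict_datas, not code from this commit):

import re

# Same leading-serial-number pattern as used above: "(3)", "3.7.2 ", "A1、", ...
pattern = r'^\s*([(（]\d+[)）]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
for s in ["(3)未按规定提交投标保证金的", "3.7.2 投标文件逾期送达的", "A1、串通投标的"]:
    print(re.sub(pattern, '', s).strip())
# 未按规定提交投标保证金的
# 投标文件逾期送达的
# 串通投标的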
@ -219,6 +272,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
|
|||||||
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
|
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
|
||||||
follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
|
follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
|
||||||
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果
|
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果
|
||||||
|
# print(extracted_contents)
|
||||||
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
|
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
|
||||||
all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
|
all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
|
||||||
qianwen_txt = all_texts1 + all_tables1
|
qianwen_txt = all_texts1 + all_tables1
|
||||||
@ -234,7 +288,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
|
|||||||
# 更新计数器,每次循环递增
|
# 更新计数器,每次循环递增
|
||||||
counter += 1
|
counter += 1
|
||||||
file_id = upload_file(output_file)
|
file_id = upload_file(output_file)
|
||||||
print("starting qianwen-long...")
|
|
||||||
qianwen_ans = qianwen_long(file_id, user_query)
|
qianwen_ans = qianwen_long(file_id, user_query)
|
||||||
selected_contents = []
|
selected_contents = []
|
||||||
num_list = process_string_list(qianwen_ans)
|
num_list = process_string_list(qianwen_ans)
|
||||||
@ -256,7 +309,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
|
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
|
||||||
print("starting无效标与废标...")
|
|
||||||
queries = [
|
queries = [
|
||||||
(r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
|
(r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
|
||||||
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
|
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
|
||||||
@ -280,7 +332,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
|
|||||||
results.append(future.result())
|
results.append(future.result())
|
||||||
|
|
||||||
#禁止投标
|
#禁止投标
|
||||||
print("starting不得存在的情形...")
|
# g.logger.info("starting不得存在的情形...")
|
||||||
forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
|
forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
|
||||||
results.append(forbidden_res)
|
results.append(forbidden_res)
|
||||||
|
|
||||||
@ -288,17 +340,18 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
|
|||||||
for d in results:
|
for d in results:
|
||||||
combined_dict.update(d)
|
combined_dict.update(d)
|
||||||
|
|
||||||
print("无效标与废标done...")
|
# g.logger.info("无效标与废标done...")
|
||||||
return nest_json_under_key(combined_dict, "无效标与废标项")
|
return nest_json_under_key(combined_dict, "无效标与废标项")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
|
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
|
||||||
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
|
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
|
||||||
truncate3="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
|
truncate3="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_qualification.pdf"
|
||||||
output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output"
|
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e"
|
||||||
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest11_invalid.docx'
|
# doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
|
||||||
|
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_invalid.docx'
|
||||||
results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate3)
|
results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate3)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print("Elapsed time:", str(end_time - start_time))
|
print("Elapsed time:", str(end_time - start_time))
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
|
from flask import g
|
||||||
from llama_index.readers.dashscope.base import DashScopeParse
|
from llama_index.readers.dashscope.base import DashScopeParse
|
||||||
from llama_index.readers.dashscope.utils import ResultType
|
from llama_index.readers.dashscope.utils import ResultType
|
||||||
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
|
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
|
||||||
@ -15,6 +16,7 @@ def addfileToKnowledge(filepath,knowledge_name):
|
|||||||
knowledge_name,
|
knowledge_name,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
)
|
)
|
||||||
|
g.logger.info("knowledge created successfully!!!")
|
||||||
# index = DashScopeCloudIndex(knowledge_name)
|
# index = DashScopeCloudIndex(knowledge_name)
|
||||||
# index._insert(documents)
|
# index._insert(documents)
|
||||||
# return index, documents
|
# return index, documents
|
||||||
@ -26,6 +28,7 @@ def deleteKnowledge(index):
|
|||||||
workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID')
|
workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID')
|
||||||
client = create_client()
|
client = create_client()
|
||||||
delete_index(client,workspace_id,index_id)
|
delete_index(client,workspace_id,index_id)
|
||||||
|
g.logger("knowledge deleted successfully!!!")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -4,16 +4,18 @@ import os
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
from PyPDF2 import PdfWriter, PdfReader
|
from PyPDF2 import PdfWriter, PdfReader
|
||||||
|
from flask import g
|
||||||
|
|
||||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||||
|
|
||||||
def extract_and_format_from_paths(json_paths, includes):
|
def extract_and_format_from_paths(json_paths, includes, excludes):
|
||||||
"""
|
"""
|
||||||
从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。
|
从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
json_paths (list): 包含多个 JSON 文件路径的列表。
|
json_paths (list): 包含多个 JSON 文件路径的列表。
|
||||||
includes (list): 包含要检查的关键词的列表。
|
includes (list): 包含要检查的关键词的列表。
|
||||||
|
excludes (list): 包含要排除的关键词的列表。
|
||||||
|
|
||||||
返回:
|
返回:
|
||||||
list: 包含所有文件中满足条件的格式化字符串列表。
|
list: 包含所有文件中满足条件的格式化字符串列表。
|
||||||
@ -33,43 +35,74 @@ def extract_and_format_from_paths(json_paths, includes):
|
|||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
# 如果值是字典,检查嵌套字典的每个键值对
|
# 如果值是字典,检查嵌套字典的每个键值对
|
||||||
for sub_key, sub_value in value.items():
|
for sub_key, sub_value in value.items():
|
||||||
if any(include in sub_value for include in includes):
|
if any(include in sub_key for include in includes):
|
||||||
# 如果子值包含关键词,格式化并添加到结果列表
|
# 如果子值包含关键词,格式化并添加到结果列表
|
||||||
formatted_results.append(f"{sub_key}: {sub_value}")
|
formatted_results.append(f"{sub_value}")
|
||||||
elif isinstance(value, str):
|
elif isinstance(value, str): # clause
|
||||||
# 如果值是字符串,直接检查是否包含关键词
|
# 检查是否包含任何 include 关键词
|
||||||
if any(include in value for include in includes):
|
for include in includes:
|
||||||
# 如果值包含关键词,添加到结果列表
|
if include in value:
|
||||||
formatted_results.append(value)
|
# 找到 include 之前的内容
|
||||||
|
prefix = value.split(include)[0]
|
||||||
|
# 检查 prefix 是否不包含任何 exclude 关键词
|
||||||
|
if not any(exclude in prefix for exclude in excludes):
|
||||||
|
# 如果不包含任何 exclude 关键词,添加整个 value 到结果列表
|
||||||
|
if '\n' in value:
|
||||||
|
value = value.split('\n', 1)[-1]
|
||||||
|
formatted_results.append(value)
|
||||||
|
break # 找到一个符合条件的就跳出循环
|
||||||
|
|
||||||
# 将当前文件的结果添加到总结果列表
|
# 将当前文件的结果添加到总结果列表
|
||||||
all_formatted_results.extend(formatted_results)
|
all_formatted_results.extend(formatted_results)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print(f"Error: The file '{path}' does not exist.")
|
g.logger.error(f"禁止投标情形: Error: The file '{path}' does not exist.")
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
print(f"Error: The file '{path}' contains invalid JSON.")
|
g.logger.error(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
|
||||||
|
|
||||||
return all_formatted_results
|
return all_formatted_results
|
||||||
|
|
||||||
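As an illustration of the include/exclude prefix check introduced above for string-valued clauses (a minimal sketch, not code from this commit; keep_clause is a stand-in name): a clause is kept only when an include keyword occurs and none of the exclude keywords appear in the text before it.

includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
excludes = ["招标", "评标", "定标"]

def keep_clause(value: str) -> bool:
    for include in includes:
        if include in value:
            prefix = value.split(include)[0]   # text before the matched keyword
            if not any(exclude in prefix for exclude in excludes):
                return True
    return False

print(keep_clause("投标人不得存在下列情形:……"))            # True
print(keep_clause("评标委员会认定投标人不得存在下列情形"))  # False: "评标" precedes the keyword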
def extract_unique_items_from_texts(texts):
|
def extract_unique_items_from_texts(texts):
|
||||||
pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\))\s*')
|
# 更新正则表达式以包括更广泛的序号类型,包括中文序号
|
||||||
intro_pattern = re.compile(r'^.*[::]')
|
pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*')
|
||||||
|
intro_pattern = re.compile(r'^.*?[::]')
|
||||||
punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$')
|
punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$')
|
||||||
|
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
|
||||||
|
content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字
|
||||||
|
|
||||||
all_results = []
|
all_results = []
|
||||||
seen = set()
|
seen = set()
|
||||||
|
|
||||||
for text in texts:
|
for text in texts:
|
||||||
|
# 去除文本中的制表符和换行符
|
||||||
|
text = text.replace('\t', '').replace('\n', '')
|
||||||
|
|
||||||
|
# 删除引导性的文本(直到冒号,但保留冒号后的内容)
|
||||||
text = intro_pattern.sub('', text)
|
text = intro_pattern.sub('', text)
|
||||||
|
|
||||||
|
# 替换URL为占位符,并保存URL以便后续还原
|
||||||
|
urls = []
|
||||||
|
def url_replacer(match):
|
||||||
|
urls.append(match.group(0))
|
||||||
|
return f"{{URL{len(urls)}}}"
|
||||||
|
text = url_pattern.sub(url_replacer, text)
|
||||||
|
|
||||||
|
# 使用数字和括号的模式分割文本
|
||||||
items = pattern.split(text)
|
items = pattern.split(text)
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
cleaned_item = item.strip()
|
cleaned_item = item.strip()
|
||||||
if cleaned_item:
|
if cleaned_item:
|
||||||
|
# 进一步清理每个条目
|
||||||
cleaned_item = pattern.sub('', cleaned_item)
|
cleaned_item = pattern.sub('', cleaned_item)
|
||||||
cleaned_item = punctuation_pattern.sub('', cleaned_item)
|
cleaned_item = punctuation_pattern.sub('', cleaned_item)
|
||||||
cleaned_item = cleaned_item.strip()
|
cleaned_item = cleaned_item.strip()
|
||||||
if cleaned_item and cleaned_item not in seen:
|
|
||||||
|
# 还原URL
|
||||||
|
for i, url in enumerate(urls, 1):
|
||||||
|
cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)
|
||||||
|
|
||||||
|
# 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符
|
||||||
|
if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
|
||||||
seen.add(cleaned_item)
|
seen.add(cleaned_item)
|
||||||
all_results.append(cleaned_item)
|
all_results.append(cleaned_item)
|
||||||
|
|
||||||
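As an illustration of the URL place-holder trick added above (a minimal sketch, not code from this commit; the item split here is simplified to "\d+\." whereas the real code uses the fuller pattern): URLs are masked before the cleanup and numbered-item split so the cleanup regexes cannot cut them apart, then restored afterwards.

import re

url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
text = "1.不得相互串通投标 2.详见http://www.example.com/notice 3.不得挂靠资质"

urls = []
def url_replacer(match):
    urls.append(match.group(0))
    return f"{{URL{len(urls)}}}"

masked = url_pattern.sub(url_replacer, text)                       # mask the URLs
items = [i.strip() for i in re.split(r'\d+\.', masked) if i.strip()]  # split on item numbers
items = [re.sub(r'\{URL(\d+)\}', lambda m: urls[int(m.group(1)) - 1], i) for i in items]  # restore
print(items)
# ['不得相互串通投标', '详见http://www.example.com/notice', '不得挂靠资质']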
@ -93,9 +126,9 @@ def merge_pdfs(paths, output_filename):
|
|||||||
if output_path:
|
if output_path:
|
||||||
with open(output_path, 'wb') as out:
|
with open(output_path, 'wb') as out:
|
||||||
pdf_writer.write(out)
|
pdf_writer.write(out)
|
||||||
print(f"Merged PDF saved to {output_path}")
|
g.logger.info(f"禁止投标情形: Merged PDF saved to {output_path}")
|
||||||
else:
|
else:
|
||||||
print("No files to merge.")
|
g.logger.error("禁止投标情形: No files to merge.")
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
def process_string_list(string_list):
|
def process_string_list(string_list):
|
||||||
@ -120,7 +153,7 @@ def process_string_list(string_list):
|
|||||||
actual_list = ast.literal_eval(formatted_list)
|
actual_list = ast.literal_eval(formatted_list)
|
||||||
return actual_list
|
return actual_list
|
||||||
except SyntaxError as e:
|
except SyntaxError as e:
|
||||||
print(f"Error parsing list: {e}")
|
g.logger.error(f"禁止投标情形: Error parsing list: {e}")
|
||||||
return []
|
return []
|
||||||
else:
|
else:
|
||||||
# 如果没有匹配到内容,返回空列表
|
# 如果没有匹配到内容,返回空列表
|
||||||
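Based on the fragments above, process_string_list appears to extract the first bracketed "[...]" span from the model's free-text answer and evaluate it with ast.literal_eval. A minimal sketch of that idea (not the project's actual implementation; parse_bracketed_list is a stand-in name):

import ast
import re

def parse_bracketed_list(answer: str):
    match = re.search(r'\[.*?\]', answer, re.DOTALL)
    if not match:
        return []                       # the answer contains nothing list-like
    try:
        return ast.literal_eval(match.group(0))
    except (SyntaxError, ValueError):
        return []                       # malformed list in the answer

print(parse_bracketed_list("符合的信息序号为[1, 3, 5]。"))  # [1, 3, 5]
print(parse_bracketed_list("不存在相关情形,返回[]"))        # []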
@ -129,15 +162,17 @@ def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须
|
|||||||
# output_filename="merged.pdf"
|
# output_filename="merged.pdf"
|
||||||
# paths=[truncate1,truncate4]
|
# paths=[truncate1,truncate4]
|
||||||
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
|
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
|
||||||
|
if truncate3:
|
||||||
file_id=upload_file(truncate3)
|
file_id=upload_file(truncate3)
|
||||||
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
|
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
|
||||||
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
|
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
|
||||||
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
|
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
|
||||||
|
else:
|
||||||
|
qianwen_forbidden_str="[]"
|
||||||
actual_list=process_string_list(qianwen_forbidden_str) #提取出字符串列表 ["xxx","xx"]
|
actual_list=process_string_list(qianwen_forbidden_str) #提取出字符串列表 ["xxx","xx"]
|
||||||
|
includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
|
||||||
includes = ["不得存在", "禁止投标"]
|
excludes = ["招标", "评标", "定标"]
|
||||||
forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes)
|
forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes,excludes)
|
||||||
processed_results = extract_unique_items_from_texts(forbidden_results)
|
processed_results = extract_unique_items_from_texts(forbidden_results)
|
||||||
# print(processed_results)
|
# print(processed_results)
|
||||||
merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
|
merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
|
||||||
@ -145,11 +180,12 @@ def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须
|
|||||||
|
|
||||||
return forbidden_dict
|
return forbidden_dict
|
||||||
|
|
||||||
|
#TODO:不得存在的情况文中有很多内容
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
|
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
|
||||||
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
|
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
|
||||||
truncate4 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_qualification.pdf"
|
truncate3 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_qualification.pdf"
|
||||||
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
|
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e"
|
||||||
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.docx'
|
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx'
|
||||||
find_forbidden(truncate_json_path,clause_path,truncate4)
|
res = find_forbidden(truncate_json_path, clause_path, truncate3)
|
||||||
|
print(json.dumps(res, ensure_ascii=False, indent=4))
|
@ -14,5 +14,5 @@ def read_docx(file_path):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx"
|
file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
|
||||||
read_docx(file_path)
|
read_docx(file_path)
|
||||||
|
@ -10,13 +10,11 @@ from concurrent.futures import ThreadPoolExecutor
|
|||||||
|
|
||||||
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表
|
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表
|
||||||
# 形式评审、响应评审:千问
|
# 形式评审、响应评审:千问
|
||||||
print("starting形式响应评审...")
|
|
||||||
file_id=upload_file(truncate1) #评标办法前附表
|
file_id=upload_file(truncate1) #评标办法前附表
|
||||||
user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
|
user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
|
||||||
results = qianwen_long(file_id, user_query_1)
|
results = qianwen_long(file_id, user_query_1)
|
||||||
original_dict_data = extract_content_from_json(results)
|
original_dict_data = extract_content_from_json(results)
|
||||||
qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容
|
qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容
|
||||||
print(original_dict_data)
|
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
# 创建Future对象
|
# 创建Future对象
|
||||||
future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
|
future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
|
||||||
@ -26,7 +24,6 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
|
|||||||
# 等待执行结果
|
# 等待执行结果
|
||||||
final_qualify_json = future_qualification.result()
|
final_qualify_json = future_qualification.result()
|
||||||
form_response_dict = future_form_response.result()
|
form_response_dict = future_form_response.result()
|
||||||
print("形式响应评审done")
|
|
||||||
form_response_dict.update(final_qualify_json)
|
form_response_dict.update(final_qualify_json)
|
||||||
return nest_json_under_key(form_response_dict,"资格审查")
|
return nest_json_under_key(form_response_dict,"资格审查")
|
||||||
|
|
||||||
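As an illustration of the concurrency pattern used above (a minimal sketch, not code from this commit; the two review functions are stand-ins for the project's helpers): the qualification review and the form/response review are independent, so they are submitted as two futures and joined with .result().

from concurrent.futures import ThreadPoolExecutor

def review_qualification():
    return {"资格评审": "..."}

def review_form_response():
    return {"形式评审标准": "...", "响应性评审标准": "..."}

with ThreadPoolExecutor() as executor:
    future_qualification = executor.submit(review_qualification)
    future_form_response = executor.submit(review_form_response)
    result = future_form_response.result()          # waits for the form/response review
    result.update(future_qualification.result())   # waits for the qualification review
print(result)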
|
@ -3,6 +3,8 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from flask import g
|
||||||
|
|
||||||
from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key
|
from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key
|
||||||
from flask_app.main.多线程提问 import multi_threading,read_questions_from_file
|
from flask_app.main.多线程提问 import multi_threading,read_questions_from_file
|
||||||
from flask_app.main.通义千问long import upload_file
|
from flask_app.main.通义千问long import upload_file
|
||||||
@ -17,7 +19,7 @@ def merge_dictionaries_under_common_key(dicts, common_key):
|
|||||||
# 使用字典解包来合并字典
|
# 使用字典解包来合并字典
|
||||||
merged_dict[common_key].update(d[common_key])
|
merged_dict[common_key].update(d[common_key])
|
||||||
else:
|
else:
|
||||||
print(f"Warning: Dictionary does not contain the key {common_key}")
|
g.logger.error(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
|
||||||
|
|
||||||
return merged_dict
|
return merged_dict
|
||||||
def generate_qual_question(matching_keys_list): #这里假设资质、信誉与人员要求 要不都有、要不都没
|
def generate_qual_question(matching_keys_list): #这里假设资质、信誉与人员要求 要不都有、要不都没
|
||||||
@ -71,9 +73,9 @@ def get_consortium_dict(knowledge_name):
|
|||||||
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
||||||
qualify_list.append(response[1])
|
qualify_list.append(response[1])
|
||||||
else:
|
else:
|
||||||
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing response for query index {_}: {e}")
|
g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
|
||||||
consortium_dict = combine_json_results(qualify_list)
|
consortium_dict = combine_json_results(qualify_list)
|
||||||
return consortium_dict
|
return consortium_dict
|
||||||
|
|
||||||
@ -88,9 +90,9 @@ def get_all_dict(knowledge_name):
|
|||||||
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
if response and len(response) > 1: # 检查response存在且有至少两个元素
|
||||||
qualification_list.append(response[1])
|
qualification_list.append(response[1])
|
||||||
else:
|
else:
|
||||||
print(f"Warning: Missing or incomplete response data for query index {_}.")
|
g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing response for query index {_}: {e}")
|
g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
|
||||||
qualification_combined_res = combine_json_results(qualification_list)
|
qualification_combined_res = combine_json_results(qualification_list)
|
||||||
return {'资格评审': qualification_combined_res}
|
return {'资格评审': qualification_combined_res}
|
||||||
def process_qualification(qualification_review,truncate3,knowledge_name):
|
def process_qualification(qualification_review,truncate3,knowledge_name):
|
||||||
@ -99,14 +101,14 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
|
|||||||
if not matching_keys_list: #此时要求全部写在评分办法前附表中,不需要额外提取。
|
if not matching_keys_list: #此时要求全部写在评分办法前附表中,不需要额外提取。
|
||||||
if not non_matching_dict: #古法提取
|
if not non_matching_dict: #古法提取
|
||||||
if truncate3!="":
|
if truncate3!="":
|
||||||
print("type1")
|
g.logger.info("资格评审: type1")
|
||||||
matching_keys_list=["资质条件","财务要求","业绩要求","信誉要求","其他要求"]
|
matching_keys_list=["资质条件","财务要求","业绩要求","信誉要求","其他要求"]
|
||||||
ques=generate_qual_question(matching_keys_list)
|
ques=generate_qual_question(matching_keys_list)
|
||||||
file_id2 = upload_file(truncate3)
|
file_id2 = upload_file(truncate3)
|
||||||
results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long
|
results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long
|
||||||
res_list = []
|
res_list = []
|
||||||
if not results2:
|
if not results2:
|
||||||
print("未调用大模型询问资格评审文件要求!")
|
g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
|
||||||
else:
|
else:
|
||||||
# 打印结果
|
# 打印结果
|
||||||
for question, response in results2:
|
for question, response in results2:
|
||||||
@ -117,11 +119,11 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
|
|||||||
updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典
|
updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典
|
||||||
return updated_qualify_json
|
return updated_qualify_json
|
||||||
else:
|
else:
|
||||||
print("type2")
|
g.logger.info("资格评审: type2")
|
||||||
return get_all_dict(knowledge_name)
|
return get_all_dict(knowledge_name)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print("type3")
|
g.logger.info("资格评审: type3")
|
||||||
new_non_matching_json={'资格评审':non_matching_dict}
|
new_non_matching_json={'资格评审':non_matching_dict}
|
||||||
substring = '联合体'
|
substring = '联合体'
|
||||||
found_key = any(substring in key for key in non_matching_dict.keys()) #没有联合体投标,则需生成,防止重复
|
found_key = any(substring in key for key in non_matching_dict.keys()) #没有联合体投标,则需生成,防止重复
|
||||||
@ -133,18 +135,18 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
|
|||||||
return new_non_matching_json
|
return new_non_matching_json
|
||||||
|
|
||||||
elif matching_keys_list and truncate3=="": #这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
|
elif matching_keys_list and truncate3=="": #这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
|
||||||
print("type4")
|
g.logger.info("资格评审: type4")
|
||||||
final_qualification=get_all_dict(knowledge_name)
|
final_qualification=get_all_dict(knowledge_name)
|
||||||
final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
|
final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
|
||||||
return final_qualify_json
|
return final_qualify_json
|
||||||
else: #大多数情况
|
else: #大多数情况
|
||||||
print("type5")
|
g.logger.info("资格评审: type5")
|
||||||
user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’
|
user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’
|
||||||
file_id2 = upload_file(truncate3)
|
file_id2 = upload_file(truncate3)
|
||||||
results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long
|
results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long
|
||||||
res_list = []
|
res_list = []
|
||||||
if not results2:
|
if not results2:
|
||||||
print("未调用大模型询问资格评审文件要求!")
|
g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
|
||||||
else:
|
else:
|
||||||
# 打印结果
|
# 打印结果
|
||||||
for question, response in results2:
|
for question, response in results2:
|
||||||
|
@ -1,33 +1,115 @@
|
|||||||
def categorize_keys(data, includes):
|
import json
|
||||||
# 初始化结果字典,预设'其他'分类为空字典
|
import re
|
||||||
result = {"其他": {}}
|
|
||||||
|
|
||||||
# 遍历原始字典的每一个键值对
|
|
||||||
for key, value in data.items():
|
|
||||||
if key in includes:
|
|
||||||
# 如果键在includes列表中,直接保留这个键值对
|
|
||||||
result[key] = value
|
|
||||||
else:
|
|
||||||
# 如果键不在includes列表中,将这个键值对加入到'其他'分类中
|
|
||||||
result["其他"][key] = value
|
|
||||||
|
|
||||||
# 如果'其他'分类没有任何内容,可以选择删除这个键
|
|
||||||
if not result["其他"]:
|
|
||||||
del result["其他"]
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
# 使用示例
|
def extract_and_format_from_paths(json_paths, includes, excludes):
|
||||||
data = {
|
"""
|
||||||
"基础信息": "详细描述",
|
从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。
|
||||||
"资格审查": "流程说明",
|
|
||||||
"商务标": "流程详情",
|
|
||||||
"技术标": "合同详细条款",
|
|
||||||
"支付方式": "支付条件说明"
|
|
||||||
}
|
|
||||||
|
|
||||||
includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
|
参数:
|
||||||
result = categorize_keys(data, includes)
|
json_paths (list): 包含多个 JSON 文件路径的列表。
|
||||||
|
includes (list): 包含要检查的关键词的列表。
|
||||||
|
excludes (list): 包含要排除的关键词的列表。
|
||||||
|
|
||||||
print(result)
|
返回:
|
||||||
|
list: 包含所有文件中满足条件的格式化字符串列表。
|
||||||
|
"""
|
||||||
|
all_formatted_results = []
|
||||||
|
|
||||||
|
# 遍历每个文件路径
|
||||||
|
for path in json_paths:
|
||||||
|
try:
|
||||||
|
with open(path, 'r', encoding='utf-8') as file:
|
||||||
|
# 加载 JSON 数据
|
||||||
|
json_data = json.load(file)
|
||||||
|
formatted_results = []
|
||||||
|
|
||||||
|
# 遍历 JSON 数据的每个键值对
|
||||||
|
for key, value in json_data.items():
|
||||||
|
if isinstance(value, dict):
|
||||||
|
# 如果值是字典,检查嵌套字典的每个键值对
|
||||||
|
for sub_key, sub_value in value.items():
|
||||||
|
if any(include in sub_key for include in includes):
|
||||||
|
# 如果子值包含关键词,格式化并添加到结果列表
|
||||||
|
formatted_results.append(f"{sub_value}")
|
||||||
|
elif isinstance(value, str): # clause
|
||||||
|
# 检查是否包含任何 include 关键词
|
||||||
|
for include in includes:
|
||||||
|
if include in value:
|
||||||
|
# 找到 include 之前的内容
|
||||||
|
prefix = value.split(include)[0]
|
||||||
|
# 检查 prefix 是否不包含任何 exclude 关键词
|
||||||
|
if not any(exclude in prefix for exclude in excludes):
|
||||||
|
# 如果不包含任何 exclude 关键词,添加整个 value 到结果列表
|
||||||
|
if '\n' in value:
|
||||||
|
value = value.split('\n', 1)[-1]
|
||||||
|
formatted_results.append(value)
|
||||||
|
break # 找到一个符合条件的就跳出循环
|
||||||
|
|
||||||
|
# 将当前文件的结果添加到总结果列表
|
||||||
|
all_formatted_results.extend(formatted_results)
|
||||||
|
except FileNotFoundError:
|
||||||
|
print(f"Error: The file '{path}' does not exist.")
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
print(f"Error: The file '{path}' contains invalid JSON.")
|
||||||
|
|
||||||
|
return all_formatted_results
|
||||||
|
|
||||||
|
|
||||||
|
def extract_unique_items_from_texts(texts):
|
||||||
|
# 更新正则表达式以包括更广泛的序号类型,包括中文序号
|
||||||
|
pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*')
|
||||||
|
intro_pattern = re.compile(r'^.*?[::]')
|
||||||
|
punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$')
|
||||||
|
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
|
||||||
|
content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字
|
||||||
|
|
||||||
|
all_results = []
|
||||||
|
seen = set()
|
||||||
|
|
||||||
|
for text in texts:
|
||||||
|
# 去除文本中的制表符和换行符
|
||||||
|
text = text.replace('\t', '').replace('\n', '')
|
||||||
|
|
||||||
|
# 删除引导性的文本(直到冒号,但保留冒号后的内容)
|
||||||
|
text = intro_pattern.sub('', text)
|
||||||
|
|
||||||
|
# 替换URL为占位符,并保存URL以便后续还原
|
||||||
|
urls = []
|
||||||
|
def url_replacer(match):
|
||||||
|
urls.append(match.group(0))
|
||||||
|
return f"{{URL{len(urls)}}}"
|
||||||
|
text = url_pattern.sub(url_replacer, text)
|
||||||
|
|
||||||
|
# 使用数字和括号的模式分割文本
|
||||||
|
items = pattern.split(text)
|
||||||
|
|
||||||
|
for item in items:
|
||||||
|
cleaned_item = item.strip()
|
||||||
|
if cleaned_item:
|
||||||
|
# 进一步清理每个条目
|
||||||
|
cleaned_item = pattern.sub('', cleaned_item)
|
||||||
|
cleaned_item = punctuation_pattern.sub('', cleaned_item)
|
||||||
|
cleaned_item = cleaned_item.strip()
|
||||||
|
|
||||||
|
# 还原URL
|
||||||
|
for i, url in enumerate(urls, 1):
|
||||||
|
cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)
|
||||||
|
|
||||||
|
# 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符
|
||||||
|
if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
|
||||||
|
seen.add(cleaned_item)
|
||||||
|
all_results.append(cleaned_item)
|
||||||
|
|
||||||
|
return all_results
|
||||||
|
# 使用上面定义的函数
|
||||||
|
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
|
||||||
|
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
|
||||||
|
json_paths = [truncate_json_path,clause_path] # 根据实际存放的路径来填写
|
||||||
|
includes = ["不得存在","不得与","禁止投标","对投标人的纪律"]
|
||||||
|
excludes=["招标","评标","定标"]
|
||||||
|
# 调用函数
|
||||||
|
results = extract_and_format_from_paths(json_paths, includes,excludes)
|
||||||
|
print(results)
|
||||||
|
res=extract_unique_items_from_texts(results)
|
||||||
|
print(res)
|
||||||
|