9.11 改为全局logger,完善了无效标的bug

This commit is contained in:
zy123 2024-09-11 12:02:09 +08:00
parent ad8fd46cf7
commit a256ceceb4
23 changed files with 467 additions and 270 deletions

View File

@ -2,6 +2,8 @@ from docx import Document
import re
import os
from flask import g
def copy_docx(source_path):
doc = Document(source_path) # 打开源文档
@ -10,7 +12,7 @@ def copy_docx(source_path):
# 获取原文件名并添加后缀
original_file_name = os.path.basename(source_path)
file_name_without_ext, file_ext = os.path.splitext(original_file_name)
modified_file_name = file_name_without_ext + "_invalid" + file_ext
modified_file_name = file_name_without_ext + "_invalid111" + file_ext
destination_path = os.path.join(output_folder, modified_file_name)
new_doc = Document() # 创建新文档
@ -22,6 +24,7 @@ def copy_docx(source_path):
# 寻找最后一个begin_pattern的位置
last_begin_index = -1
for i, paragraph in enumerate(doc.paragraphs):
if begin_pattern.search(paragraph.text):
last_begin_index = i
@ -42,9 +45,10 @@ def copy_docx(source_path):
break
new_doc.save(destination_path) # 保存新文档
g.logger.info("docx截取docx成功")
# 调用函数
if __name__ == '__main__':
source_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest13.docx"
source_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
copy_docx(source_path)

View File

@ -1,6 +1,8 @@
import requests
import mimetypes
from flask import g
def download_file(url, local_filename):
try:
@ -27,13 +29,13 @@ def download_file(url, local_filename):
else:
return full_filename,3
except requests.HTTPError as e:
print(f"HTTP Error: {e}")
g.logger.error(f"download: HTTP Error: {e}")
return None
except requests.RequestException as e:
print(f"Error downloading the file: {e}")
g.logger.error(f"download: Error downloading the file: {e}")
return None
except Exception as e:
print(f"An error occurred: {e}")
g.logger.error(f"download: An error occurred: {e}")
return None
if __name__ == '__main__':

View File

@ -1,6 +1,8 @@
import json
import os
import requests
from flask import g
from flask_app.main.download import download_file
@ -19,14 +21,14 @@ def upload_file(file_path, url):
# 检查响应状态码
if response.status_code == 200:
print("文件上传成功")
g.logger.info("format_change 文件上传成功")
receive_file_response = response.content.decode('utf-8')
receive_file_json = json.loads(receive_file_response)
receive_file_url = receive_file_json["data"]
else:
print(f"文件上传失败,状态码: {response.status_code}")
print(response.text)
g.logger.info(f"format_change 文件上传失败,状态码: {response.status_code}")
g.logger.info(f"format_change {response.text}")
return receive_file_url
@ -44,7 +46,7 @@ def pdf2docx(local_path_in):
filename, folder = get_filename_and_folder(local_path_in) #输入输出在同一个文件夹
local_path_out=os.path.join(folder,filename) #输出文件名
downloaded_filepath,file_type=download_file(receive_download_url, local_path_out)
print("have downloaded file to:",downloaded_filepath)
# logging formats lazily with `msg % args`; an extra positional argument needs a
# matching %s placeholder, otherwise the record fails to format and is dropped.
g.logger.info("format_change p2d:have downloaded file to: %s", downloaded_filepath)
return downloaded_filepath
def docx2pdf(local_path_in):
@ -53,7 +55,7 @@ def docx2pdf(local_path_in):
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
local_path_out = os.path.join(folder, filename) # 输出文件名
downloaded_filepath,file_type = download_file(receive_download_url, local_path_out)
print("have downloaded file to:", downloaded_filepath)
# logging formats lazily with `msg % args`; an extra positional argument needs a
# matching %s placeholder, otherwise the record fails to format and is dropped.
g.logger.info("format_change d2p:have downloaded file to: %s", downloaded_filepath)
return downloaded_filepath
if __name__ == '__main__':

View File

@ -1,6 +1,9 @@
import json
import re
from flask import g
def extract_content_from_json(json_data):
"""提取 { 和 } 之间的内容,并将其解析为字典"""
if not json_data.strip():
@ -11,10 +14,10 @@ def extract_content_from_json(json_data):
json_data = match.group(0)
return json.loads(json_data) #返回字典
except json.JSONDecodeError as e:
print(f"JSON decode error: {e}")
g.logger.info(f"json_utils: extract_content_from_json: JSON decode error: {e}")
return {}
else:
print("No valid JSON content found.")
g.logger.info("json_utils: extract_content_from_json: No valid JSON content found.")
return {}
def clean_json_string(json_string):
@ -63,18 +66,18 @@ def add_keys_to_json(target_dict, source_dict):
dict: 更新后的字典
"""
if not target_dict:
print("Error: Target dictionary is empty.")
g.logger.error("json_utils: Error: Target dictionary is empty.")
return {}
if len(target_dict) != 1:
print("Error: Target dictionary must contain exactly one top-level key.")
g.logger.error("json_utils: Error: Target dictionary must contain exactly one top-level key.")
return target_dict
# 获取唯一的外层键
target_key, existing_dict = next(iter(target_dict.items()))
if not isinstance(existing_dict, dict):
print(f"Error: The value under the key '{target_key}' is not a dictionary.")
g.logger.error(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
return target_dict
# 合并字典
@ -92,7 +95,8 @@ def rename_outer_key(original_data,new_key):
# 提取原始数据中的唯一外层值(假设只有一个外层键)
if not original_data or not isinstance(original_data, dict):
return {} # 如果输入无效或不是字典,则返回空字典
g.logger.error("json_utils: Error: Invalid input or input is not a dictionary.") # 如果输入无效或不是字典,则返回空字典
return {}
# 使用 next(iter(...)) 提取第一个键的值
original_value = next(iter(original_data.values()), {})

View File

@ -5,7 +5,7 @@ import time
import uuid
from datetime import datetime, timedelta
from flask import Flask, request, jsonify, Response, stream_with_context
from flask import Flask, request, jsonify, Response, stream_with_context, g
import json
import os
from flask_app.main.download import download_file
@ -31,30 +31,30 @@ class CSTFormatter(logging.Formatter):
return s
def create_logger(unique_id):
"""为每个请求创建一个新的日志器,日志器的日志文件存放在指定的输出文件夹中"""
@app.before_request
def before_request():
# 每个请求开始前初始化 logger
create_logger() # 确保这个函数中设置了 g.logger
def create_logger():
unique_id = str(uuid.uuid4())
g.unique_id = unique_id
output_folder = f"flask_app/static/output/{unique_id}"
# output_folder =f"C:/Users/Administrator/Desktop/招标文件/test/{unique_id}"
if not os.path.exists(output_folder):
os.makedirs(output_folder, exist_ok=True)
log_filename = "log.txt"
log_path = os.path.join(output_folder, log_filename)
logger = logging.getLogger(unique_id) # 使用 unique_id 作为日志器名字
if not logger.handlers: # 避免重复添加处理器
# 文件处理器
logger = logging.getLogger(unique_id)
if not logger.handlers:
file_handler = logging.FileHandler(log_path)
file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)
# 流处理器(控制台输出)
stream_handler = logging.StreamHandler(sys.stdout)
stream_formatter = logging.Formatter('%(message)s') # 简化的格式,只输出消息
stream_handler.setFormatter(stream_formatter)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(stream_handler)
logger.setLevel(logging.INFO)
return logger, output_folder
g.logger = logger
@app.route('/upload', methods=['POST'])
@ -64,10 +64,10 @@ def zbparse():
return file_url
try:
app.logger.info("starting parsing url:" + file_url)
final_json_path, output_folder, logger = download_and_process_file(file_url)
final_json_path, output_folder = download_and_process_file(file_url)
if not final_json_path:
return jsonify({'error': 'File processing failed'}), 500
response = generate_response(final_json_path, logger) # 先获取响应内容
response = generate_response(final_json_path) # 先获取响应内容
# remove_directory(output_folder) # 然后删除文件夹
return response # 最后返回获取的响应
except Exception as e:
@ -75,6 +75,7 @@ def zbparse():
return jsonify({'error': str(e)}), 500
# 流式
# def zbparse():
# file_url = validate_request()
# if isinstance(file_url, tuple): # Check if the returned value is an error response
@ -87,40 +88,42 @@ def zbparse():
# return jsonify({'error': str(e)}), 500
def process_and_stream(file_url):
unique_id = str(uuid.uuid4())
logger, output_folder = create_logger(unique_id)
filename = "ztbfile"
downloaded_filename = os.path.join(output_folder, filename)
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
if downloaded_filepath is None or file_type == 3:
logger.error("Unsupported file type or failed to download file")
error_response = {
'message': 'File processing failed',
'filename': None,
'data': json.dumps({'error': 'File processing failed'})
}
yield f"data: {json.dumps(error_response)}\n\n"
return
logger.info("Local file path: " + downloaded_filepath)
for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id):
response = {
'message': 'Processing',
'filename': os.path.basename(downloaded_filepath),
'data': data
}
yield f"data: {json.dumps(response)}\n\n"
final_response = {
'message': 'File uploaded and processed successfully',
'filename': os.path.basename(downloaded_filepath),
'data': 'END'
}
yield f"data: {json.dumps(final_response)}\n\n"
# 分段返回
# def process_and_stream(file_url):
# logger = g.logger
# unique_id = g.unique_id
# output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径
# filename = "ztbfile"
# downloaded_filename = os.path.join(output_folder, filename)
#
# downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
#
# if downloaded_filepath is None or file_type == 3:
# logger.error("Unsupported file type or failed to download file")
# error_response = {
# 'message': 'File processing failed',
# 'filename': None,
# 'data': json.dumps({'error': 'File processing failed'})
# }
# yield f"data: {json.dumps(error_response)}\n\n"
# return
#
# logger.info("Local file path: " + downloaded_filepath)
#
# for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id):
# response = {
# 'message': 'Processing',
# 'filename': os.path.basename(downloaded_filepath),
# 'data': data
# }
# yield f"data: {json.dumps(response)}\n\n"
#
# final_response = {
# 'message': 'File uploaded and processed successfully',
# 'filename': os.path.basename(downloaded_filepath),
# 'data': 'END'
# }
# yield f"data: {json.dumps(final_response)}\n\n"
def validate_request():
@ -133,8 +136,9 @@ def validate_request():
def download_and_process_file(file_url):
unique_id = str(uuid.uuid4()) # 生成一个唯一的 UUID
logger, output_folder = create_logger(unique_id)
logger = g.logger
unique_id = g.unique_id
output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径
filename = "ztbfile"
downloaded_filename = os.path.join(output_folder, filename)
@ -147,7 +151,7 @@ def download_and_process_file(file_url):
logger.info("Local file path: " + downloaded_filepath)
processed_file_path = main_processing(output_folder, downloaded_filepath, file_type, unique_id)
return processed_file_path, output_folder, logger
return processed_file_path, output_folder
@app.route('/api/test_zbparse', methods=['POST'])
@ -189,10 +193,15 @@ def test_process_and_stream():
yield f"data: {json.dumps(final_response)}\n\n"
def generate_response(final_json_path, logger):
def generate_response(final_json_path):
logger = g.logger
# 检查final_json_path是否为空或None
if not final_json_path:
logger.error('Empty or None path provided for final_json.')
return jsonify({'error': 'No path provided for final_json.'}), 400
if not os.path.exists(final_json_path):
logger.error('JSON file not found at path: ' + final_json_path)
return jsonify({'error': 'JSON file not found'}), 404
logger.error('final_json not found at path: ' + final_json_path)
return jsonify({'error': 'final_json not found'}), 404
with open(final_json_path, 'r', encoding='utf-8') as f:
logger.info('final_json_path:' + final_json_path)
zbparse_data = json.load(f)

View File

@ -3,6 +3,8 @@ import os
from docx import Document
import json
from flask import g
def read_tables_from_docx(file_path):
"""读取DOCX文件中的表格数据并以嵌套字典的形式返回."""
@ -87,13 +89,13 @@ def save_data_to_json(data, output_folder):
"""将数据保存到JSON文件中."""
with open(output_filepath, 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=False, indent=4)
print(f"The data has been processed and saved to '{output_filepath}'.")
g.logger.info(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
return output_filepath
def extract_tables_main(path, output_folder):
if not os.path.exists(path):
print(f"The specified file does not exist: {path}")
g.logger.error(f"table_content_extraction: The specified file does not exist: {path}")
return ""
# 读取文档表格数据
table_data = read_tables_from_docx(path)

View File

@ -1,28 +1,19 @@
import re
from PyPDF2 import PdfReader
pattern = re.compile(r'(\b\d+\s*\.\s*\d+\s*\.\s*\d+\b)|(\b3\s*\.\s*2\b)')
text = '3.1.3已标价工程量清单中漏报了某个工程子目的单价、合价或总额价则漏报的工程 子目单价、合价和总额价视为已含入其他工程子目的单价、合价和总额价之中。'
match = pattern.search(text)
if match:
print("匹配成功:", match.group())
else:
print("未找到匹配")
def extract_common_header(pdf_path):
pdf_document = PdfReader(pdf_path)
headers = []
num_pages_to_read = 3 # 预读页数
# 使用 findall 查看所有匹配
all_matches = pattern.findall(text)
print("所有匹配:", all_matches)
for i in range(min(num_pages_to_read, len(pdf_document.pages))):
page = pdf_document.pages[i]
text = page.extract_text()
if text: # 确保页面有文本内容
first_line = text.strip().split('\n')[0]
headers.append(first_line)
if len(headers) < 2:
return "" # 如果没有足够的页来比较,返回空字符串
# 使用set交集来找出公共部分
common_header = set(headers[0].split()).intersection(*[set(header.split()) for header in headers[1:]])
common_header = ' '.join(common_header)
return common_header
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest19.pdf"
res=extract_common_header(input_path)
print(res)
# 打印文本的前10个字符的ASCII值检查是否有不可见字符
print("文本前10个字符的ASCII值:")
for char in text[:10]:
print(f"{char}: {ord(char)}")

View File

@ -122,14 +122,11 @@ def judge_whether_main(file_path,output_folder): #传入招标文件中‘投
output_json_path = os.path.join(output_folder,'judge_exist.json')
read_pdf_and_judge_main(file_path, output_json_path) #提取打勾符号
qianwen_answer = qianwen_ask(output_json_path, user_query1) # 调用普通千问判断是、否、未知
print("qianwen_answer:" + qianwen_answer)
user_query2 = construct_judge_questions(qianwen_answer) # 提取回答为”未知“的键
# 判断user_query是否为空
if user_query2:
print("user_query:" + user_query2)
file_id = upload_file(file_path)
res = qianwen_long(file_id, user_query2) #整个前附表一起传问千问long
print(res)
return process_judge_content(qianwen_answer, res)
else:
@ -143,7 +140,6 @@ def process_judge_content(original_json, update_json): #用新的数据合并
original = extract_content_from_json(original_json)
updates = extract_content_from_json(update_json)
original.update(updates)
print(original)
return merge_json_to_list(original)

View File

@ -55,7 +55,6 @@ def combine_technical_and_business(data, target_values1, target_values2):
def combine_evaluation_standards(truncate2):
# 商务标、技术标评分项:千问
print("starting商务标技术标...")
file_id = upload_file(truncate2)
user_query_2 = (
"根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
@ -64,7 +63,6 @@ def combine_evaluation_standards(truncate2):
target_values2=['投标报价','商务标','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2)
evaluation_combined_res = json.dumps(update_json,ensure_ascii=False,indent=4)
print("商务标技术标done")
return evaluation_combined_res
if __name__ == "__main__":

View File

@ -1,5 +1,7 @@
import json
from flask import g
from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
@ -31,8 +33,8 @@ def combine_basic_info(baseinfo_list):
dynamic_key_handling(key_groups, relevant_keys_detected)
# 打印 key_groups 的内容检查它们是否被正确更新
print("Updated key_groups after dynamic handling:")
print(key_groups)
# g.logger.info("Updated key_groups after dynamic handling:")
# 使用合并后的字典创建最终输出
for group_name, keys in key_groups.items():
@ -79,7 +81,6 @@ def judge_consortium_bidding(baseinfo_list):
return accept_bidding
def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表
# 调用大模型回答项目基础信息
print("starting基础信息...")
baseinfo_list = []
# baseinfo_file_path='../static/提示词/前两章提问总结.txt'
baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径
@ -90,10 +91,9 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
if response and len(response) > 1: # 检查response存在且有至少两个元素
baseinfo_list.append(response[1])
else:
print(f"Warning: Missing or incomplete response data for query index {_}.")
g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
except Exception as e:
print(f"Error processing response for query index {_}: {e}")
print("basic信息done...")
g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}")
# 判断是否分包、是否需要递交投标保证金等
chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
baseinfo_list.append(merged)
@ -109,7 +109,7 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
file_id=upload_file(truncate0)
res2 = multi_threading(judge_questions, "",file_id,2) #调用千问-long
if not res2:
print("errror!")
g.logger.error("基础信息整合: multi_threading errror!")
else:
# 打印结果
for question, response in res2:
@ -119,9 +119,9 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
# if response and len(response) > 1: # 检查response存在且有至少两个元素
# baseinfo_list.append(response[1])
# else:
# print(f"Warning: Missing or incomplete response data for query index {_}.")
# g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
# except Exception as e:
# print(f"Error processing response for query index {_}: {e}")
# g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}")
rebidding_situation = extract_from_notice(clause_path, 3) #"重新招标, 不再招标和终止招标"需从投标人须知正文提取
@ -131,7 +131,6 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #
update_baseinfo_list=combine_basic_info(baseinfo_list) #整合基础信息核心代码
baseinfo_combined_res = combine_json_results(update_baseinfo_list) # 返回值是字典
print("基础信息done")
return nest_json_under_key(baseinfo_combined_res, "基础信息") #返回值是json字符串

View File

@ -6,9 +6,9 @@ import concurrent.futures
import time
from dashscope import Assistants, Messages, Runs, Threads
from flask import g
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
from flask_app.main.通义千问long import qianwen_long, upload_file
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
@ -31,7 +31,6 @@ prompt = """
"""
prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}'
def read_questions_from_file(file_path):
questions = []
with open(file_path, 'r', encoding='utf-8') as file:
@ -119,8 +118,10 @@ def pure_assistant():
def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type):
if llm_type==1:
g.logger.info(f"rag_assistant! question:{question}")
assistant = rag_assistant(knowledge_name)
elif llm_type==2:
g.logger.info(f"qianwen_long! question:{question}")
qianwen_res = qianwen_long(file_id,question)
result_queue.put((ans_index,(question,qianwen_res)))
return
@ -130,6 +131,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans)
def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
g.logger.info("多线程提问starting multi_threading...")
result_queue = queue.Queue()
# 使用 ThreadPoolExecutor 管理线程
@ -148,7 +150,7 @@ def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
try:
future.result() # 可以用来捕获异常或确认任务完成
except Exception as exc:
print(f"Query {index} generated an exception: {exc}")
g.logger.error(f"Query {index} generated an exception: {exc}")
# 从队列中获取所有结果并按索引排序
results = [None] * len(queries)

View File

@ -3,6 +3,8 @@ import re
import json
import time
from flask import g
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.main.json_utils import extract_content_from_json
@ -187,9 +189,9 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
temp = extract_content_from_json(response[1])
first_response_list.append(temp)
else:
print(f"Warning: Missing or incomplete response data for query index {_}.")
g.logger.error(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.")
except Exception as e:
print(f"Error processing response for query index {_}: {e}")
g.logger.error(f"形式响应评审:Error processing response for query index {_}: {e}")
# Assume JSON file paths are defined or configured correctly
# print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.45'}]

View File

@ -2,6 +2,9 @@ from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask import g
def extract_common_header(pdf_path):
pdf_document = PdfReader(pdf_path)
headers = []
@ -59,9 +62,9 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
with open(output_pdf_path, 'wb') as f:
output_doc.write(f)
print(f"已截取并保存页面从 {start_page + 1}{end_page + 1}{output_pdf_path}")
g.logger.info(f"已截取并保存页面从 {start_page + 1}{end_page + 1}{output_pdf_path}")
else:
print("提供的页码范围无效。")
g.logger.error("提供的页码范围无效。")
return output_pdf_path
@ -105,7 +108,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
break # 找到结束页后退出循环
if start_page is None or end_page is None:
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
g.logger.error(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
else:
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@ -154,7 +157,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
if output_suffix == "qualification" or output_suffix =="invalid":
return extract_pages_twice(pdf_path, output_folder, output_suffix)
else:
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
g.logger.error(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
else:
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@ -182,7 +185,7 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
if output_pdf_path and os.path.isfile(output_pdf_path):
return [output_pdf_path] # 以列表形式返回,以保持一致性
else:
print("提供的路径既不是文件夹也不是PDF文件。")
g.logger.error("提供的路径既不是文件夹也不是PDF文件。")
return []
@ -232,7 +235,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|第二卷', re.MULTILINE)
output_suffix = "invalid"
else:
print("无效的选择")
g.logger.error("无效的选择:请选择1-6")
return None
# Process the selected input

View File

@ -4,6 +4,9 @@ import fitz
import re
import os
from flask import g
def extract_text_from_docx(file_path):
doc = docx.Document(file_path)
return '\n'.join([para.text for para in doc.paragraphs])
@ -126,7 +129,7 @@ def convert_to_json(file_path, start_word, end_phrases):
def convert_clause_to_json(input_path,output_folder,type=1):
if not os.path.exists(input_path):
print(f"The specified file does not exist: {input_path}")
g.logger.error(f"The specified file does not exist: {input_path}")
return ""
if type==1:
start_word = "投标人须知正文"

View File

@ -3,6 +3,9 @@ import json
import logging
import os
import time
from flask import g
from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
@ -17,26 +20,27 @@ from flask_app.main.商务标技术标整合 import combine_evaluation_standards
from flask_app.main.format_change import pdf2docx, docx2pdf
from flask_app.main.docx截取docx import copy_docx
global_logger = None
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # 获取默认的日志器
logger = logging.getLogger(unique_id)
return logger
# def get_global_logger(unique_id):
# if unique_id is None:
# return logging.getLogger() # 获取默认的日志器
# logger = logging.getLogger(unique_id)
# return logger
# 可能有问题pdf转docx导致打勾符号消失
def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
g.logger.info("starting 文件预处理...")
# 根据文件类型处理文件路径
global docx_path, pdf_path
if file_type == 1: # docx
docx_path = downloaded_file_path
pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理
elif file_type == 2: # pdf
pdf_path = downloaded_file_path
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
else:
# 如果文件类型不是预期中的1或2记录错误并返回None
g.logger.error("Unsupported file type provided. Preprocessing halted.")
return None
# 上传知识库
knowledge_name = "招标解析" + unique_id
@ -44,7 +48,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder) # [前附表, 评标办法, 须知正文, 资格审查条件]
print(truncate_files)
# 处理各个部分
truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx
@ -56,6 +59,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
truncate1 = truncate_files[1] #评标办法前附表
truncate3 = truncate_files[3] #资格审查表
clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json
g.logger.info("文件预处理done")
return {
'input_file_path':downloaded_file_path,
@ -90,61 +94,63 @@ def post_processing(data,includes):
return result
# 基本信息
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表
global_logger.info("starting基础信息...")
g.logger.info("starting基础信息...")
basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path)
global_logger.info("基础信息done")
g.logger.info("基础信息done")
return basic_res
# 形式、响应、资格评审
def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder):
global_logger.info("starting资格审查...")
g.logger.info("starting资格审查...")
review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath,
clause_path,input_file,output_folder)
global_logger.info("资格审查done")
g.logger.info("资格审查done")
return review_standards_res
# 评分细则
def fetch_evaluation_standards(truncate1): # 评标办法前附表
global_logger.info("starting商务标技术标...")
g.logger.info("starting商务标技术标...")
evaluation_standards_res = combine_evaluation_standards(truncate1)
global_logger.info("商务标技术标done")
g.logger.info("商务标技术标done")
return evaluation_standards_res
# 无效、废标项解析
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
# 废标项要求:千问
global_logger.info("starting无效标与废标...")
g.logger.info("starting无效标与废标...")
find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
global_logger.info("无效标与废标done...")
g.logger.info("无效标与废标done...")
return find_invalid_res
# 投标文件要求
def fetch_bidding_documents_requirements(clause_path):
global_logger.info("starting投标文件要求...")
g.logger.info("starting投标文件要求...")
fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求")
global_logger.info("投标文件要求done...")
g.logger.info("投标文件要求done...")
return qualify_nested_res
# 开评定标流程
def fetch_bid_opening(clause_path):
global_logger.info("starting开评定标流程...")
g.logger.info("starting开评定标流程...")
fetch_bid_opening_json = extract_from_notice(clause_path, 2)
qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程")
global_logger.info("开评定标流程done...")
g.logger.info("开评定标流程done...")
return qualify_nested_res
def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf
global global_logger
global_logger = get_global_logger(unique_id)
# global global_logger
# global_logger = get_global_logger(unique_id)
# Preprocess files and get necessary data paths and knowledge index
processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
if not processed_data:
return ""
with concurrent.futures.ThreadPoolExecutor() as executor:
# Submit all tasks to the executor
@ -174,7 +180,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
result = futures[key].result()
comprehensive_responses.append(result)
except Exception as exc:
global_logger.info(f"Error processing {key}: {exc}")
g.logger.error(f"Error processing {key}: {exc}")
# 合并 JSON 结果
combined_final_result = combine_json_results(comprehensive_responses)
includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
@ -184,7 +190,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
final_result_path = os.path.join(output_folder, "final_result.json")
with open(final_result_path, 'w', encoding='utf-8') as file:
json.dump(modified_json, file, ensure_ascii=False, indent=2)
global_logger.info("final_result.json has been saved")
g.logger.info("final_result.json has been saved")
deleteKnowledge(processed_data['knowledge_index'])
return final_result_path

View File

@ -2,6 +2,9 @@ import re
import PyPDF2
import json
from flask import g
def extract_key_value_pairs(text):
# 更新正则表达式来包括对"团"的处理和行尾斜线
pattern = r'\d+\.\d+\s*([\w\s\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
@ -57,15 +60,13 @@ def read_pdf_and_judge_main(file_path, output_json_path):
with open(file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
num_pages = len(reader.pages)
print(f"Total pages: {num_pages}")
all_data = {}
for page_num in range(num_pages):
page = reader.pages[page_num]
text = page.extract_text() if page.extract_text() else ""
# 使用正则表达式删除每页开头的页码,格式通常为“数字+空格”
cleaned_text = re.sub(r'^\d+\s+', '', text)
print(cleaned_text)
# print(cleaned_text)
key_value_pairs = extract_key_value_pairs(cleaned_text)
all_data.update(key_value_pairs)
@ -74,7 +75,7 @@ def read_pdf_and_judge_main(file_path, output_json_path):
with open(output_json_path, "w", encoding="utf-8") as json_file:
json.dump(all_data, json_file, ensure_ascii=False, indent=4)
print(f"Data extraction complete and saved to '{output_json_path}'.")
g.logger.info(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")
if __name__ == "__main__":

View File

@ -3,75 +3,125 @@ import json
import os.path
import time
import re
from flask import g
from flask_app.main.json_utils import combine_json_results, nest_json_under_key
from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
from flask_app.main.禁止投标情形 import find_forbidden
from 禁止投标情形 import process_string_list
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    """Collect paragraphs of a .docx file that match keywords, plus the list that follows them.

    Every non-empty paragraph that matches one of ``keywords`` starts a new
    entry keyed by the paragraph text. When that paragraph also matches one of
    ``follow_up_keywords``, the paragraphs after it are appended to the entry:

    * if the paragraph starts with a dotted number such as ``3.4``, collection
      continues until a paragraph starts with a number of the same depth, the
      next sibling number or the next parent number;
    * otherwise the following paragraphs are read ahead immediately and kept
      for as long as they start with the numbering style detected on the
      first numbered one; the scan stops at the first paragraph that does not.

    Args:
        doc_path: Path of the .docx document (opened with ``docx.Document``).
        keywords: One regular expression or a list of them, searched
            case-insensitively in each paragraph.
        follow_up_keywords: Regular expressions that signal that an
            enumeration follows the matched paragraph.

    Returns:
        OrderedDict mapping each matched paragraph text to the list of
        collected texts (the matched paragraph first), in document order.
    """
    from collections import OrderedDict
    from docx import Document
    import re

    if isinstance(keywords, str):
        keywords = [keywords]

    doc = Document(doc_path)
    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None

    def match_keywords(text, patterns):
        """Return True when any of the regex patterns is found in text."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def extract_from_text(text, current_index):
        """Process one paragraph; return the index of the last paragraph consumed."""
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            if current_section_pattern and current_section_pattern.match(text):
                # A new section starts here: stop collecting, then fall through
                # so this paragraph can itself be tested against the keywords.
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
                return current_index

        if match_keywords(text, keywords):
            active_key = text
            extracted_paragraphs[active_key] = [text]
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
                if section_number:
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')

                    # Any number with the same depth as the current one, e.g. 3.4.5
                    pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+'
                    parts = current_section_number.split('.')
                    matched_patterns = [pattern]

                    # The next section on the same level, e.g. 3.4 -> 3.5
                    parts[-1] = str(int(parts[-1]) + 1)
                    matched_patterns.append(r'^' + r'\s*\.\s*'.join(parts))

                    # The next section on the parent level, e.g. 3.4 -> 4
                    if len(parts) > 1:
                        parent_section_parts = parts[:-1]
                        parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
                        matched_patterns.append(r'^' + r'\s*\.\s*'.join(parent_section_parts))

                    combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
                    current_section_pattern = re.compile(combined_pattern)
                else:
                    found_next_number = False
                    current_section_pattern = None

                    # NOTE(review): the paragraph that ends this read-ahead is not
                    # looked at again by the outer loop, which resumes after it -
                    # confirm that this is intended.
                    while current_index < len(doc.paragraphs) - 1:
                        current_index += 1
                        next_text = doc.paragraphs[current_index].text.strip()
                        if not found_next_number:
                            # \uFF08 / \uFF09 are the full-width parentheses.
                            next_section_number = re.match(
                                r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|([(\uFF08]\d+[)\uFF09])',
                                next_text,
                            )
                            if next_section_number:
                                found_next_number = True
                                if next_section_number.group(1):
                                    section_parts = next_section_number.group(1).split('.')
                                    dynamic_pattern = (
                                        r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                    )
                                else:
                                    dynamic_pattern = r'^[(\uFF08]\d+[)\uFF09]'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and current_section_pattern.match(next_text):
                            extracted_paragraphs[active_key].append(next_text)
                        else:
                            continue_collecting = False
                            active_key = None
                            break

        return current_index

    # extract_from_text may read ahead, so the index it returns drives the loop.
    index = 0
    while index < len(doc.paragraphs):
        index = extract_from_text(doc.paragraphs[index].text.strip(), index)
        index += 1

    return extracted_paragraphs
def preprocess_text_list(text_list):
    """Split elements that contain several clauses glued together on one line.

    A split happens right after a CJK character or sentence punctuation when
    it is followed by whitespace and then a letter, a digit, or a
    parenthesised number such as ``(1)`` (ASCII or full-width parentheses).
    Each resulting piece is stripped and empty pieces are dropped.

    Args:
        text_list: List of strings to examine.

    Returns:
        A new list of non-empty, stripped strings in the original order.
    """
    # The full-width characters are written as escapes so the pattern survives
    # tools that drop them: FF08/FF09 = parentheses, FF1F = ?, FF01 = !, FF1B = ;
    split_pattern = re.compile(
        r'(?<=[\u4e00-\u9fff。?!;\uFF1F\uFF01\uFF1B])'
        r'(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\uFF08[1-9]\d*\uFF09)'
    )
    new_text_list = []
    for text in text_list:
        # The pattern is zero-width: the text is cut at the position itself and
        # the whitespace that follows is removed by strip().
        pieces = (part.strip() for part in split_pattern.split(text))
        new_text_list.extend(piece for piece in pieces if piece)
    return new_text_list
def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化
all_texts1 = []
@ -107,11 +157,14 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表
else:
print(text_list)
new_text_list=preprocess_text_list(text_list)
print(new_text_list)
pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
data = re.sub(pattern, '', text_list[0]).strip()
data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号
# 将修改后的第一个元素和剩余的元素连接起来
text_list[0] = data # 更新列表中的第一个元素
joined_text = "\n".join(text_list) # 如果列表中有多个元素,则连接它们
new_text_list[0] = data # 更新列表中的第一个元素
joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们
all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中
return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果
@ -219,6 +272,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
follow_up_keywords = [r'\s*形\s*之\s*一', r'\s*况\s*之\s*一', r'\s*列', r'\s*下']
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果
# print(extracted_contents)
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
qianwen_txt = all_texts1 + all_tables1
@ -234,7 +288,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
# 更新计数器,每次循环递增
counter += 1
file_id = upload_file(output_file)
print("starting qianwen-long...")
qianwen_ans = qianwen_long(file_id, user_query)
selected_contents = []
num_list = process_string_list(qianwen_ans)
@ -256,7 +309,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
return res
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
print("starting无效标与废标...")
queries = [
(r'\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号若情况不存在返回[]。",
@ -280,7 +332,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
results.append(future.result())
#禁止投标
print("starting不得存在的情形...")
# g.logger.info("starting不得存在的情形...")
forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
results.append(forbidden_res)
@ -288,17 +340,18 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
for d in results:
combined_dict.update(d)
print("无效标与废标done...")
# g.logger.info("无效标与废标done...")
return nest_json_under_key(combined_dict, "无效标与废标项")
if __name__ == '__main__':
start_time = time.time()
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
truncate3="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output"
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest11_invalid.docx'
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
truncate3="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_qualification.pdf"
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e"
# doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_invalid.docx'
results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate3)
end_time = time.time()
print("Elapsed time:", str(end_time - start_time))

View File

@ -1,6 +1,7 @@
import os
import uuid
from flask import g
from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
@ -15,6 +16,7 @@ def addfileToKnowledge(filepath,knowledge_name):
knowledge_name,
verbose=True,
)
g.logger.info("knowledge created successfully!!!")
# index = DashScopeCloudIndex(knowledge_name)
# index._insert(documents)
# return index, documents
@ -26,6 +28,7 @@ def deleteKnowledge(index):
workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID')
client = create_client()
delete_index(client,workspace_id,index_id)
# Log through .info(), like the other g.logger call sites in this change;
# calling g.logger itself would raise TypeError if it is a standard logging.Logger.
g.logger.info("knowledge deleted successfully!!!")

View File

@ -4,16 +4,18 @@ import os
import re
from PyPDF2 import PdfWriter, PdfReader
from flask import g
from flask_app.main.通义千问long import upload_file, qianwen_long
def extract_and_format_from_paths(json_paths, includes):
def extract_and_format_from_paths(json_paths, includes, excludes):
"""
从多个 JSON 文件路径读取数据提取包含特定关键词的内容并按照格式要求合并
参数:
json_paths (list): 包含多个 JSON 文件路径的列表
includes (list): 包含要检查的关键词的列表
excludes (list): 包含要排除的关键词的列表
返回:
list: 包含所有文件中满足条件的格式化字符串列表
@ -33,43 +35,74 @@ def extract_and_format_from_paths(json_paths, includes):
if isinstance(value, dict):
# 如果值是字典,检查嵌套字典的每个键值对
for sub_key, sub_value in value.items():
if any(include in sub_value for include in includes):
if any(include in sub_key for include in includes):
# 如果子值包含关键词,格式化并添加到结果列表
formatted_results.append(f"{sub_key}: {sub_value}")
elif isinstance(value, str):
# 如果值是字符串,直接检查是否包含关键词
if any(include in value for include in includes):
# 如果值包含关键词,添加到结果列表
formatted_results.append(f"{sub_value}")
elif isinstance(value, str): # clause
# 检查是否包含任何 include 关键词
for include in includes:
if include in value:
# 找到 include 之前的内容
prefix = value.split(include)[0]
# 检查 prefix 是否不包含任何 exclude 关键词
if not any(exclude in prefix for exclude in excludes):
# 如果不包含任何 exclude 关键词,添加整个 value 到结果列表
if '\n' in value:
value = value.split('\n', 1)[-1]
formatted_results.append(value)
break # 找到一个符合条件的就跳出循环
# 将当前文件的结果添加到总结果列表
all_formatted_results.extend(formatted_results)
except FileNotFoundError:
print(f"Error: The file '{path}' does not exist.")
g.logger.error(f"禁止投标情形: Error: The file '{path}' does not exist.")
except json.JSONDecodeError:
print(f"Error: The file '{path}' contains invalid JSON.")
g.logger.error(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
return all_formatted_results
def extract_unique_items_from_texts(texts):
pattern = re.compile(r'(?:\d+\.|\\d+\|\(\d+\)|\d+\))\s*')
intro_pattern = re.compile(r'^.*[:]')
# 更新正则表达式以包括更广泛的序号类型,包括中文序号
pattern = re.compile(r'(?:\d+\.|\\d+\|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*')
intro_pattern = re.compile(r'^.*?[:]')
punctuation_pattern = re.compile(r'[;。,、..,:;!?]+$')
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字
all_results = []
seen = set()
for text in texts:
# 去除文本中的制表符和换行符
text = text.replace('\t', '').replace('\n', '')
# 删除引导性的文本(直到冒号,但保留冒号后的内容)
text = intro_pattern.sub('', text)
# 替换URL为占位符并保存URL以便后续还原
urls = []
def url_replacer(match):
urls.append(match.group(0))
return f"{{URL{len(urls)}}}"
text = url_pattern.sub(url_replacer, text)
# 使用数字和括号的模式分割文本
items = pattern.split(text)
for item in items:
cleaned_item = item.strip()
if cleaned_item:
# 进一步清理每个条目
cleaned_item = pattern.sub('', cleaned_item)
cleaned_item = punctuation_pattern.sub('', cleaned_item)
cleaned_item = cleaned_item.strip()
if cleaned_item and cleaned_item not in seen:
# 还原URL
for i, url in enumerate(urls, 1):
cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)
# 添加未见过的独特条目确保它包含足够的实质内容并长度大于3个字符
if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
seen.add(cleaned_item)
all_results.append(cleaned_item)
@ -93,9 +126,9 @@ def merge_pdfs(paths, output_filename):
if output_path:
with open(output_path, 'wb') as out:
pdf_writer.write(out)
print(f"Merged PDF saved to {output_path}")
g.logger.info(f"禁止投标情形: Merged PDF saved to {output_path}")
else:
print("No files to merge.")
g.logger.error("禁止投标情形: No files to merge.")
return output_path
def process_string_list(string_list):
@ -120,7 +153,7 @@ def process_string_list(string_list):
actual_list = ast.literal_eval(formatted_list)
return actual_list
except SyntaxError as e:
print(f"Error parsing list: {e}")
g.logger.error(f"禁止投标情形: Error parsing list: {e}")
return []
else:
# 如果没有匹配到内容,返回空列表
@ -129,15 +162,17 @@ def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须
# output_filename="merged.pdf"
# paths=[truncate1,truncate4]
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
if truncate3:
file_id=upload_file(truncate3)
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些请按json列表格式给我提供信息键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
else:
qianwen_forbidden_str="[]"
actual_list=process_string_list(qianwen_forbidden_str) #提取出字符串列表 ["xxx","xx"]
# Phrases that mark a "forbidden situation" clause.
includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
# Words that, when found before the matched phrase, mean the clause is not about
# the bidder. An empty string must not appear here: it is a substring of every
# prefix and would therefore exclude every clause.
excludes = ["招标", "评标", "定标"]
forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes)
processed_results = extract_unique_items_from_texts(forbidden_results)
# print(processed_results)
merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
@ -145,11 +180,12 @@ def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须
return forbidden_dict
#TODO:不得存在的情况文中有很多内容
if __name__ == '__main__':
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
truncate4 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_qualification.pdf"
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.docx'
find_forbidden(truncate_json_path,clause_path,truncate4)
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
truncate3 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile_qualification.pdf"
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e"
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx'
res = find_forbidden(truncate_json_path, clause_path, truncate3)
print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@ -14,5 +14,5 @@ def read_docx(file_path):
if __name__ == "__main__":
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx"
file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
read_docx(file_path)

View File

@ -10,13 +10,11 @@ from concurrent.futures import ThreadPoolExecutor
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path,input_file,output_folder): #评标办法前附表
# 形式评审、响应评审:千问
print("starting形式响应评审...")
file_id=upload_file(truncate1) #评标办法前附表
user_query_1 = "根据该文档中的评标办法前附表请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准请以json格式返回外层键名为'形式评审标准''响应性评审标准''资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
results = qianwen_long(file_id, user_query_1)
original_dict_data = extract_content_from_json(results)
qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') #qianwen-long有关资格评审的内容
print(original_dict_data)
with ThreadPoolExecutor() as executor:
# 创建Future对象
future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
@ -26,7 +24,6 @@ def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpa
# 等待执行结果
final_qualify_json = future_qualification.result()
form_response_dict = future_form_response.result()
print("形式响应评审done")
form_response_dict.update(final_qualify_json)
return nest_json_under_key(form_response_dict,"资格审查")

View File

@ -3,6 +3,8 @@
import json
import re
from flask import g
from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key
from flask_app.main.多线程提问 import multi_threading,read_questions_from_file
from flask_app.main.通义千问long import upload_file
@ -17,7 +19,7 @@ def merge_dictionaries_under_common_key(dicts, common_key):
# 使用字典解包来合并字典
merged_dict[common_key].update(d[common_key])
else:
print(f"Warning: Dictionary does not contain the key {common_key}")
g.logger.error(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
return merged_dict
def generate_qual_question(matching_keys_list): #这里假设资质、信誉与人员要求 要不都有、要不都没
@ -71,9 +73,9 @@ def get_consortium_dict(knowledge_name):
if response and len(response) > 1: # 检查response存在且有至少两个元素
qualify_list.append(response[1])
else:
print(f"Warning: Missing or incomplete response data for query index {_}.")
g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
except Exception as e:
print(f"Error processing response for query index {_}: {e}")
g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
consortium_dict = combine_json_results(qualify_list)
return consortium_dict
@ -88,9 +90,9 @@ def get_all_dict(knowledge_name):
if response and len(response) > 1: # 检查response存在且有至少两个元素
qualification_list.append(response[1])
else:
print(f"Warning: Missing or incomplete response data for query index {_}.")
g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
except Exception as e:
print(f"Error processing response for query index {_}: {e}")
g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
qualification_combined_res = combine_json_results(qualification_list)
return {'资格评审': qualification_combined_res}
def process_qualification(qualification_review,truncate3,knowledge_name):
@ -99,14 +101,14 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
if not matching_keys_list: #此时要求全部写在评分办法前附表中,不需要额外提取。
if not non_matching_dict: #古法提取
if truncate3!="":
print("type1")
g.logger.info("资格评审: type1")
matching_keys_list=["资质条件","财务要求","业绩要求","信誉要求","其他要求"]
ques=generate_qual_question(matching_keys_list)
file_id2 = upload_file(truncate3)
results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表调用qianwen-long
res_list = []
if not results2:
print("未调用大模型询问资格评审文件要求!")
g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
else:
# 打印结果
for question, response in results2:
@ -117,11 +119,11 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典
return updated_qualify_json
else:
print("type2")
g.logger.info("资格评审: type2")
return get_all_dict(knowledge_name)
else:
print("type3")
g.logger.info("资格评审: type3")
new_non_matching_json={'资格评审':non_matching_dict}
substring = '联合体'
found_key = any(substring in key for key in non_matching_dict.keys()) #没有联合体投标,则需生成,防止重复
@ -133,18 +135,18 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
return new_non_matching_json
elif matching_keys_list and truncate3=="": #这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
print("type4")
g.logger.info("资格评审: type4")
final_qualification=get_all_dict(knowledge_name)
final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
return final_qualify_json
else: #大多数情况
print("type5")
g.logger.info("资格评审: type5")
user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’
file_id2 = upload_file(truncate3)
results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表调用qianwen-long
res_list = []
if not results2:
print("未调用大模型询问资格评审文件要求!")
g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
else:
# 打印结果
for question, response in results2:

View File

@ -1,33 +1,115 @@
def categorize_keys(data, includes):
    """Keep the entries whose key is listed in ``includes``; group the rest under "其他".

    Args:
        data: Source dictionary.
        includes: Keys that stay at the top level of the result.

    Returns:
        A new dictionary. The "其他" bucket comes first and is removed when
        nothing was placed in it.
    """
    other_key = "其他"
    grouped = {other_key: {}}
    for name in data:
        # Listed keys go to the top level, everything else into the bucket.
        target = grouped if name in includes else grouped[other_key]
        target[name] = data[name]
    if not grouped[other_key]:
        grouped.pop(other_key)
    return grouped
import json
import re
# Usage example: sample input dictionary
data = {
"基础信息": "详细描述",
"资格审查": "流程说明",
"商务标": "流程详情",
"技术标": "合同详细条款",
"支付方式": "支付条件说明"
}
def extract_and_format_from_paths(json_paths, includes, excludes):
    """Read several JSON files and collect the entries that mention a keyword.

    Each file is expected to hold a JSON object. For every top-level value:

    * a nested object contributes the values whose *key* contains one of
      ``includes``;
    * a string (a clause) is kept when it contains one of ``includes`` and the
      text before that keyword contains none of ``excludes``. If the clause
      has several lines, its first line is dropped.

    Files that are missing or contain invalid JSON are reported on stdout and
    skipped.

    Args:
        json_paths (list): Paths of the JSON files to read.
        includes (list): Keywords to look for.
        excludes (list): Keywords that disqualify a clause when they occur
            before the matched keyword. Empty strings are ignored.

    Returns:
        list: The matching strings of all files, in file and entry order.
    """
    # An empty keyword is a substring of every string and would veto every
    # clause, so such entries are dropped up front.
    effective_excludes = [word for word in excludes if word]
    all_formatted_results = []

    for path in json_paths:
        try:
            with open(path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)

            formatted_results = []
            for value in json_data.values():
                if isinstance(value, dict):
                    # Table-like data: the keyword is looked up in the sub-key.
                    for sub_key, sub_value in value.items():
                        if any(include in sub_key for include in includes):
                            formatted_results.append(f"{sub_value}")
                elif isinstance(value, str):
                    # Clause text: the first keyword that passes the exclude check wins.
                    for include in includes:
                        if include not in value:
                            continue
                        prefix = value.split(include)[0]
                        if any(exclude in prefix for exclude in effective_excludes):
                            continue
                        # Drop the first line of a multi-line clause.
                        formatted_results.append(value.split('\n', 1)[-1])
                        break

            all_formatted_results.extend(formatted_results)
        except FileNotFoundError:
            print(f"Error: The file '{path}' does not exist.")
        except json.JSONDecodeError:
            print(f"Error: The file '{path}' contains invalid JSON.")

    return all_formatted_results
def extract_unique_items_from_texts(texts):
    """Split clause texts into individual list items and return them without duplicates.

    For each text, tabs and newlines are removed, URLs are shielded behind
    placeholders, the lead-in up to the first colon is dropped, and the rest
    is split on list markers: ``1.``, ``(1)`` with ASCII or full-width
    parentheses, ``1)`` and the circled numbers ① to ⑫. Trailing punctuation
    is trimmed from every item and the URLs are put back.

    An item is kept only if it is longer than three characters, contains at
    least two consecutive CJK/alphanumeric characters and was not seen before.

    Args:
        texts: Iterable of strings.

    Returns:
        List of unique items in first-seen order.
    """
    # Full-width characters are written as escapes so the patterns survive tools
    # that drop them: FF08/FF09 = parentheses, FF1A = colon, FF1B = semicolon,
    # FF0C = comma, FF01 = exclamation mark, FF1F = question mark.
    pattern = re.compile(r'(?:\d+\.|\uFF08\d+\uFF09|\(\d+\)|\d+\)|[①-⑫])\s*')
    intro_pattern = re.compile(r'^.*?[:\uFF1A]')
    punctuation_pattern = re.compile(r'[;\uFF1B。,\uFF0C、.:\uFF1A!\uFF01?\uFF1F]+$')
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # At least two consecutive letters, digits or CJK characters.
    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')

    all_results = []
    seen = set()

    for text in texts:
        text = text.replace('\t', '').replace('\n', '')

        # Shield URLs first: otherwise the colon of "http:" is taken for the
        # end of the lead-in and the dots and digits inside the URL may be
        # taken for list markers.
        urls = []

        def url_replacer(match):
            urls.append(match.group(0))
            return f"{{URL{len(urls)}}}"

        text = url_pattern.sub(url_replacer, text)

        # Drop the lead-in, keeping what follows the first colon.
        text = intro_pattern.sub('', text)

        for item in pattern.split(text):
            cleaned_item = punctuation_pattern.sub('', item.strip()).strip()
            if not cleaned_item:
                continue

            for i, url in enumerate(urls, 1):
                cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)

            if (cleaned_item not in seen
                    and len(cleaned_item) > 3
                    and content_check_pattern.search(cleaned_item)):
                seen.add(cleaned_item)
                all_results.append(cleaned_item)

    return all_results
# Manual run of the two functions defined above against local sample files.
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
json_paths = [truncate_json_path,clause_path] # fill in according to where the files are actually stored
includes = ["不得存在","不得与","禁止投标","对投标人的纪律"]
excludes=["招标","评标","定标"]
# Call the functions: extract the matching entries, then split them into unique items.
results = extract_and_format_from_paths(json_paths, includes,excludes)
print(results)
res=extract_unique_items_from_texts(results)
print(res)