This commit is contained in:
zy123 2024-10-14 17:13:11 +08:00
parent 9a2141c3a9
commit 82de3ae202
9 changed files with 363 additions and 92 deletions

View File

@ -5,33 +5,48 @@ from flask_app.main.投标人须知正文提取指定内容 import extract_from_
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
from flask_app.main.通义千问long import upload_file
def combine_basic_info(baseinfo_list):
def aggregate_basic_info(baseinfo_list):
"""
将基础信息列表中的数据进行合并和分类
参数
- baseinfo_list (list): 包含多个基础信息的列表
返回
- list: 合并和分类后的基础信息列表
"""
combined_baseinfo_list = []
key_groups = {
"招标人/代理信息": ["招标人","招标人联系方式", "招标代理机构","招标代理机构联系方式"],
"项目信息": ["工程名称", "招标编号","工程概况","招标范围","招标控制价","投标竞争下浮率"],
"关键时间/内容":["投标文件递交截止日期","递交方式","投标人要求澄清招标文件的截止时间","投标有效期","评标结果公示媒介"],
"保证金相关":['质量保证金','退还投标保证金'],
"其他信息":["重新招标、不再招标和终止招标","是否退还投标文件","费用承担"]
"招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"],
"项目信息": ["工程名称", "招标编号", "工程概况", "招标范围", "招标控制价", "投标竞争下浮率"],
"关键时间/内容": [
"投标文件递交截止日期",
"递交方式",
"投标人要求澄清招标文件的截止时间",
"投标有效期",
"评标结果公示媒介"
],
"保证金相关": ["质量保证金", "退还投标保证金"],
"其他信息": [
"重新招标、不再招标和终止招标",
"是否退还投标文件",
"费用承担"
]
}
# 将所有基础信息合并到一个字典中
combined_data = {}
relevant_keys_detected = set()
# 预处理以决定哪些键名将被使用
# 合并所有基础信息并收集相关键
for baseinfo in baseinfo_list:
json_data = clean_json_string(baseinfo)
combined_data.update(json_data)
relevant_keys_detected.update(json_data.keys())
# for key in relevant_keys.keys():
# if key in json_data:
# relevant_keys[key] = True
# 根据检测到的键动态调整 key_groups
# 动态调整键组
dynamic_key_handling(key_groups, relevant_keys_detected)
# 使用合并后的字典创建最终输出
# 按键组分类并嵌套
for group_name, keys in key_groups.items():
group_data = {key: combined_data.get(key, "未提供") for key in keys}
combined_json = nest_json_under_key(group_data, group_name)
@ -74,69 +89,72 @@ def judge_consortium_bidding(baseinfo_list):
# 更新原始列表,如果你想保留修改
baseinfo_list[:] = updated_list
return accept_bidding
def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path):
    """Collect the project's basic information and return it as a nested dict.

    The function gathers answers from several sources, appends each raw answer
    to one list, and then lets ``aggregate_basic_info`` /
    ``combine_json_results`` merge them.

    Args:
        knowledge_name: Name of the knowledge base passed to ``multi_threading``
            for the first batch of questions.
        truncate0: Path of the file handed to ``judge_whether_main`` and
            ``upload_file``.
        output_folder: Output folder handed to ``judge_whether_main``.
        clause_path: Path handed to ``extract_from_notice``.

    Returns:
        dict: ``{"基础信息": <merged result of combine_json_results>}``.
    """
    baseinfo_list = []

    # First batch: questions from the prompt file, asked against the knowledge base.
    baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'
    questions = read_questions_from_file(baseinfo_file_path)
    res1 = multi_threading(questions, knowledge_name)
    # The second multi_threading call below is checked for a falsy result;
    # apply the same tolerance here instead of failing on iteration.
    for index, response in res1 or []:
        # NOTE(review): the first tuple element is only used in log messages;
        # whether it is an index or the question text is not visible here.
        try:
            if response and len(response) > 1:
                baseinfo_list.append(response[1])
            else:
                print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.")
        except Exception as e:
            print(f"基础信息整合: Error processing response for query index {index}: {e}")

    # Decide which optional topics (sub-contracting, bid bond, ...) apply.
    chosen_numbers, merged = judge_whether_main(truncate0, output_folder)
    baseinfo_list.append(merged)

    judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'
    judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)

    # Ask the extra consortium question only when the answers collected so far
    # indicate that consortium bidding is accepted.
    judge_consortium = judge_consortium_bidding(baseinfo_list)
    if judge_consortium:
        judge_consortium_question = (
            "该招标文件对于联合体投标的要求是怎样的请按json格式给我提供信息"
            "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"\""
        )
        judge_questions.append(judge_consortium_question)

    # Second batch: asked against the uploaded file (llm_type 2 = qianwen-long).
    file_id = upload_file(truncate0)
    res2 = multi_threading(judge_questions, "", file_id, 2)
    if not res2:
        print("基础信息整合: multi_threading error!")
    else:
        for _, response in res2:
            baseinfo_list.append(response)

    # The re-bidding / termination clause is extracted from the notice body.
    rebidding_situation = extract_from_notice(clause_path, 3)
    update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
    baseinfo_list.append(update_json)

    # Core step: merge and classify everything collected above.
    aggregated_baseinfo = aggregate_basic_info(baseinfo_list)
    baseinfo_combined_res = combine_json_results(aggregated_baseinfo)  # returns a dict
    return {"基础信息": baseinfo_combined_res}
if __name__ == "__main__":
    # Manual debugging entry point; the paths point at a developer machine.
    # Raw strings avoid the invalid escape sequences (\A, \D) that the
    # previous mixed "\\" / "\" literals contained; the resulting path
    # values are unchanged.
    knowledge_name = "ztb"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\3424b7cb-1f85-44b4-a432-44539b870405"
    truncate0 = output_folder + r"\ztbfile_tobidders_notice_table.pdf"
    clause_path = output_folder + r"\clause1.json"
    res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@ -1,10 +1,12 @@
# 基于知识库提问的通用模板,
# assistant_id
import json
import os
import re
import queue
import concurrent.futures
import time
import requests
from dashscope import Assistants, Messages, Runs, Threads
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
from flask_app.main.通义千问long import qianwen_long, upload_file
@ -100,6 +102,98 @@ def rag_assistant(knowledge_name):
)
return assistant
#TODO:http格式有bug还没修改
def create_assistant(knowledge_name, timeout=30):
    """Create a DashScope assistant for a knowledge base via a raw HTTP request.

    NOTE(review): a TODO next to this function in the file marks the HTTP
    request format as still buggy; the payload itself is left unchanged here.

    Args:
        knowledge_name (str): Name of the knowledge base whose pipeline id is
            attached to the assistant's ``rag`` tool.
        timeout (float): Seconds to wait for the HTTP request before giving
            up. Without a timeout ``requests`` may block indefinitely.

    Returns:
        dict | None: The decoded JSON response on success. ``None`` when the
        pipeline id cannot be resolved or when the HTTP request fails (the
        error is printed, not raised).

    Raises:
        ValueError: If the DASHSCOPE_API_KEY environment variable is not set.
    """
    # Step 1: resolve the pipeline id of the knowledge base.
    try:
        retriever = DashScopeCloudRetriever(knowledge_name)
        pipeline_id = str(retriever.pipeline_id)
    except Exception as e:
        print(f"Error retrieving pipeline ID for knowledge '{knowledge_name}': {e}")
        return None

    # Step 2: the API key must come from the environment.
    api_key = os.getenv("DASHSCOPE_API_KEY")
    if not api_key:
        raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")

    # Step 3: endpoint and headers.
    url = 'https://dashscope.aliyuncs.com/api/v1/assistants'
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # Step 4: instructions; ${documents} is sent literally as a placeholder.
    instructions = (
        "请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}"
    )

    # Step 5: tools offered to the assistant.
    tools = [
        {
            "type": "code_interpreter"
        },
        {
            "type": "rag",
            "prompt_ra": {
                "pipeline_id": pipeline_id,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query_word": {
                            "type": "str",
                            "value": "${documents}"
                        }
                    }
                }
            }
        }
    ]

    # Step 6: request payload.
    payload = {
        "model": "qwen-max",
        "name": "智能小助手",
        "description": "智能助手,支持知识库查询和插件调用。",
        "temperature": 0.3,
        "instructions": instructions,
        "tools": tools,
        "file_ids": [],   # add file ids here if needed
        "metadata": {}    # add metadata here if needed
    }

    # Step 7: send the request. Failures are reported and turned into None so
    # callers get a single, explicit "no assistant" signal.
    try:
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout
        )
        response.raise_for_status()  # raises HTTPError for 4xx / 5xx
        assistant = response.json()
        print("Assistant created successfully:")
        print(json.dumps(assistant, indent=4, ensure_ascii=False))
        return assistant
    except requests.exceptions.HTTPError as http_err:
        # Read the body from the exception so this handler does not depend on
        # a local variable assigned inside the try block.
        failed = http_err.response
        body = failed.text if failed is not None else ""
        print(f"HTTP error occurred: {http_err} - Response: {body}")
        return None
    except Exception as err:
        print(f"An error occurred: {err}")
        return None
def pure_assistant():
assistant = Assistants.create(
@ -119,6 +213,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
if llm_type==1:
print(f"rag_assistant! question:{question}")
assistant = rag_assistant(knowledge_name)
# assistant=create_assistant(knowledge_name)
elif llm_type==2:
print(f"qianwen_long! question:{question}")
qianwen_res = qianwen_long(file_id,question)
@ -185,20 +280,10 @@ if __name__ == "__main__":
# print(f"Question: {question}")
# print(f"Response: {response}")
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf"
# file_id = upload_file(file_path)
# questions=["根据该文档中的评标办法前附表,请你列出该文件的技术标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的商务标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的投标报价,以json的格式返回结果"]
# results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long
# if not results:
# print("errror!")
# else:
# # 打印结果
# for question, response in results:
# print(f"Question: {question}")
# print(f"Response: {response}")
ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的请按json格式给我提供信息键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"]
# ques=["该招标文件的工程名称项目名称招标编号是招标人是招标代理机构是请按json格式给我提供信息键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
results = multi_threading(ques, "6.2视频会议docx")
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf"
file_id = upload_file(file_path)
questions=["该招标文件的项目名称是项目编号或招标编号采购人或招标人采购代理机构或招标代理机构请按json格式给我提供信息键名分别是'项目名称','项目编号','采购人','采购代理机构',若存在未知信息,在对应的键值中填'未知'","该招标文件的项目概况是项目基本情况是请按json格式给我提供信息键名分别为'项目概况','项目基本情况',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'"]
results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long
if not results:
print("errror!")
else:
@ -206,3 +291,14 @@ if __name__ == "__main__":
for question, response in results:
print(f"Question: {question}")
print(f"Response: {response}")
# ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的请按json格式给我提供信息键名为'资格要求',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"]
# # ques=["该招标文件的工程名称项目名称招标编号是招标人是招标代理机构是请按json格式给我提供信息键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
# results = multi_threading(ques, "6.2视频会议docx")
# if not results:
# print("errror!")
# else:
# # 打印结果
# for question, response in results:
# print(f"Question: {question}")
# print(f"Response: {response}")

View File

@ -9,11 +9,11 @@ from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.main.json_utils import nest_json_under_key, transform_json_values, combine_json_results
from flask_app.main.json_utils import transform_json_values, combine_json_results
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
import concurrent.futures
from flask_app.main.基础信息整合 import project_basic_info
from flask_app.main.基础信息整合 import combine_basic_info
from flask_app.main.资格审查模块 import combine_review_standards
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
from flask_app.main.format_change import pdf2docx, docx2pdf
@ -96,7 +96,7 @@ def post_processing(data,includes):
# 基本信息
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):
    """Run the basic-information step and return its result.

    All four arguments are forwarded unchanged to ``combine_basic_info``.
    The pipeline is invoked exactly once; a preceding call through the old
    name ``project_basic_info`` only produced a result that was immediately
    overwritten.

    Returns:
        Whatever ``combine_basic_info`` returns.
    """
    logger.info("starting基础信息...")
    basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path)
    logger.info("基础信息done")
    return basic_res
@ -149,18 +149,14 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpat
def fetch_bidding_documents_requirements(clause_path):
    """Extract the bid-document requirements and wrap them under one key.

    Args:
        clause_path: Path passed to ``extract_from_notice`` (selector 1).

    Returns:
        dict: ``{"投标文件要求": <value returned by extract_from_notice>}``.
    """
    logger.info("starting投标文件要求...")
    fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
    # The result is returned as a plain dict; the former nest_json_under_key
    # call computed a value that was never used and has been dropped.
    logger.info("投标文件要求done...")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}
# 开评定标流程
def fetch_bid_opening(clause_path):
    """Extract the bid opening / evaluation / award procedure and wrap it.

    Args:
        clause_path: Path passed to ``extract_from_notice`` (selector 2).

    Returns:
        dict: ``{"开评定标流程": <value returned by extract_from_notice>}``.
    """
    logger.info("starting开评定标流程...")
    fetch_bid_opening_json = extract_from_notice(clause_path, 2)
    # The result is returned as a plain dict; the former nest_json_under_key
    # call computed a value that was never used and has been dropped.
    logger.info("开评定标流程done...")
    return {"开评定标流程": fetch_bid_opening_json}
# def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf

View File

@ -65,12 +65,41 @@ def read_tables_from_docx(file_path):
# 返回符合条件的单元格内容
return cell_contents
def read_docx_by_paragraphs(file_path):
    """Return the non-empty paragraph texts of a .docx file.

    Args:
        file_path (str): Path of the .docx file to open.

    Returns:
        list: Whitespace-stripped text of every paragraph that is not empty
        after stripping. An empty list if anything goes wrong while opening
        or reading the document (the error is printed).
    """
    collected = []
    try:
        document = Document(file_path)
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            # Skip paragraphs that contain only whitespace.
            if stripped:
                collected.append(stripped)
    except Exception as e:
        print(f"读取 .docx 文件时发生错误: {e}")
        return []
    return collected
if __name__ == "__main__":
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
# output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件
# read_docx(file_path)
read_docx_tables(file_path)
list=read_tables_from_docx(file_path)
for i in list:
print(i)
print("--------------")
file_path = 'D:\\flask_project\\flask_app\\static\\output\\015d997e-c32c-49d1-a611-a2e817ace6a1\\ztbfile.docx'
read_docx(file_path) #按行读取
# paragraphs = read_docx_by_paragraphs(file_path) #按段落读取
#
# print(f"共读取到 {len(paragraphs)} 个段落。\n")
# for idx, para in enumerate(paragraphs, 1):
# print(f"段落 {idx}: {para}\n")
# read_docx_tables(file_path)
# list=read_tables_from_docx(file_path)
# for i in list:
# print(i)
# print("--------------")

View File

@ -0,0 +1,26 @@
1.该招标文件的项目名称是项目编号或招标编号采购人或招标人采购代理机构或招标代理机构请按json格式给我提供信息键名分别是'项目名称','项目编号','采购人','采购代理机构',若存在未知信息,在对应的键值中填'未知'。
2.该招标文件的项目概况是项目基本情况是请按json格式给我提供信息键名分别为'项目概况','项目基本情况',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,而嵌套键值必须与原文保持一致,若存在未知信息,在对应的键值中填'未知'。
3.该招标文件的最高限价或招标控制价请按json格式给我提供信息键名为'招标控制价',若存在未知信息,在对应的键值中填'未知'。
4.投标文件或响应文件递交截止时间是递交地点或方式请按json格式给我提供信息键名分别是'投标文件递交截止日期','递交地点'(或'递交方式'),若存在未知信息,在对应的键值中填'未知'。
5.采购人招标人和采购代理机构或招标代理机构的联系方式是请按json格式给我提供信息键名分别是'采购人联系方式','采购代理机构联系方式',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。
6.该招标文件的信息公示媒介在哪请按json格式给我提供信息键名是'信息公示媒介',若存在未知信息,在对应的键值中填'未知'。
7.该招标文件的投标竞争下浮率是多少请按json格式给我提供信息键名是'投标竞争下浮率',若存在未知信息,在对应的键值中填'未知'。
8.该项目的投标有效期或响应文件有效期是什么请按json格式给我提供信息键名是'投标有效期',若存在未知信息,在对应的键值中填'未知'。
9.该招标文件对投标人准备和参加投标活动发生的费用是如何规定的请以json的格式给我提供信息键名是'费用承担',若存在未知信息,在对应的键值中填'未知'。
10.投标人要求澄清招标文件的截止时间是请以json的格式给我提供信息键名是'投标人要求澄清招标文件的截止时间',若存在未知信息,在对应的键值中填'未知'。
11.该文档要求扣留的质量保证金百分比是多少请以json格式给我提供信息键名为'质量保证金',如果没有则以'未知'填充。
12.该项目是否接受联合体投标请按json格式给我提供信息键名为'是否接受联合体投标','是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。

View File

@ -1,4 +1,89 @@
import json
from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results
from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
from flask_app.main.通义千问long import upload_file
def dynamic_key_handling(key_groups, detected_keys):
    """Route dynamically detected keys into the matching key group, in place.

    Each detected key is checked against an ordered list of substring rules;
    the first rule that matches decides the target group. Keys matching no
    rule are ignored.

    Args:
        key_groups (dict): Maps group name -> list of key names. Mutated in
            place; a target group that is missing is created.
        detected_keys (iterable of str): Key names found in the collected
            answers.

    Returns:
        None. The result is the mutation of ``key_groups``.
    """
    # Ordered (substrings, target group) rules; earlier rules take precedence.
    routing_rules = (
        (("投标保证金", "履约保证金"), "保证金相关"),
        (("是否接受联合体", "联合体投标要求", "分包"), "项目信息"),
        (("踏勘现场", "投标预备会", "偏离"), "其他信息"),
    )
    for key in detected_keys:
        for fragments, group_name in routing_rules:
            if any(fragment in key for fragment in fragments):
                group_keys = key_groups.setdefault(group_name, [])
                # A key may already be listed statically (e.g. "退还投标保证金");
                # do not list it twice.
                if key not in group_keys:
                    group_keys.append(key)
                break
def aggregate_basic_info(baseinfo_list):
    """Merge the collected answers and split them into named groups.

    Every entry of ``baseinfo_list`` is passed through ``clean_json_string``
    and merged into one dict (later entries overwrite earlier ones on key
    clashes). The static key groups are extended by ``dynamic_key_handling``
    with the keys that were actually seen, and each group is then wrapped
    with ``nest_json_under_key``.

    Args:
        baseinfo_list (list): Raw answers to merge.

    Returns:
        list: One ``nest_json_under_key`` result per group, in group order.
        Keys without a collected value are filled with "未提供".
    """
    key_groups = {
        "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"],
        "项目信息": ["工程名称", "招标编号", "工程概况", "招标范围", "招标控制价", "投标竞争下浮率"],
        "关键时间/内容": [
            "投标文件递交截止日期",
            "递交方式",
            "投标人要求澄清招标文件的截止时间",
            "投标有效期",
            "评标结果公示媒介",
        ],
        "保证金相关": ["质量保证金", "退还投标保证金"],
        "其他信息": [
            "重新招标、不再招标和终止招标",
            "是否退还投标文件",
            "费用承担",
        ],
    }

    merged = {}
    seen_keys = set()
    for raw_entry in baseinfo_list:
        parsed = clean_json_string(raw_entry)
        merged.update(parsed)
        seen_keys.update(parsed.keys())

    # Let the detected keys extend the static groups before classifying.
    dynamic_key_handling(key_groups, seen_keys)

    return [
        nest_json_under_key(
            {member: merged.get(member, "未提供") for member in members},
            title,
        )
        for title, members in key_groups.items()
    ]
def combine_basic_info(knowledge_name,output_folder,clause_path):
# file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf"
# file_id = upload_file(file_path)
# baseinfo_file_path='flask_app/static/提示词/前两章提问总结.txt'
# questions=read_questions_from_file(baseinfo_file_path)
# results = multi_threading(questions, "", file_id, 2) # 1代表使用百炼rag 2代表使用qianwen-long
# if not results:
# print("errror!")
# else:
# # 打印结果
# for question, response in results:
# print(f"Question: {question}")
# print(f"Response: {response}")
baseinfo_combined_res={
"招标人/代理信息": {
"招标人": "黄石临空建设管理有限公司",
@ -28,3 +113,11 @@ def combine_basic_info(knowledge_name,output_folder,clause_path):
}
}
return {"基础信息":baseinfo_combined_res}
if __name__ == "__main__":
    # Manual debugging entry point; the paths point at a developer machine.
    # Raw strings avoid the invalid escape sequences (\A, \D) that the
    # previous mixed "\\" / "\" literals contained; the resulting path
    # values are unchanged. The unused truncate0 path was dropped because
    # this module's combine_basic_info takes no such argument.
    knowledge_name = "ztb"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\3424b7cb-1f85-44b4-a432-44539b870405"
    clause_path = output_folder + r"\clause1.json"
    res = combine_basic_info(knowledge_name, output_folder, clause_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@ -57,12 +57,12 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
extracted_paragraphs[active_key] = [text]
if match_keywords(text, follow_up_keywords):
continue_collecting = True
section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
section_number = re.match(r'^(\d+(\s*\.\s*\d+)*)\s*、', text) # 修改后的正则,支持 '数字 、' 格式
if section_number:
current_section_number = section_number.group(1)
level_count = current_section_number.count('.')
# Pattern to match current level, e.g., 3.4.5
# Pattern to match current level, e.g., 3.4.5 或者 3
pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+'
# Generate patterns for next section at same level and parent level
@ -71,20 +71,23 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
# Next section at same level
parts[-1] = str(int(parts[-1]) + 1)
next_pattern = r'^' + r'\s*\.\s*'.join(parts)
next_pattern = r'^' + r'\.\s*'.join(parts)
matched_patterns.append(next_pattern)
# Parent section (if applicable)
if len(parts) > 1:
parent_section_parts = parts[:-1]
parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts)
parent_pattern = r'^' + r'\.\s*'.join(parent_section_parts)
matched_patterns.append(parent_pattern)
# 添加对 '数字 、' 格式的支持
digit_comma_pattern = r'^\d+\s*、'
matched_patterns.append(digit_comma_pattern)
# Combine the patterns
combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
current_section_pattern = re.compile(combined_pattern)
else:
found_next_number = False
current_section_pattern = None
@ -93,7 +96,8 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
current_index += 1
next_text = doc.paragraphs[current_index].text.strip()
if not found_next_number:
next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text)
# 修改后的正则,支持 '数字 、' 格式
next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)', next_text)
if next_section_number:
found_next_number = True
if next_section_number.group(1):
@ -101,12 +105,14 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
elif next_section_number.group(2):
dynamic_pattern = r'^[\(\]\d+[\)\]'
elif next_section_number.group(3):
dynamic_pattern = r'^\d+\s*、'
current_section_pattern = re.compile(dynamic_pattern)
if current_section_pattern and re.match(current_section_pattern, next_text):
extracted_paragraphs[active_key].append(next_text)
else:
continue_collecting = False
active_key=None
active_key = None
break
return current_index
@ -118,6 +124,8 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
return extracted_paragraphs
"""
eg:
text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"]
@ -143,6 +151,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
for key, text_list in extracted_contents.items():
if len(text_list) == 1:
for data in text_list:
# print(data)
# 检查是否包含任何需要排除的字符串
if any(exclude in data for exclude in excludes):
continue # 如果包含任何排除字符串,跳过这个数据

View File

@ -464,9 +464,10 @@ def truncate_pdf_multiple(input_path, output_folder):
# TODO:交通智能系统和招标(1)(1)文件有问题 sele=4的时候excludsion有问题
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\outputtest"
# files=truncate_pdf_multiple(input_path,output_folder)
# print(files)
selection = 1 # 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2
generated_files = truncate_pdf_main(input_path, output_folder, selection)
input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\ztbfile.pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9"
files=truncate_pdf_multiple(input_path,output_folder)
print(files)
# selection = 1 # 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2
# generated_files = truncate_pdf_main(input_path, output_folder, selection)

View File

@ -36,6 +36,9 @@ def preprocess_files(output_folder, file_path, file_type, unique_id):
elif file_type == 2: # pdf
pdf_path = file_path
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
elif file_type ==3: #doc
pdf_path=docx2pdf(file_path)
docx_path=pdf2docx(pdf_path)
else:
logger.error("Unsupported file type provided. Preprocessing halted.")
return None