This commit is contained in:
zy123 2024-08-29 16:37:09 +08:00
commit 6463a9e593
62 changed files with 4793 additions and 0 deletions

8
.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

19
.idea/first_pro.iml generated Normal file
View File

@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="Flask">
<option name="enabled" value="true" />
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="bidding_trading_project (3)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/../first_pro\templates" />
</list>
</option>
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml generated Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="bidding_trading_project (3)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/first_pro.iml" filepath="$PROJECT_DIR$/.idea/first_pro.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

29
Dockerfile Normal file
View File

@ -0,0 +1,29 @@
# Use the official Python runtime as the parent image
FROM python:3.8-slim
# Set the working directory inside the container
WORKDIR /ZbparseProjects
# Disable pip's progress bar to keep build logs compact
RUN pip config set global.progress_bar off
# Copy requirements.txt first so the dependency layer is cached
# unless requirements.txt itself changes.
# NOTE(review): "COPY ../../requirements.txt" only works if the build
# context is rooted two directories up; with a normal context Docker
# rejects paths outside the context — verify the build invocation.
COPY ../../requirements.txt .
# Install dependencies via the Tsinghua PyPI mirror
RUN pip install --upgrade pip --default-timeout=100 \
&& pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
# Copy the build context into the image working directory
COPY .. .
# SECURITY(review): live API keys and cloud access secrets are baked into
# the image below — anyone who can pull the image can read them. Move
# these to runtime secrets or --build-arg, and rotate the exposed keys.
ENV DASHSCOPE_API_KEY=sk-f7ad8ad193064cf482588f7064e75183
ENV DASHSCOPE_WORKSPACE_ID=llm-mo38469hdfwtervi
ENV ALIBABA_CLOUD_ACCESS_KEY_ID=LTAI5tRWhjktXyY5MovoiNuF
ENV ALIBABA_CLOUD_ACCESS_KEY_SECRET=88oyw7LniqV8i0SnOuSFS5lprfrPtw
# Expose the Flask port
EXPOSE 5000
# Start the application when the container runs
CMD ["python", "main/start_up.py"]

0
__init__.py Normal file
View File

0
flask_app/__init__.py Normal file
View File

View File

View File

@ -0,0 +1,6 @@
from ..main.通义千问long import qianwen_long,upload_file
def read_dictory(file_path):
file_id=upload_file(file_path)
user_query="根据该文档中的评标办法前附表,请你列出该文件的技术标以及它对应的具体评分要求,若对应内容中存在其他信息,在嵌套键如'技术标'中新增键名'备注'存放该信息。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容"

View File

@ -0,0 +1,58 @@
import json
def find_keys_by_value(target_value, json_data):
    """Return keys whose value equals *target_value*; when nothing matches
    exactly, fall back to keys whose string value starts with it."""
    exact = [key for key, val in json_data.items() if val == target_value]
    if exact:
        return exact
    return [
        key
        for key, val in json_data.items()
        if isinstance(val, str) and val.startswith(target_value)
    ]
def find_keys_with_prefix(key_prefix, json_data):
    """Return every key of *json_data* (other than the prefix itself) that
    starts with *key_prefix*."""
    return [key for key in json_data if key != key_prefix and key.startswith(key_prefix)]
def extract_json(data, target_values):
    """For each value in *target_values*, collect the key that carries it
    plus every sub-key sharing that key as a dotted prefix."""
    results = {}
    for target in target_values:
        for matched in find_keys_by_value(target, data):
            section = {matched: data[matched]}
            for child in find_keys_with_prefix(matched, data):
                section[child] = data[child]
            # A later matched key overwrites an earlier one, as before.
            results[target] = section
    return results
def renumber_keys(data, level=1):
    """Recursively rewrite dict keys so their first dotted segment becomes
    '1' (e.g. '3.2.1' -> '1.2.1'); non-dict values pass through unchanged."""
    if not isinstance(data, dict):
        return data
    renumbered = {}
    for old_key, value in data.items():
        segments = old_key.split('.')
        segments[0] = '1'
        renumbered['.'.join(segments)] = renumber_keys(value, level + 1)
    return renumbered
def json_results(extr_json):
    """Apply renumber_keys to every top-level entry of *extr_json*."""
    return {key: renumber_keys(value) for key, value in extr_json.items()}
if __name__ == "__main__":
    # Extract the sections whose titles match target_values from the
    # clause JSON, renumber them to start at '1', and save the result.
    target_values = ["投标文件"]
    with open('clause3.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    extracted_data = extract_json(data, target_values)
    renumbered_data = json_results(extracted_data)
    with open('output_results1.json', 'w', encoding='utf-8') as file:
        json.dump(renumbered_data, file, indent=4, ensure_ascii=False)
    print("JSON文件已按要求重新编号并保存.")

View File

View File

@ -0,0 +1,45 @@
import requests
import mimetypes
def download_file(url, local_filename):
    """Download *url* to *local_filename* plus a guessed extension.

    Returns (path, type_code): type_code is 1 for .docx, 2 for .pdf and
    3 for anything else. On any failure returns (None, None) — the
    original returned a bare None, which raised TypeError in every caller
    that tuple-unpacks the result (e.g. ``path, kind = download_file(...)``).
    """
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()  # raise on HTTP error status
            # Derive an extension from the Content-Type header; default to
            # .docx when the type cannot be guessed.
            content_type = response.headers.get('Content-Type')
            extension = mimetypes.guess_extension(content_type, strict=False)
            if not extension:
                extension = '.docx'
            full_filename = local_filename + extension  # append the extension
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            # Map the extension to the numeric file-type code.
            if extension == '.docx':
                return full_filename, 1
            elif extension == '.pdf':
                return full_filename, 2
            else:
                return full_filename, 3
    except requests.HTTPError as e:
        print(f"HTTP Error: {e}")
        return None, None
    except requests.RequestException as e:
        print(f"Error downloading the file: {e}")
        return None, None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None
if __name__ == '__main__':
    # Smoke test: download from a (time-limited) signed OSS URL.
    test_url ="https://temp-pdf2docx.oss-cn-wuhan-lr.aliyuncs.com/docx/zbfile.docx?Expires=1724866978&OSSAccessKeyId=TMP.3KhJJmRnpG3r3FKwULgxRm7pfH2wHVDgwo7HotjD9j3w23omXG1mwrnBtP7n1G6j4HWW6CURq7JHqZ4kmC6RBMAZFcoDsw&Signature=LMczkwe6nVNbAHX4xvgCs8MtZ48%3D"
    local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\downloaded_file'
    # NOTE(review): download_file returns a (path, type) tuple on success,
    # so this prints the whole tuple, not just the path.
    file_path = download_file(test_url, local_file_name)
    if file_path:
        print(f"Downloaded file path: {file_path}")

View File

@ -0,0 +1,42 @@
import PyPDF2
import re
def extract_contents_with_pages(pdf_path, keyword):
    """Scan the PDF for the first line containing *keyword*
    (case-insensitive) and return the trailing page number found on that
    line as an int, or None when no such line exists."""
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        needle = keyword.lower()
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                continue
            for line in text.split('\n'):
                if needle in line.lower():
                    trailing_number = re.search(r"\d+(?=\s*$)", line)
                    if trailing_number:
                        return int(trailing_number.group(0))
    return None
def split_pdf(pdf_path, start_page, output_path):
    """Copy pages start_page..end of *pdf_path* into *output_path*
    (*start_page* is 1-based)."""
    with open(pdf_path, "rb") as src:
        reader = PyPDF2.PdfReader(src)
        writer = PyPDF2.PdfWriter()
        last_page = len(reader.pages)
        # Convert the 1-based page number into a 0-based index.
        first_index = int(start_page) - 1
        for page_index in range(first_index, last_page):
            writer.add_page(reader.pages[page_index])
        with open(output_path, "wb") as dst:
            writer.write(dst)
# Usage example: find the page where the bid-document-format section
# starts and split everything from that page onward into a new PDF.
pdf_path = "D:\\项目\\工程招标\\zb1.pdf"
output_path = "D:\\项目\\工程招标\\tb_format.pdf"
keyword = "投标文件格式"  # change this to the heading you want to locate
page_number = extract_contents_with_pages(pdf_path, keyword)
print(page_number)
if page_number is not None:
    split_pdf(pdf_path, page_number, output_path)
else:
    print("未找到含有关键字的页码")

View File

@ -0,0 +1,69 @@
import json
import os
import requests
from download import download_file
def upload_file(file_path, url):
    """POST *file_path* to *url* as multipart/form-data and return the URL
    carried in the JSON response's "data" field, or "" on failure."""
    receive_file_url = ""
    # Take the last path segment as the upload filename.
    filename = file_path.split('/')[-1]
    with open(file_path, 'rb') as f:
        payload = {'file': (filename, f)}
        response = requests.post(url, files=payload)
        if response.status_code == 200:
            print("文件上传成功")
            body = json.loads(response.content.decode('utf-8'))
            receive_file_url = body["data"]
        else:
            print(f"文件上传失败,状态码: {response.status_code}")
            print(response.text)
    return receive_file_url
def get_filename_and_folder(file_path):
    """Split *file_path* into (base name without extension, containing
    directory path)."""
    base = os.path.basename(file_path)
    stem, _ext = os.path.splitext(base)
    return stem, os.path.dirname(file_path)
# Takes the path of the file to convert; the output filename is derived
# automatically (same folder, same base name, extension auto-appended).
def pdf2docx(local_path_in):
    """Convert a local PDF to DOCX via the remote transfer service and
    return the downloaded output path."""
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
    receive_download_url = upload_file(local_path_in, remote_url)
    filename, folder = get_filename_and_folder(local_path_in)  # output goes next to the input
    local_path_out=os.path.join(folder,filename)  # output base name (extension added by download_file)
    downloaded_filepath,file_type=download_file(receive_download_url, local_path_out)
    print("have downloaded file to:",downloaded_filepath)
    return downloaded_filepath
def docx2pdf(local_path_in):
    """Convert a local DOCX to PDF via the remote transfer service and
    return the downloaded output path."""
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    receive_download_url = upload_file(local_path_in, remote_url)
    filename, folder = get_filename_and_folder(local_path_in)  # output goes next to the input
    local_path_out = os.path.join(folder, filename)  # output base name (extension added by download_file)
    downloaded_filepath,file_type = download_file(receive_download_url, local_path_out)
    print("have downloaded file to:", downloaded_filepath)
    return downloaded_filepath
if __name__ == '__main__':
# 替换为你的文件路径和API URL
local_path_in="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest16_invalid.docx"
# pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)

View File

@ -0,0 +1,116 @@
import json
import re
def extract_content_from_json(json_data):
    """Locate the outermost {...} span in *json_data* and parse it into a
    dict.

    Returns {} for blank input, when no braces are present, or when the
    captured span is not valid JSON.
    """
    if not json_data.strip():
        return {}
    brace_span = re.search(r'\{[\s\S]*\}', json_data)
    if brace_span is None:
        print("No valid JSON content found.")
        return {}
    try:
        return json.loads(brace_span.group(0))
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        return {}
def clean_json_string(json_string):
    """Strip wrapper text (e.g. stray Markdown backticks) around a JSON
    object and parse it into a dict; thin alias for
    extract_content_from_json."""
    return extract_content_from_json(json_string)
def combine_json_results(json_lists):
    """Merge a list of JSON-like strings into a single dict; later
    fragments overwrite earlier ones on key collisions, and blank
    fragments are skipped."""
    combined = {}
    for fragment in json_lists:
        if not fragment.strip():
            continue
        combined.update(clean_json_string(fragment))
    return combined
def nest_json_under_key(data, key):
    """Wrap *data* under a new top-level *key* and return the result as a
    pretty-printed JSON string (non-ASCII characters preserved).

    Parameters:
    - data: dict to nest
    - key: name of the new outer level
    """
    return json.dumps({key: data}, ensure_ascii=False, indent=4)
def add_keys_to_json(target_dict, source_dict):
    """Merge *source_dict* into the dict stored under the single top-level
    key of *target_dict*, in place.

    Returns {} when *target_dict* is empty; returns it unchanged (with a
    printed error) when it has more than one top-level key or its value is
    not a dict.
    """
    if not target_dict:
        print("Error: Target dictionary is empty.")
        return {}
    if len(target_dict) != 1:
        print("Error: Target dictionary must contain exactly one top-level key.")
        return target_dict
    outer_key, inner = next(iter(target_dict.items()))
    if not isinstance(inner, dict):
        print(f"Error: The value under the key '{outer_key}' is not a dictionary.")
        return target_dict
    inner.update(source_dict)
    target_dict[outer_key] = inner
    return target_dict
def rename_outer_key(original_data, new_key):
    """Re-wrap the value of the (assumed single) outer key of
    *original_data* under *new_key* and return the result serialized as a
    JSON string.

    NOTE: for invalid input (non-dict or empty) this returns a plain {}
    dict rather than a string, mirroring the original contract.
    """
    if not original_data or not isinstance(original_data, dict):
        return {}
    inner_value = next(iter(original_data.values()), {})
    return json.dumps({new_key: inner_value}, ensure_ascii=False)
def transform_json_values(data):
    """Recursively normalize *data* for display: numbers become strings,
    booleans are blanked, and newlines inside strings become "<br>"."""
    if isinstance(data, dict):
        return {k: transform_json_values(v) for k, v in data.items()}
    if isinstance(data, list):
        return [transform_json_values(item) for item in data]
    if isinstance(data, bool):
        # NOTE(review): both branches yield '' — the True/False labels
        # appear to have been lost; confirm the intended literals.
        return '' if data else ''
    if isinstance(data, (int, float)):
        return str(data)
    if isinstance(data, str):
        return data.replace('\n', '<br>')
    return data

View File

@ -0,0 +1,28 @@
import json
def search_key_in_json(file_path, search_key):
    """Load the JSON file at *file_path* and search for *search_key* at
    any nesting depth; return "key : value" when found, "key : /"
    otherwise."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    def walk(node, key):
        # Prefer a hit at the current level, then descend into dict values.
        if key in node:
            return key, node[key]
        for child in node.values():
            if isinstance(child, dict):
                hit = walk(child, key)
                if hit:
                    return hit
        return None

    found = walk(data, search_key)
    if found:
        return f"{found[0]} : {found[1]}"
    return f"{search_key} : /"
# 用法示例
file_path = 'C:/Users/Administrator/Downloads/truncate_output2.json' # 替换为你的 JSON 文件路径
search_key = '多标段投标' # 替换为你想搜索的键
print(search_key_in_json(file_path, search_key))

219
flask_app/main/start_up.py Normal file
View File

@ -0,0 +1,219 @@
import logging
import shutil
import sys
import time
import uuid
from datetime import datetime, timedelta
from flask import Flask, request, jsonify, send_file, Response, stream_with_context
import json
import os
from download import download_file
from 招标文件解析 import main_processing
app = Flask(__name__)
class CSTFormatter(logging.Formatter):
    """Formatter that renders log timestamps in China Standard Time (UTC+8).

    Fixes two defects in the original:
    * it added 8 hours to the *local* time from datetime.fromtimestamp(),
      which is only correct on hosts whose local zone is UTC — now the
      epoch timestamp is converted directly into a UTC+8 zone;
    * it formatted the float ``record.msecs`` with ``:03d``, which always
      raised ValueError and silently dropped the milliseconds via the
      except clause — now msecs is truncated to int first.
    """

    def formatTime(self, record, datefmt=None):
        from datetime import timezone  # local import: module only pulls datetime/timedelta
        # Convert the epoch timestamp straight into UTC+8, independent of
        # the host's local timezone.
        ct = datetime.fromtimestamp(record.created, tz=timezone(timedelta(hours=8)))
        if datefmt:
            return ct.strftime(datefmt)
        s = ct.strftime("%Y-%m-%d %H:%M:%S")
        if self.usesTime():
            # record.msecs is a float; truncate for the ',mmm' suffix.
            s = f"{s},{int(record.msecs):03d}"
        return s
def create_logger(unique_id):
    """Create (or fetch) a per-request logger named after *unique_id*.

    Log records go both to <output_folder>/log.txt (CST timestamps) and
    to stdout (bare messages). Returns (logger, output_folder).
    """
    # NOTE(review): hard-coded deployment path; the commented line below
    # is the local development equivalent.
    output_folder = f"/ZbparseProjects/static/output/{unique_id}"
    # output_folder =f"C:/Users/Administrator/Desktop/招标文件/test/{unique_id}"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)
    log_filename = "log.txt"
    log_path = os.path.join(output_folder, log_filename)
    logger = logging.getLogger(unique_id)  # one named logger per request id
    if not logger.handlers:  # avoid attaching duplicate handlers on re-entry
        # File handler with China-Standard-Time timestamps.
        file_handler = logging.FileHandler(log_path)
        file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(file_formatter)
        logger.addHandler(file_handler)
        # Console handler: message text only.
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_formatter = logging.Formatter('%(message)s')
        stream_handler.setFormatter(stream_formatter)
        logger.addHandler(stream_handler)
    logger.setLevel(logging.INFO)
    return logger, output_folder
@app.route('/upload', methods=['POST'])
def zbparse():
    """Entry point: validate the request, then stream the parsing results
    for the submitted file URL back as server-sent events.

    (The dead commented-out synchronous version that previously sat
    between the decorator and this def has been removed.)
    """
    file_url = validate_request()
    if isinstance(file_url, tuple):  # validate_request returned an error response
        return file_url
    try:
        app.logger.info("starting parsing url:" + file_url)
        return Response(stream_with_context(process_and_stream(file_url)), content_type='text/event-stream')
    except Exception as e:
        app.logger.error('Exception occurred: ' + str(e))
        return jsonify({'error': str(e)}), 500
def process_and_stream(file_url):
    """Generator: download *file_url*, run the parsing pipeline, and yield
    each partial result as a server-sent-event 'data:' frame, finishing
    with a sentinel frame whose data is 'END'."""
    unique_id = str(uuid.uuid4())
    logger, output_folder = create_logger(unique_id)
    filename = "ztbfile"
    downloaded_filename = os.path.join(output_folder, filename)
    downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
    # file_type 3 marks an extension the pipeline cannot parse (not .docx/.pdf).
    if downloaded_filepath is None or file_type == 3:
        logger.error("Unsupported file type or failed to download file")
        error_response = {
            'message': 'File processing failed',
            'filename': None,
            'data': json.dumps({'error': 'File processing failed'})
        }
        yield f"data: {json.dumps(error_response)}\n\n"
        return
    logger.info("Local file path: " + downloaded_filepath)
    # main_processing is itself a generator of partial results.
    for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id):
        response = {
            'message': 'Processing',
            'filename': os.path.basename(downloaded_filepath),
            'data': data
        }
        yield f"data: {json.dumps(response)}\n\n"
    final_response = {
        'message': 'File uploaded and processed successfully',
        'filename': os.path.basename(downloaded_filepath),
        'data': 'END'
    }
    yield f"data: {json.dumps(final_response)}\n\n"
def validate_request():
    """Validate the incoming JSON request body.

    Returns the first file URL on success, or a (response, status) tuple
    on failure — callers distinguish the two with isinstance(..., tuple).

    Fix: the original did ``request.json.get('file_url')[0]`` which raised
    TypeError (None[0]) when the key was missing, before the error branch
    could run; now the presence check happens before indexing.
    """
    if not request.is_json:
        return jsonify({'error': 'Missing JSON in request'}), 400
    file_urls = request.json.get('file_url')
    if not file_urls:
        return jsonify({'error': 'No file URL provided'}), 400
    return file_urls[0]
def download_and_process_file(file_url):
    """Non-streaming variant of the pipeline: download the file, run
    main_processing, and return (processed_file_path, output_folder,
    logger); the first element is None on download failure.

    NOTE(review): the streaming route consumes main_processing as a
    generator — confirm this synchronous path is still in use.
    """
    unique_id = str(uuid.uuid4())  # unique id for this request's workspace
    logger, output_folder = create_logger(unique_id)
    filename = "ztbfile"
    downloaded_filename = os.path.join(output_folder, filename)
    # download_file handles its own exceptions and returns the local path.
    downloaded_filepath,file_type = download_file(file_url, downloaded_filename)
    if downloaded_filepath is None or file_type == 3:
        logger.error("Unsupported file type or failed to download file")
        return None, output_folder, logger
    logger.info("Local file path: " + downloaded_filepath)
    processed_file_path = main_processing(output_folder, downloaded_filepath,file_type, unique_id)
    return processed_file_path, output_folder,logger
@app.route('/api/test_zbparse', methods=['POST'])
def test_zbparse():
    """Debug endpoint: streams canned segments so clients can exercise
    the SSE protocol without parsing a real document."""
    try:
        return Response(stream_with_context(test_process_and_stream()), content_type='text/event-stream')
    except Exception as e:
        app.logger.error('Exception occurred: ' + str(e))
        return jsonify({'error': str(e)}), 500
def test_process_and_stream():
    """Yield five canned SSE data frames, one every five seconds, then an
    'END' sentinel frame — mirrors process_and_stream's wire format."""
    # Five mock result segments.
    data_segments = [
        {"base_info": {"project_name": "测试项目1", "project_code": "TP001"}},
        {"review_standards": ["标准1", "标准2", "标准3"]},
        {"evaluation_standards": ["评估标准A", "评估标准B"]},
        {"invalid_requirements": ["无效要求X", "无效要求Y"]},
        {"bidding_documents_requirements": ["文件要求1", "文件要求2"]}
    ]
    filename = "test_file.pdf"
    for i, data in enumerate(data_segments, 1):
        response = {
            'message': f'Processing segment {i}',
            'filename': filename,
            'data': data
        }
        yield f"data: {json.dumps(response)}\n\n"
        time.sleep(5)  # pause between segments (5 s; old comment said 2 s)
    # Closing sentinel frame.
    final_response = {
        'message': 'File processed successfully',
        'filename': filename,
        'data': 'END'
    }
    yield f"data: {json.dumps(final_response)}\n\n"
# def generate_response(final_json_path,logger):
# if not os.path.exists(final_json_path):
# logger.error('JSON file not found at path: ' + final_json_path)
# return jsonify({'error': 'JSON file not found'}), 404
# with open(final_json_path, 'r', encoding='utf-8') as f:
# logger.info('final_json_path:'+final_json_path)
# zbparse_data = json.load(f)
# json_str = json.dumps(zbparse_data, ensure_ascii=False)
# return jsonify({
# 'message': 'File uploaded and processed successfully',
# 'filename': os.path.basename(final_json_path),
# 'data': json_str
# })
# @app.route('/get_json', methods=['POST'])
# def testjson():
# final_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\temp4\\fd55f067-2cf6-475c-b7ce-4498f6606bf6\\final_result.json"
# with open(final_json_path, 'r', encoding='utf-8') as f:
# print('final_json_path:'+final_json_path)
# zbparse_data = json.load(f)
# json_str = json.dumps(zbparse_data, ensure_ascii=False)
# print(json_str)
# return jsonify({
# 'message': 'File uploaded and processed successfully',
# 'filename': os.path.basename(final_json_path),
# 'data': json_str
# })
def remove_directory(path):
    """Recursively delete *path*; the outcome is logged on the global
    Flask app logger and failures are swallowed after logging."""
    try:
        shutil.rmtree(path)
        app.logger.info(f"Successfully removed directory: {path}")  # global logger
    except Exception as e:
        app.logger.error(f"Failed to remove directory {path}: {str(e)}")  # global logger
if __name__ == '__main__':
app.run(debug=True, host='0.0.0.0', port=5000)

View File

@ -0,0 +1,87 @@
from docx import Document
import json
def read_tables_from_docx(file_path):
    """Read every table in the DOCX file and return it as a nested dict.

    Each row's leading cells become nested keys (an empty cell inherits
    the previous row's title at the same column, approximating merged
    cells); the last cell of a row is stored as the value under the
    innermost key.
    """
    doc = Document(file_path)
    table_list = {}
    cur_title = []  # per-column titles of the previous row
    for table in doc.tables:
        for i, row in enumerate(table.rows):
            cur_level = table_list  # re-walk from the root for every row
            temp_title = []
            for j, cell in enumerate(row.cells):
                text_str = cell.text.strip().replace(' ', '').replace('\n', '')  # keep newlines out of keys
                if j < len(row.cells) - 1:
                    # Title cell: inherit the previous row's title when empty.
                    if text_str == "":
                        text_str = cur_title[j] if j < len(cur_title) else "<未识别到上级标题>"
                    if text_str not in cur_level:
                        cur_level[text_str] = {}
                    cur_level = cur_level[text_str]
                    temp_title.append(text_str)
                else:
                    # Value cell (last column).
                    cell_text = cell.text.strip().replace(' ', '')
                    if len(temp_title) > 0:
                        last_key = temp_title[-1]
                        # NOTE(review): cur_level is already the dict nested
                        # under last_key, so this writes a child entry named
                        # like its parent; flatten_nested_dicts later
                        # collapses the resulting {k: {k: v}} shape.
                        if last_key in cur_level:
                            if isinstance(cur_level[last_key], dict):
                                cur_level[last_key] = f"\n{cell_text}"
                            else:
                                cur_level[last_key] += f"\n{cell_text}"  # append to the existing value
                        else:
                            cur_level[last_key] = cell_text  # first value for this key
                    else:
                        # Single-cell row: key it by its row number.
                        last_key = f"{i}行内容"
                        if last_key in cur_level:
                            if isinstance(cur_level[last_key], dict):
                                cur_level[last_key] = f"\n{cell_text}"
                            else:
                                cur_level[last_key] += f"\n{cell_text}"  # append to the row entry
                        else:
                            cur_level[last_key] = cell_text
            cur_title = temp_title[:]  # remember this row's titles for the next row
    return table_list
def flatten_nested_dicts(d):
    """Collapse redundant single-entry nested dicts in place.

    {k: {k: v}} becomes {k: v}, and {k: {x: x}} becomes {k: x}; deeper
    dicts are flattened recursively first. Returns the same dict object.
    """
    doomed = []
    replacements = {}
    for key, value in list(d.items()):
        if not isinstance(value, dict):
            continue
        flattened = flatten_nested_dicts(value)
        if len(flattened) == 1:
            (inner_key, inner_val), = flattened.items()
            if inner_key == key:
                # Child repeats its parent's key: lift the value up.
                doomed.append(key)
                replacements[key] = inner_val
            elif inner_key == inner_val:
                # Degenerate {x: x} entry: keep just the value.
                replacements[key] = inner_val
    for key in doomed:
        del d[key]
    d.update(replacements)
    return d
def save_data_to_json(data, filename):
    """Serialize *data* to *filename* as pretty-printed UTF-8 JSON."""
    with open(filename, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=4)
def extract_tables_main(path, output_filename):
    """Read every table in the DOCX at *path*, flatten redundant nesting,
    and save the result as JSON to *output_filename*."""
    # Read the document's table data into a nested dict.
    table_data = read_tables_from_docx(path)
    # Collapse redundant single-entry nesting.
    flattened_data = flatten_nested_dicts(table_data)
    # Persist the flattened data as JSON.
    save_data_to_json(flattened_data, output_filename)
    print(f"The data has been processed and saved to '{output_filename}'.")
if __name__ == "__main__":
path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03_tobidders_notice_table.docx'
output_filename = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json" # 前附表json文件
extract_tables_main(path, output_filename)

54
flask_app/main/test.py Normal file
View File

@ -0,0 +1,54 @@
import json
import re
def transform_json(input_json):
    """Rebuild a flat {'1.': ..., '1.1': ...} numbered-clause mapping into
    a nested structure keyed by the first word of each clause text.

    NOTE(review): when an intermediate level was opened as a list, the
    assignment ``parent[new_key] = ...`` below raises TypeError (string
    index into a list), so multi-level inputs like {'6.': ..., '6.1': ...}
    crash — confirm the intended container types before relying on this.
    """
    data = json.loads(input_json)
    # Final nested result.
    result = {}
    # temp maps nesting depth -> the container currently open at that depth.
    temp = {0: result}
    for key, value in data.items():
        # Match up to three dotted numeric segments: '6', '6.1', '6.1.1'.
        match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', key)
        if match:
            levels = [int(l) for l in match.groups() if l is not None]
            # Parent container sits one level above this key's depth.
            parent = temp[len(levels) - 1]
            # "Deepest layer" only when all three capture groups matched.
            if len(levels) == len(match.groups()):
                if isinstance(parent, list):
                    parent.append(value)
                else:
                    parent[value.split()[0]] = value
            else:
                # Open a new level keyed by the clause's first word.
                new_key = value.split()[0]
                if new_key not in parent:
                    parent[new_key] = [] if isinstance(parent, dict) else {}
                temp[len(levels)] = parent[new_key]
    return json.dumps(result, ensure_ascii=False, indent=2)
# 输入的JSON字符串
input_json = '''{
"6.": "评标",
"6.1": "评标委员会",
"6.1.1": "评标由招标人依法组建的评标委员会负责。评标委员会由招标人或其委托的招标代理机构熟悉相关业务的代表,以及有关技术、经济等方面的专家组成。评标委员会成员人数以及技术、经济等方面专家的确定方式见投标人须知前附表。",
"6.1.2": "评标委员会成员有下列情形之一的应当回避1投标人或投标人的主要负责人的近亲属2项目主管部门或者行政监督部门的人员3与投标人有经济利益关系或其他利害关系可能影响对投标公正评审的4曾因在招标、评标以及其他与招标投标有关活动中从事违法行为而受过行政处罚或刑事处罚的。",
"6.2": "评标原则评标活动遵循公平、公正、科学和择优的原则。",
"6.3": "评标评标委员会按照第三章"评标办法"规定的方法、评审因素、标准和程序对投标文件进行评审。第三章"评标办法"没有规定的方法、评审因素和标准,不作为评标依据。",
"6.4": "评标结果定标候选人公示招标人将自收到评标报告之日起3日内在投标人须知前附表规定的媒介公示定标候选人。公示期不少于3日。投标人或者其他利害关系人对评标结果有异议的应当在评标结果公示期间提出。招标人自收到异议之日起3日内作出答复作出答复前暂停招标投标活动。异议与答复应当通过"电子交易系统""异议与答复"菜单以书面形式进行。",
"6.5": "履约能力的审查(如有)如果定标候选人的经营、财务状况发生较大变化或者存在违法行为,招标人认为可能影响其履约能力的,将在发出中标通知书前,召集原评标委员会按照招标文件规定的标准和方法审查确认。"
}'''
# 转换JSON并打印结果
print(transform_json(input_json))

18
flask_app/main/ttt.py Normal file
View File

@ -0,0 +1,18 @@
import re

# Compile once: chapter headings mentioning project/service/commercial
# requirements, e.g. "第一章 项目…要求".
pattern = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:项目|服务|商务).*?要求')

# Sample text used to exercise the pattern.
text = """
第一章项目技术服务及商务要求
第二章 服务细节要求
第三章 商务处理要求
第四章 项目安排要求
第五章 安全要求
"""

# Collect and print every heading the pattern matches.
matches = pattern.findall(text)
for heading in matches:
    print(heading)

View File

@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
# This file is auto-generated, don't edit it. Thanks.
import os
from alibabacloud_bailian20231229.client import Client as bailian20231229Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_bailian20231229 import models as bailian_20231229_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.client import Client as UtilClient
def create_client() -> bailian20231229Client:
    """Build a Bailian API client authenticated with the AK/SK pair taken
    from the environment (ALIBABA_CLOUD_ACCESS_KEY_ID / _SECRET).

    Raises KeyError when either environment variable is missing.
    """
    config = open_api_models.Config(
        access_key_id=os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID'],
        access_key_secret=os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET']
    )
    # Beijing-region Bailian endpoint.
    config.endpoint = 'bailian.cn-beijing.aliyuncs.com'
    return bailian20231229Client(config)
def delete_index(client: bailian20231229Client, workspace_id: str, index_id: str) -> None:
    """Delete the knowledge-base index *index_id* from *workspace_id*,
    printing either the API response or the error details."""
    delete_index_request = bailian_20231229_models.DeleteIndexRequest(
        index_id=index_id
    )
    runtime = util_models.RuntimeOptions()
    headers = {}
    try:
        response = client.delete_index_with_options(workspace_id, delete_index_request, headers, runtime)
        print("API Response:", response)
    except Exception as error:
        # NOTE(review): assumes a Tea SDK exception exposing .message and
        # .data; any other exception type would raise AttributeError here.
        print(error.message)
        print(error.data.get("Recommend"))
        UtilClient.assert_as_string(error.message)
async def delete_index_async(client: bailian20231229Client, workspace_id: str, index_id: str) -> None:
    """Async twin of delete_index: delete *index_id* from *workspace_id*,
    printing either the API response or the error details."""
    delete_index_request = bailian_20231229_models.DeleteIndexRequest(
        index_id=index_id
    )
    runtime = util_models.RuntimeOptions()
    headers = {}
    try:
        response = await client.delete_index_with_options_async(workspace_id, delete_index_request, headers, runtime)
        print("API Response:", response)
    except Exception as error:
        # NOTE(review): assumes a Tea SDK exception exposing .message and
        # .data; any other exception type would raise AttributeError here.
        print(error.message)
        print(error.data.get("Recommend"))
        UtilClient.assert_as_string(error.message)
if __name__ == '__main__':
workspace_id = os.environ['DASHSCOPE_WORKSPACE_ID']
index_id = 'pg5rrsv26x'
client = create_client()
delete_index(client, workspace_id, index_id)

View File

@ -0,0 +1,157 @@
# -*- encoding:utf-8 -*-
import json
import os.path
import re
from json_utils import extract_content_from_json # 可以选择性地导入特定的函数
from 提取打勾符号 import read_pdf_and_judge_main
from 通义千问 import qianwen_ask
from 通义千问long import qianwen_long,upload_file
#调用qianwen-ask之后组织提示词问百炼。
def construct_judge_questions(json_data):
    """Build a follow-up question covering every key whose value is
    '未知' (unknown) in the model's previous JSON answer; return "" when
    nothing is left to ask."""
    parsed = extract_content_from_json(json_data)
    if not parsed:
        return ""
    unknown_keys = [f"'{key}'" for key, value in parsed.items() if value == '未知']
    if not unknown_keys:
        return ""
    # Same key names with the surrounding quotes stripped.
    questions_without_quotes = ', '.join(key.strip("'") for key in unknown_keys)
    if not questions_without_quotes:
        return ""
    keys_str = ",".join(unknown_keys)
    return f"请你依据文档中的信息回答,{questions_without_quotes}请按json格式给我提供信息键名分别为{keys_str},键值仅限于'','','未知'"
def merge_json_to_list(merged):
    """Translate the yes/no/unknown answer dict into (chosen_numbers,
    merged_json_str): "yes" answers enqueue follow-up question numbers,
    "no" answers are rewritten into their final display keys.

    NOTE(review): every if/elif pair below compares the same key against
    '' in both branches, so each elif is unreachable — the yes/no literal
    values (presumably '是'/'否') appear to have been lost; restore them
    before trusting this logic.
    """
    chosen_numbers = []
    # Subcontracting — the question key is kept up to here because a
    # '未知' (unknown) answer is possible.
    if merged.get('是否允许分包') == '':
        chosen_numbers.append(1)
        merged.pop('是否允许分包', None)
    elif merged.get('是否允许分包') == '':
        merged['分包'] = '不允许'
        merged.pop('是否允许分包', None)
    # Bid security deposit.
    if merged.get('是否递交投标保证金') == '':
        chosen_numbers.extend([2, 3])
        merged.pop('是否递交投标保证金', None)
    elif merged.get('是否递交投标保证金') == '':
        merged['投标保证金'] = '不提交'
        merged['退还投标保证金'] = '/'
        merged.pop('是否递交投标保证金', None)
    # Performance bond.
    if merged.get('是否有履约保证金') == '':
        chosen_numbers.append(4)
        merged.pop('是否有履约保证金', None)
    elif merged.get('是否有履约保证金') == '':
        merged['履约保证金'] = '不提交'
        merged.pop('是否有履约保证金', None)
    # Tendering-agent service fee.
    if merged.get('是否有招标代理服务费') == '':
        chosen_numbers.append(5)
        merged.pop('是否有招标代理服务费', None)
    elif merged.get('是否有招标代理服务费') == '':
        merged['招标代理服务费'] = ''
        merged.pop('是否有招标代理服务费', None)
    # Site inspection.
    if merged.get('是否组织踏勘现场') == '':
        chosen_numbers.append(6)
        merged.pop('是否组织踏勘现场',None)
    elif merged.get('是否组织踏勘现场') == '':
        merged['踏勘现场']='不组织'
        merged.pop('是否组织踏勘现场', None)
    # Pre-bid meeting.
    if merged.get('是否召开投标预备会') == '':
        chosen_numbers.append(7)
        merged.pop('是否召开投标预备会',None)
    elif merged.get('是否召开投标预备会') == '':
        merged['投标预备会']='不召开'
        merged.pop('是否召开投标预备会', None)
    # Deviations.
    if merged.get('是否允许偏离') == '':
        chosen_numbers.append(8)
        merged.pop('是否允许偏离',None)
    elif merged.get('是否允许偏离') == '':
        merged['偏离']='不允许'
        merged.pop('是否允许偏离', None)
    return chosen_numbers, json.dumps(merged,ensure_ascii=False)
def read_questions_from_judge(file_path, indices):
    """Read the numbered prompt file at *file_path* and return the
    question texts whose numbers appear in *indices*."""
    with open(file_path, 'r', encoding='utf-8') as fh:
        content = fh.read()
    # Two alternatives: a numbered question terminated by the
    # '#pdf提取之后的提示词' marker, or one terminated by the next number/EOF.
    pattern = r'(\d+)\.(.*?)#pdf提取之后的提示词|(\d+)\.(.*?)(?=\d+\.|$)'
    wanted = []
    for groups in re.findall(pattern, content, re.DOTALL):
        number = groups[0] or groups[2]
        question_text = groups[1].strip() or groups[3].strip()
        if int(number) in indices:
            wanted.append(question_text)
    return wanted
def judge_whether_main(file_path,output_folder):  # expects the tender file's "Instructions to Bidders" front table
    """Run the yes/no question battery against the bid document.

    Pipeline: extract checked tick-boxes from the PDF, ask the plain
    qianwen model for yes/no/unknown answers, then re-ask any '未知'
    (unknown) items against the full document via qianwen-long.
    Returns merge_json_to_list's (chosen_numbers, merged_json_str) pair.
    """
    prompt = "请你依据以上信息,回答以下问题:是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 是否需要递交投标保证金是否需要提交履约保证金履约担保是否有招标代理服务费请按json格式给我提供信息键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否递交投标保证金','是否提交履约保证金','是否有招标代理服务费',键值仅限于'','','未知',若存在矛盾信息,请回答'未知'"
    output_json_path = os.path.join(output_folder,'judge_exist.json')
    read_pdf_and_judge_main(file_path, output_json_path)  # extract tick/check symbols from the PDF
    qianwen_answer = qianwen_ask(output_json_path, prompt)  # plain qianwen: yes/no/unknown per question
    print("qianwen_answer:" + qianwen_answer)
    user_query = construct_judge_questions(qianwen_answer)  # collect keys still answered "unknown"
    # Only re-ask when at least one answer is still unknown.
    if user_query:
        print("user_query:" + user_query)
        file_id = upload_file(file_path)
        res = qianwen_long(file_id, user_query)  # pass the whole front table to qianwen-long
        print(res)
        return process_judge_content(qianwen_answer, res)
    else:
        print("Normal sig!No valid user query available. Skipping further actions.")
        original = extract_content_from_json(qianwen_answer)
        return merge_json_to_list(original)
def process_judge_content(original_json, update_json):
    """Overlay *update_json* onto *original_json* (both JSON-ish strings)
    and convert the merged dict into the (chosen_numbers, json) pair."""
    base = extract_content_from_json(original_json)
    base.update(extract_content_from_json(update_json))
    print(base)
    return merge_json_to_list(base)
if __name__ == "__main__":
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\test2\\zbtest10_tobidders_notice_table.pdf"
output_dir="C:\\Users\\Administrator\\Desktop\\招标文件\\output1"
chosen_numbers, merged=judge_whether_main(file_path,output_dir)
print(chosen_numbers)
print(merged)

View File

@ -0,0 +1,43 @@
import json
from json_utils import clean_json_string, combine_json_results
from 通义千问long import upload_file, qianwen_long
def combine_business_and_bidding(data):
    """Move the bid-price section (投标报价, falling back to
    投标报价评审标准) inside the commercial section (商务标) and return
    the reshaped dict."""
    # Pull out the commercial section, if present.
    commercial = data.pop("商务标", {})
    # Try the two possible names for the bid-price section.
    price = data.pop("投标报价", None)
    if price is None:
        price = data.pop("投标报价评审标准", {})
    # Nest the price section under the commercial one and put it back.
    commercial["投标报价"] = price
    data["商务标"] = commercial
    return data
def combine_evaluation_standards(truncate2):
    """Upload the evaluation-method PDF, ask qianwen-long for the
    technical/commercial/price scoring criteria, reshape the answer with
    combine_business_and_bidding, and return it as a pretty-printed JSON
    string."""
    # Technical + commercial scoring items via qianwen-long.
    print("starting商务标技术标...")
    file_id = upload_file(truncate2)
    user_query_2 = (
        "根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
    evaluation_res = qianwen_long(file_id, user_query_2)
    # Reshape: nest the bid-price section under the commercial section.
    update_json=combine_business_and_bidding(clean_json_string(evaluation_res))
    update_json_str = json.dumps(update_json, ensure_ascii=False)
    temp=combine_json_results([update_json_str])
    evaluation_combined_res = json.dumps(temp,ensure_ascii=False,indent=4)
    print("商务标技术标done")
    return evaluation_combined_res
if __name__ == "__main__":
truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\output\\zbfile_evaluation_method.pdf"
res=combine_evaluation_standards(truncate2)
print(res)

View File

@ -0,0 +1,216 @@
#基于多线程提问,现已废弃
# assistant_id
import queue
import concurrent.futures
from dashscope import Assistants, Messages, Runs, Threads
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
from json_utils import extract_content_from_json
# Prompt templates sent verbatim to the assistant (kept in Chinese on purpose;
# ${document1} is substituted by the RAG pipeline).
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
## 技能
### 技能 1文档解析与摘要
- 深入理解并分析${document1}的内容提取关键信息
- 根据需求生成简洁明了的摘要保持原文核心意义不变
### 技能 2信息检索与关联
- ${document1}中高效检索特定信息或关键词
- 能够识别并链接到文档内部或外部的相关内容增强信息的连贯性和深度
## 限制
- 所有操作均需基于${document1}的内容不可超出此范围创造信息
- 在处理敏感或机密信息时需遵守严格的隐私和安全规定
- 确保所有生成或改编的内容逻辑连贯无误导性信息
请注意上述技能执行时将直接利用并参考${document1}的具体内容以确保所有产出紧密相关且高质量
"""
# Short system instruction used when creating assistants.
prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}'
# Extract the content between the body tag (【正文】) and the document-name tag.
def extract_content_between_tags(text):
    """Extract the passages between each 【正文】 tag and the following
    【文档名】 (or 'file_ids') marker in a run-status dump.

    Args:
        text: raw string representation of the run status.

    Returns:
        List of stripped passage strings, one per 【正文】 occurrence.
        Empty list when the tag never appears.
    """
    results = []
    # Split on the body tag; the text before the first tag carries no passage.
    parts = text.split('【正文】')[1:]
    for part in parts:
        doc_name_index = part.find('【文档名】')
        file_ids_index = part.find("'file_ids'")
        # A passage ends at the document-name tag when present, otherwise at
        # the 'file_ids' marker, otherwise at the end of the chunk.
        if doc_name_index != -1:
            end_index = doc_name_index
        elif file_ids_index != -1:
            end_index = file_ids_index
        else:
            end_index = len(part)
        results.append(part[:end_index].strip())
    # The trailing chunk may still contain metadata after 'file_ids'; trim the
    # last passage back to that marker.  Guard against an empty split result,
    # which previously raised IndexError on parts[-1].
    if parts and "'file_ids'" in parts[-1]:
        file_ids_index = parts[-1].find("'file_ids'")
        if file_ids_index != -1:
            results[-1] = parts[-1][:file_ids_index].strip()
    return results
def find_references_in_extracted(formatted_ans, extracted_references):
    """Map each leaf value of a (possibly nested) answer dict to the index of
    the reference passage that contains it.

    Args:
        formatted_ans: nested dict of answer values.
        extracted_references: list of reference passage strings.

    Returns:
        Dict keyed by dot-joined paths; the value is the matching reference
        index, -1 for the literal '未知', or None when no reference matches.
    """
    results = {}

    def recurse_through_dict(current_dict, path=None):
        # Fresh list per top-level call: the previous mutable default (path=[])
        # is a shared-state hazard even if it happened not to be mutated here.
        path = [] if path is None else path
        for key, value in current_dict.items():
            if isinstance(value, dict):
                recurse_through_dict(value, path + [key])
            else:
                dotted = '.'.join(path + [key])
                if value == '未知':
                    # Unknown values are flagged with -1 rather than searched.
                    results[dotted] = -1
                else:
                    # str() so non-string leaves (numbers, ...) can be matched.
                    for index, reference in enumerate(extracted_references):
                        if str(value) in reference:
                            results[dotted] = index
                            break
                    else:
                        results[dotted] = None

    recurse_through_dict(formatted_ans)
    return results
def send_message(assistant, message='百炼是什么?'):
    """Run *message* through *assistant* on a fresh thread.

    Returns a tuple (ans, extracted_references): the ordered list of message
    texts in the thread and the reference passages cited by the run.
    """
    ans = []
    print(f"Query: {message}")
    # create thread.
    thread = Threads.create()
    print(thread)
    # create a message.
    message = Messages.create(thread.id, content=message)
    # create run
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print(run)
    # wait for run completed or requires_action
    run_status = Runs.wait(run.id, thread_id=thread.id)
    print(run_status)
    reference_txt = str(run_status)
    extracted_references = extract_content_between_tags(reference_txt)  # list of cited source passages
    # get the thread messages.
    msgs = Messages.list(thread.id)
    for message in msgs['data'][::-1]:  # reversed so answers come oldest-first
        ans.append(message['content'][0]['text']['value'])
    return ans, extracted_references
def rag_assistant(knowledge_name):
    """Create a qwen-max assistant wired to the RAG pipeline of *knowledge_name*."""
    retriever = DashScopeCloudRetriever(knowledge_name)
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        temperature='0.3',
        instructions=prom,
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                # RAG tool bound to the knowledge base's retrieval pipeline.
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }
                        }
                    }
                }
            }]
    )
    return assistant
def pure_assistant():
    """Create a plain qwen-max assistant with no knowledge-base attachment."""
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,能基于用户的要求精准简洁地回答用户的提问',
        instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问',
        tools=[
            {
                "type": "code_interpreter"
            },
        ]
    )
    return assistant
def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True):
    """Answer one question with either the RAG assistant or the plain
    assistant, print the cited references and their match indices, and push
    (ans_index, (question, ans)) onto *result_queue*."""
    if use_rag:
        assistant = rag_assistant(knowledge_name)
    else:
        assistant = pure_assistant()
    ans,extracted_references = send_message(assistant, message=question)
    for index, reference in enumerate(extracted_references, start=0):
        print(f"{index}. {reference}")
    # ans[1] is expected to hold the model's JSON answer.
    formatted_ans=extract_content_from_json(ans[1])
    print(formatted_ans)
    results = find_references_in_extracted(formatted_ans, extracted_references)
    for key, index in results.items():
        print(f"{key}: Found at index {index}")
    result_queue.put((ans_index, (question, ans)))  # queue entry: (index, (question, ans))
def multi_threading(queries, knowledge_name, use_rag=True):
    """Answer *queries* concurrently (3 workers) and return the answers in the
    original question order as a list of (question, ans) tuples."""
    result_queue = queue.Queue()
    # Manage the worker threads with a ThreadPoolExecutor.
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # Keep each Future keyed to its query index so results can be re-ordered.
        future_to_query = {executor.submit(llm_call, query, knowledge_name, result_queue, index, use_rag): index for
                           index, query in enumerate(queries)}
        # Wait for every thread to finish.
        for future in concurrent.futures.as_completed(future_to_query):
            index = future_to_query[future]
            # llm_call already queued its result; this only surfaces exceptions.
            try:
                future.result()  # raises if the worker raised
            except Exception as exc:
                print(f"Query {index} generated an exception: {exc}")
    # Drain the queue and slot each result back into question order.
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result
    return results
if __name__ == "__main__":
    # Manual smoke test: ask the base-info question against a knowledge base.
    questions = ["该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'"]
    knowledge_name = "招标解析5word"
    results = multi_threading(questions, knowledge_name, use_rag=True)
    # Print the results.
    for question, response in results:
        print(f"Question: {question}")
        print(f"Response: {response}")

View File

@ -0,0 +1,135 @@
from json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results
from 投标人须知正文提取指定内容 import extract_from_notice
from 判断是否分包等 import judge_whether_main, read_questions_from_judge
from 多线程提问 import read_questions_from_file, multi_threading
from 通义千问long import upload_file
def combine_basic_info(baseinfo_list):
    """Regroup the collected base-info JSON strings into fixed thematic groups.

    Args:
        baseinfo_list: list of JSON strings, each holding part of the answers.

    Returns:
        List of JSON strings, one per group, nested under the group name.
    """
    combined_baseinfo_list = []
    # Static mapping: group name -> keys that belong in it.
    key_groups = {
        "招标人/代理信息": ["招标人","招标人联系方式", "招标代理机构","招标代理机构联系方式"],
        "项目信息": ["工程名称", "招标编号","工程概况","招标范围","招标控制价","投标竞争下浮率","是否接受联合体投标"],
        "关键时间/内容":["投标文件递交截止日期","递交方式","投标人要求澄清招标文件的截止时间","投标有效期","评标结果公示媒介"],
        "保证金相关":['质量保证金','退还投标保证金'],
        "其他信息":["重新招标、不再招标和终止招标","是否退还投标文件","费用承担"]
    }
    # Merge every base-info fragment into one flat dict.
    combined_data = {}
    relevant_keys_detected = set()
    # First pass: record which keys actually occur so the groups can be adjusted.
    for baseinfo in baseinfo_list:
        json_data = clean_json_string(baseinfo)
        combined_data.update(json_data)
        relevant_keys_detected.update(json_data.keys())
    # for key in relevant_keys.keys():
    #     if key in json_data:
    #         relevant_keys[key] = True
    # Adjust key_groups according to the keys that were detected.
    dynamic_key_handling(key_groups, relevant_keys_detected)
    # Debug aid: show the groups after the dynamic adjustment.
    print("Updated key_groups after dynamic handling:")
    print(key_groups)
    # Build one JSON per group from the merged dict; missing keys get "未提供".
    for group_name, keys in key_groups.items():
        group_data = {key: combined_data.get(key, "未提供") for key in keys}
        combined_json = nest_json_under_key(group_data, group_name)
        combined_baseinfo_list.append(combined_json)
    return combined_baseinfo_list
def dynamic_key_handling(key_groups, detected_keys):
    """Route dynamically detected keys into the matching group of *key_groups*.

    *key_groups* (group name -> list of keys) is mutated in place; keys that
    match no routing fragment are ignored.
    """
    # Ordered substring -> target-group routing table; the first hit wins,
    # mirroring the original if/elif cascade.
    routing = (
        ("投标保证金", "保证金相关"),
        ("履约保证金", "保证金相关"),
        ("联合体投标要求", "项目信息"),
        ("分包", "项目信息"),
        ("踏勘现场", "其他信息"),
        ("投标预备会", "其他信息"),
        ("偏离", "其他信息"),
    )
    for key in detected_keys:
        for fragment, group in routing:
            if fragment in key:
                key_groups[group].append(key)
                break
def judge_consortium_bidding(baseinfo_list):
    """Return True when any base-info entry declares that consortium bidding
    is accepted.

    Each entry of *baseinfo_list* is a JSON string; the flag key is
    "是否接受联合体投标" and the accepting value is "是".
    """
    for baseinfo in baseinfo_list:
        json_data = clean_json_string(baseinfo)
        # The accepting value is "是".  The previous code compared against the
        # empty string, contradicting its own comment and never matching a
        # real answer.
        if json_data.get("是否接受联合体投标") == "是":
            return True
    return False
def project_basic_info(knowledge_name,truncate0,output_folder,clause_path):  # bidder-notice front table
    """Collect the project's basic information by querying the knowledge base
    and qianwen-long, then regroup everything under the key "基础信息".

    Args:
        knowledge_name: name of the RAG knowledge base.
        truncate0: path to the truncated bidder-notice PDF.
        output_folder: working directory for intermediate artefacts.
        clause_path: path to the extracted clause JSON.

    Returns:
        JSON string nested under the key "基础信息".
    """
    # Ask the LLM the pre-defined base-info questions.
    print("starting基础信息...")
    baseinfo_list = []
    # baseinfo_file_path='../static/提示词/前两章提问总结.txt'
    baseinfo_file_path = 'static/提示词/前两章提问总结.txt'  # path to the question list
    questions = read_questions_from_file(baseinfo_file_path)
    res1 = multi_threading(questions, knowledge_name)
    for _, response in res1:  # _ is the question; response[0] is also the question, response[1] the answer
        try:
            if response and len(response) > 1:  # response exists and has at least two elements
                baseinfo_list.append(response[1])
            else:
                print(f"Warning: Missing or incomplete response data for query index {_}.")
        except Exception as e:
            print(f"Error processing response for query index {_}: {e}")
    print("basic信息done...")
    # Decide whether sub-contracting, bid bonds, etc. apply to this tender.
    chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
    baseinfo_list.append(merged)
    # judge_file_path = '../static/提示词/是否相关问题.txt'
    judge_file_path ='static/提示词/是否相关问题.txt'
    judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
    judge_consortium = judge_consortium_bidding(baseinfo_list)  # tender notice accepts consortium bidding?
    if judge_consortium:
        judge_consortium_question = "该招标文件对于联合体投标的要求是怎样的请按json格式给我提供信息外层键名为'联合体投标要求'"
        judge_questions.append(judge_consortium_question)
    file_id=upload_file(truncate0)
    res2 = multi_threading(judge_questions, "",file_id,2)  # query qianwen-long
    if not res2:
        print("errror!")
    else:
        # Collect the answers.
        for question, response in res2:
            baseinfo_list.append(response)
    # for _, response in res2:  # bailian-RAG variant, kept for reference
    #     try:
    #         if response and len(response) > 1:
    #             baseinfo_list.append(response[1])
    #         else:
    #             print(f"Warning: Missing or incomplete response data for query index {_}.")
    #     except Exception as e:
    #         print(f"Error processing response for query index {_}: {e}")
    rebidding_situation = extract_from_notice(clause_path, 3)  # "重新招标, 不再招标和终止招标" comes from the notice body
    update_json=rename_outer_key(rebidding_situation,"重新招标、不再招标和终止招标")
    baseinfo_list.append(update_json)
    update_baseinfo_list=combine_basic_info(baseinfo_list)  # core step: regroup the base info
    baseinfo_combined_res = combine_json_results(update_baseinfo_list)  # returns a dict
    print("基础信息done")
    return nest_json_under_key(baseinfo_combined_res, "基础信息")  # returns a JSON string
if __name__ == "__main__":
    # Manual smoke test for the base-info pipeline.
    knowledge_name = "ztb"
    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\test2"
    truncate0="C:\\Users\\Administrator\\Desktop\\招标文件\\test2\\zbtest10_tobidders_notice_table.pdf"
    clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\clause1.json"
    res=project_basic_info(knowledge_name,truncate0,output_folder,clause_path)
    print(res)

View File

@ -0,0 +1,373 @@
# -*- coding: utf-8 -*-
import os
import shutil
from PyPDF2 import PdfReader, PdfWriter
def validate_pdf(file_path):
    """Return True when *file_path* opens as a PDF with at least one page."""
    try:
        with open(file_path, "rb") as file:
            return len(PdfReader(file).pages) > 0
    except Exception as e:
        # Treat any parse/IO failure as a corrupt file rather than crashing.
        print(f"Error reading PDF {file_path}: {str(e)}")
        return False
def truncate_pdf(source_path, target_path, max_pages=15):
    """Copy at most the first *max_pages* pages of *source_path* to *target_path*."""
    try:
        with open(source_path, "rb") as infile:
            reader = PdfReader(infile)
            writer = PdfWriter()
            page_count = min(max_pages, len(reader.pages))
            for page_idx in range(page_count):
                writer.add_page(reader.pages[page_idx])
            with open(target_path, "wb") as outfile:
                writer.write(outfile)
    except Exception as e:
        # Best-effort: log and continue so one bad PDF doesn't stop the batch.
        print(f"Error processing PDF {source_path}: {str(e)}")
def copy_file(src, dest):
    """Copy a single file *src* to *dest*, creating the destination directory
    as needed; metadata is preserved (copy2)."""
    destination_dir = os.path.dirname(dest)
    os.makedirs(destination_dir, exist_ok=True)
    shutil.copy2(src, dest)
def copy_directory(source, destination):
    """Recursively copy the directory tree *source* into *destination*."""
    os.makedirs(destination, exist_ok=True)
    for entry in os.listdir(source):
        src_path = os.path.join(source, entry)
        dst_path = os.path.join(destination, entry)
        # Files are copied directly; sub-directories recurse.
        if os.path.isfile(src_path):
            copy_file(src_path, dst_path)
        else:
            copy_directory(src_path, dst_path)
def unique_file_name(base_path, file_name):
    """Return *file_name*, or a 'name_N.ext' variant that does not yet exist
    under *base_path*."""
    stem, ext = os.path.splitext(file_name)
    candidate = file_name
    suffix = 1
    # Bump the numeric suffix until the candidate name is free.
    while os.path.exists(os.path.join(base_path, candidate)):
        candidate = f"{stem}_{suffix}{ext}"
        suffix += 1
    return candidate
def process_pdf_folders(source_dir, target_dir):
    """Mirror every valid PDF under *source_dir* into *target_dir* (keeping the
    relative layout) and prune directories that end up empty."""
    for root, dirs, files in os.walk(source_dir, topdown=False):
        for file in files:
            if not file.lower().endswith('.pdf'):
                continue
            source_file_path = os.path.join(root, file)
            if validate_pdf(source_file_path):
                relative_path = os.path.relpath(root, source_dir)
                destination_dir = os.path.join(target_dir, relative_path)
                copy_file(source_file_path, os.path.join(destination_dir, file))
            else:
                # NOTE(review): the message claims deletion, but the corrupt
                # file is only skipped, not removed.
                print(f"Deleted corrupt file: {source_file_path}")
        # Bottom-up walk: safe to drop directories that are now empty.
        if not os.listdir(root):
            os.rmdir(root)
def classify_folders(source_dir, target_dir, project_index):
    """Classify the sub-folders of one project into the target layout.

    The folder containing zb.pdf is treated as the tender folder, "输出文件"
    as the evaluation output, and everything else as bid folders; finally the
    source directory is removed.
    """
    temp_dir = os.path.join(source_dir, 'temp')
    os.makedirs(temp_dir, exist_ok=True)
    target_project_dir = None
    # Previously unbound when no folder contained zb.pdf, which raised
    # UnboundLocalError at the process_remaining_folders call below.
    target_zb_name = None
    processed_dirs = set()  # Set to track processed directories
    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue
        files = [f.lower() for f in os.listdir(subdir_path)]
        if 'zb.pdf' in files:
            target_project_dir, target_zb_name = process_tender_files(subdir_path, temp_dir, target_dir, project_index)
            processed_dirs.add(subdir_path)  # Mark this directory as processed
        elif subdir.lower() == "输出文件":
            # NOTE(review): listdir order is arbitrary — if the output folder is
            # seen before the tender folder, target_project_dir is still None
            # here; confirm the intended ordering with the caller.
            process_evaluation_files(subdir_path, target_project_dir)
            processed_dirs.add(subdir_path)  # Mark this directory as processed
    # Process remaining folders, skipping already processed ones.
    process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name)
    # Remove the temp directory (currently disabled):
    # if os.path.exists(temp_dir):
    #     shutil.rmtree(temp_dir)
    if os.path.exists(source_dir):
        shutil.rmtree(source_dir)
def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
    """Classify the tender PDF (zb.pdf), rename it after its detected name,
    and copy the whole folder into the per-category project directory.

    Returns:
        (target_project_dir, tender file base name without extension)
    """
    zb_path = os.path.join(subdir_path, "zb.pdf")
    truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
    truncate_pdf(zb_path, truncated_zb_path, 30)  # Truncate to the first 30 pages
    bot_response = file_parse(truncated_zb_path, project_index, 1)
    zb_response = extract_answer(bot_response[1], 1)
    zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-')  # use '-' instead of slashes: directory names must not contain them
    target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]
    new_zb_path = os.path.join(subdir_path, target_zb_name)
    os.rename(zb_path, new_zb_path)
    # Layout: target_dir/<category>/<project name>_<project id>/招标文件夹
    target_category_dir = os.path.join(target_dir, category)
    target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
    copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))
    os.remove(truncated_zb_path)
    shutil.rmtree(subdir_path)
    return target_project_dir, zb_response[2]
def process_evaluation_files(subdir_path, target_project_dir):
    """Move the evaluation output folder into the project's 评标文件夹."""
    copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹"))
    shutil.rmtree(subdir_path)
def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
    """Classify the remaining project sub-folders as bid folders.

    Each file is truncated, sent to the classifier, renamed after its detected
    section, and the whole folder is then copied into the bidder's directory
    under 投标文件夹.
    """
    target_tb_dir = os.path.join(target_project_dir, "投标文件夹")
    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
            continue  # Skip plain files and already-processed directories
        target_tbcom_dir = None  # resolved from the first successfully classified file
        for item in os.listdir(subdir_path):
            item_src_path = os.path.join(subdir_path, item)
            truncated_tb_path = os.path.join(temp_dir, "truncate_" + item)
            truncate_pdf(item_src_path, truncated_tb_path)
            bot_response = file_parse(truncated_tb_path, project_index, 2)
            tb_response = extract_answer(bot_response[1], 2)
            if not tb_response:
                # No classification: remove the truncated copy (previously
                # leaked on this path) and move on to the next file.
                if os.path.exists(truncated_tb_path):
                    os.remove(truncated_tb_path)
                continue
            # Initialize target_tbcom_dir only once, from the first classified file.
            if not target_tbcom_dir:
                target_tb_name, _ = tb_response
                if target_tb_name != target_zb_name and target_tb_name != "未知":
                    target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
                    print(target_tbcom_dir)
                    os.makedirs(target_tbcom_dir, exist_ok=True)
            tb_section = tb_response[1] + ".pdf"
            # Rename the source file to its detected section, avoiding name
            # clashes (the previous redundant first assignment was removed).
            new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
            os.rename(item_src_path, new_tb_path)
            # Remove temporary truncated file.
            os.remove(truncated_tb_path)
        # Copy the whole directory at once after all files are renamed.
        if target_tbcom_dir:
            copy_directory(subdir_path, target_tbcom_dir)
            shutil.rmtree(subdir_path)  # remove the original directory after copying
import re
def extract_answer(input_string, type):
    """Parse the classifier's key-value reply.

    type 1 -> [project name, project id, tenderer, category];
    type 2 -> [bidder, category], with the category forced to '商务文件' when
    the reply mentions '投标函'.  An empty list is returned when nothing
    matches; any other *type* yields None.
    """
    # Tender-file reply: name / id / tenderer / category; \s* tolerates
    # trailing spaces after each separator.
    pattern1 = r"项目名称[:]\s*(.*?)[;]\s*项目编号[:]\s*(.*?)[;]\s*招标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
    # Bid-file reply: bidder / category.
    pattern2 = r"投标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
    if type == 1:
        found = re.search(pattern1, input_string)
        if not found:
            print("No match found for type 1.")
            return []
        fields = [found.group(i).strip() for i in range(1, 5)]
        print(f"1: {fields[0]} 2: {fields[1]} 3: {fields[2]} 4: {fields[3]}")
        return fields
    elif type == 2:
        found = re.search(pattern2, input_string)
        if not found:
            print("No match found for type 2.")
            return []
        # A "投标函" category is normalised to the commercial document class.
        category = found.group(2).strip()
        if "投标函" in category:
            category = "商务文件"
        return [found.group(1).strip(), category]
from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from dashscope import Assistants, Messages, Runs, Threads
def send_message(assistant, index,documents,message='百炼是什么?'):
    """Run *message* through *assistant*, print and collect the replies, then
    remove *documents* from the knowledge *index* again.

    Returns the list of message texts in the thread (oldest first).
    """
    print(f"Query: {message}")
    # create a thread.
    thread = Threads.create()
    print(thread)
    # create a message.
    message = Messages.create(thread.id, content=message)
    # create run
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print("run:" + str(run))
    # get run status
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # wait for run completed or requires_action
    run_status = Runs.wait(run.id, thread_id=thread.id)
    # print(run_status)
    # if prompt input tool result, submit tool result.
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # verify_status_code(run_status)
    # get the thread messages.
    msgs = Messages.list(thread.id)
    # print(msgs)
    # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))
    ans = []
    print("运行结果:")
    for message in msgs['data'][::-1]:  # reversed so answers come oldest-first
        ans.append(message['content'][0]['text']['value'])
        print("content: ", message['content'][0]['text']['value'])
        print("\n")
    # The uploaded documents were only needed for this query; drop them again.
    deleteFileFromKnowledge(index,documents)
    return ans
def file_parse(filepath, knowledge_index, type):
    """Upload *filepath* into one of four rotating knowledge bases, ask the
    classification question selected by *type* (1 = tender file, 2 = bid file)
    and return the assistant's answer list.
    """
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    # Map the rotating index to a knowledge-base name.
    index_to_name = {
        0: "文件分类知识库0",
        1: "文件分类知识库1",
        2: "文件分类知识库2",
        3: "文件分类知识库3"
    }
    # Fall back to "0" when the index is unknown.
    cloud_index_name = index_to_name.get(knowledge_index, "0")
    index = DashScopeCloudIndex(cloud_index_name)
    index._insert(documents)
    retriever = index.as_retriever()
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-plus',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                # RAG tool bound to the knowledge base's retrieval pipeline.
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }
                        }
                    }
                }
            }]
    )
    # Question for tender files: name / id / tenderer / category.
    questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方包括原材料产品设备电能和固态液态气态物体等服务类招标又称为服务采购指的是除了工程和货之外的其他招标投标活动服物招标投标范围包含建设工程的勘察设计监理工程咨询评估科技项目科研课题物业管理金融保险服务等\
请按下列键值对格式给我提供信息,避免无关内容:项目名称:XXX;项目编号:XXX;招标人:XXX;类别XXX"
    # Question for bid files: bidder / document class.
    questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX"
    if (type == 1):
        questions = questions1
    elif (type == 2):
        questions = questions2
    return send_message(assistant,index,documents, message=questions)
def deleteFileFromKnowledge(index, documents):
    """Remove every document in *documents* from the knowledge *index* by id.

    *documents* must be a list of objects exposing an ``id_`` attribute;
    anything else results in an empty deletion request.
    """
    # Collect ids defensively: entries may lack the id_ attribute entirely.
    file_ids = []
    if isinstance(documents, list):
        candidate_ids = (getattr(document, 'id_', None) for document in documents)
        file_ids = [doc_id for doc_id in candidate_ids if doc_id]
    index.delete_ref_doc(file_ids)
import concurrent.futures
import logging

# Configure logging to the console only.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index):
    """Process one project folder end-to-end: stage valid PDFs into a private
    scratch directory, classify them into *final_directory*, then clean up."""
    logging.info(f"Starting to process directory: {subdir_path} with index {project_index}")
    # Give each worker its own scratch directory so parallel runs cannot collide.
    intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}")
    os.makedirs(intermediate_directory, exist_ok=True)
    logging.info(f"Created intermediate directory: {intermediate_directory}")
    # Stage, then classify; the classifier round-robins over 4 knowledge bases.
    process_pdf_folders(subdir_path, intermediate_directory)
    classify_folders(intermediate_directory, final_directory, project_index % 4)
    # Clean up the scratch directory once done.
    shutil.rmtree(intermediate_directory)
    logging.info(f"Deleted intermediate directory: {intermediate_directory}")
def main(base_directory, base_intermediate_directory, final_directory):
    """Fan the project sub-folders of *base_directory* out over a 4-thread pool,
    processing each one into *final_directory*."""
    os.makedirs(final_directory, exist_ok=True)
    logging.info(f"Final directory ensured at: {final_directory}")
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        project_index = 0
        for name in os.listdir(base_directory):
            subdir_path = os.path.join(base_directory, name)
            # Only directories are project folders; plain files are ignored.
            if not os.path.isdir(subdir_path):
                continue
            logging.info(f"Submitting job for directory: {subdir_path}")
            futures.append(executor.submit(
                process_directory, subdir_path, base_intermediate_directory,
                final_directory, project_index))
            project_index += 1
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # re-raises any worker exception
            except Exception as e:
                logging.error(f"Thread resulted in an error: {e}")
if __name__ == "__main__":
    # Batch entry point: classify every project folder under base_directory.
    base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
    base_intermediate_directory = 'D:\\tmp'
    final_directory = 'D:\\output'
    main(base_directory, base_intermediate_directory, final_directory)

View File

@ -0,0 +1,189 @@
# 基于知识库提问的通用模板,
# assistant_id
import re
import queue
import concurrent.futures
import time
from dashscope import Assistants, Messages, Runs, Threads
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
from 通义千问long import qianwen_long, upload_file
# Prompt templates sent verbatim to the assistant (kept in Chinese on purpose;
# ${documents} is substituted by the RAG pipeline).
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
## 技能
### 技能 1文档解析与摘要
- 深入理解并分析${documents}的内容提取关键信息
- 根据需求生成简洁明了的摘要保持原文核心意义不变
### 技能 2信息检索与关联
- ${documents}中高效检索特定信息或关键词
- 能够识别并链接到文档内部或外部的相关内容增强信息的连贯性和深度
## 限制
- 所有操作均需基于${documents}的内容不可超出此范围创造信息
- 在处理敏感或机密信息时需遵守严格的隐私和安全规定
- 确保所有生成或改编的内容逻辑连贯无误导性信息
请注意上述技能执行时将直接利用并参考${documents}的具体内容以确保所有产出紧密相关且高质量
"""
# Short system instruction used when creating assistants.
prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}'
def read_questions_from_file(file_path):
    """Read numbered questions ("1. question text") from a UTF-8 text file and
    return the question texts without their numbering."""
    numbered = re.compile(r'\d+\.')
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        # Keep only lines that start with "<digits>." and drop the numbering.
        return [line.split('.', 1)[1].strip() for line in stripped if numbered.match(line)]
# Helpers for querying the assistant and collecting the thread replies.
def send_message(assistant, message='百炼是什么?'):
    """Run *message* through *assistant* on a fresh thread and return the list
    of reply texts (oldest first)."""
    ans = []
    print(f"Query: {message}")
    # create thread.
    thread = Threads.create()
    print(thread)
    # create a message.
    message = Messages.create(thread.id, content=message)
    # create run
    run = Runs.create(thread.id, assistant_id=assistant.id)
    # print(run)
    # wait for run completed or requires_action
    run_status = Runs.wait(run.id, thread_id=thread.id)
    # print(run_status)
    # get the thread messages.
    msgs = Messages.list(thread.id)
    for message in msgs['data'][::-1]:  # reversed so answers come oldest-first
        ans.append(message['content'][0]['text']['value'])
    return ans
def rag_assistant(knowledge_name):
    """Create a qwen-max assistant wired to the RAG pipeline of *knowledge_name*."""
    retriever = DashScopeCloudRetriever(knowledge_name)
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        temperature='0.3',
        instructions="请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}",
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                # RAG tool bound to the knowledge base's retrieval pipeline.
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${documents}"
                            }
                        }
                    }
                }
            }]
    )
    return assistant
def pure_assistant():
    """Create a plain qwen-max assistant with no knowledge-base attachment."""
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,能基于用户的要求精准简洁地回答用户的提问',
        instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问',
        tools=[
            {
                "type": "code_interpreter"
            },
        ]
    )
    return assistant
def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type):
    """Answer one question with the backend selected by *llm_type* and push
    (ans_index, (question, answer)) onto *result_queue*.

    llm_type: 1 = bailian RAG assistant, 2 = qianwen-long against *file_id*,
    anything else = plain assistant.
    """
    if llm_type==1:
        assistant = rag_assistant(knowledge_name)
    elif llm_type==2:
        # qianwen-long answers directly from the uploaded file; no assistant needed.
        qianwen_res = qianwen_long(file_id,question)
        result_queue.put((ans_index,(question,qianwen_res)))
        return
    else :
        assistant = pure_assistant()
    ans = send_message(assistant, message=question)
    result_queue.put((ans_index, (question, ans)))  # queue entry: (index, (question, ans))
def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
    """Answer *queries* concurrently (10 workers, submissions staggered by 1s)
    and return the answers in question order as (question, answer) tuples.

    llm_type selects the backend (see llm_call): 1 = bailian RAG,
    2 = qianwen-long with *file_id*, other = plain assistant.
    """
    result_queue = queue.Queue()
    # Manage the worker threads with a ThreadPoolExecutor.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Submit one task at a time, sleeping 1s between submissions.
        future_to_query = {}
        for index, query in enumerate(queries):
            future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type)
            future_to_query[future] = index
            time.sleep(1)  # throttle: wait 1 second after each submission (API rate limit)
        # Wait for every task; llm_call queues its own result.
        for future in concurrent.futures.as_completed(future_to_query):
            index = future_to_query[future]
            # llm_call handles the result itself; this only surfaces exceptions.
            try:
                future.result()  # raises if the worker raised / confirms completion
            except Exception as exc:
                print(f"Query {index} generated an exception: {exc}")
    # Drain the queue and order results by question index.
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result
    return results
if __name__ == "__main__":
    # Manual smoke test: time the multi-threaded Q&A over the stock question list.
    start_time=time.time()
    # Load the question list.
    questions =read_questions_from_file('../static/提示词/前两章提问总结.txt')
    for i in questions:
        print(i)
    knowledge_name = "招标解析5word"
    llm_type=1
    results = multi_threading(questions, knowledge_name)
    end_time = time.time()
    if not results:
        print("errror!")
    else:
        print("elapsed time:"+str(end_time-start_time))
        # Print the results.
        for question, response in results:
            print(f"Question: {question}")
            print(f"Response: {response}")
    # qianwen-long variant kept for reference:
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf"
    # file_id = upload_file(file_path)
    # questions=["根据该文档中的评标办法前附表,请你列出该文件的技术标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的商务标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的投标报价,以json的格式返回结果"]
    # results=multi_threading(questions,"",file_id,2)  # 1 = bailian RAG, 2 = qianwen-long
    # if not results:
    #     print("errror!")
    # else:
    #     for question, response in results:
    #         print(f"Question: {question}")
    #         print(f"Response: {response}")

View File

@ -0,0 +1,66 @@
import fitz # PyMuPDF
import re
def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords):
    """
    Extract paragraphs from a PDF (body text and tables) that contain any of
    *keywords*.  When a matching paragraph also contains one of
    *follow_up_keywords*, keep collecting the following paragraphs until the
    next sibling section number is reached.

    :param pdf_path: path of the PDF file
    :param keywords: keywords that select a paragraph
    :param follow_up_keywords: keywords that trigger continued collection
    :return: list of matching paragraph texts
    """
    doc = fitz.open(pdf_path)
    extracted_paragraphs = []
    continue_collecting = False
    current_section_pattern = None
    for page in doc:
        text_blocks = page.get_text("blocks")
        for index, block in enumerate(text_blocks):
            text = block[4].strip()  # Text content of the block
            if text == "":  # Skip empty lines
                continue
            if continue_collecting:
                # Stop when the next sibling section heading appears.
                if current_section_pattern and re.match(current_section_pattern, text):
                    continue_collecting = False
                else:
                    extracted_paragraphs.append(text)
            if any(keyword in text for keyword in keywords):
                extracted_paragraphs.append(text)
                if any(follow_up in text for follow_up in follow_up_keywords):
                    continue_collecting = True
                    section_number = re.match(r'(\d+(\.\d+)*)', text)
                    if section_number:
                        current_section_number = section_number.group(1)
                        base_section_number = current_section_number.rsplit('.', 1)[0]
                        # Stop pattern: any sibling like "<base>.N".
                        current_section_pattern = re.compile(rf'^{re.escape(base_section_number)}\.\d+\b')
                    else:
                        # No number on this block: scan ahead for the next
                        # numbered block and stop at its pattern.
                        found_next_number = False
                        for next_index in range(index + 1, len(text_blocks)):
                            next_text = text_blocks[next_index][4].strip()
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                            if next_section_number:
                                found_next_number = True
                                current_section_pattern = re.compile(rf'^{next_section_number.group(1)}\b')
                            elif found_next_number:
                                break
    doc.close()
    return extracted_paragraphs
# Example usage
# NOTE(review): this runs at import time; the input is a .docx while fitz is
# primarily a PDF reader — confirm the format is really supported here.
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx'
keywords = ['否决', '无效投标', '被拒绝', '予以拒绝']
follow_up_keywords = ['情形之一']
extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords)
# Writing to file and handling duplicates
output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    for content in extracted_contents:
        file.write(content + '\n')
# file_id = upload_file(output_file)
# user_query="根据文本中的内容请你回答否决投标和拒绝投标的情况有哪些由于文本中存在冗余且无关的信息且文本中的序号是混乱的请你重新编排答案返回给我以json格式返回你的回答仅限于原文内容但删去原有序号请不要自己组织语言。"
# res=qianwen_long(file_id,user_query)
# print("Query Result:", res)

View File

@ -0,0 +1,170 @@
import re
import json
import time
from 多线程提问 import multi_threading
from 根据条款号整合json import process_and_merge_entries
from 通义千问long import qianwen_long
from json_utils import extract_content_from_json
# Prompt template sent verbatim to the assistant (kept in Chinese on purpose;
# ${document1} is substituted by the RAG pipeline).
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
## 技能
### 技能 1文档解析与摘要
- 深入理解并分析${document1}的内容提取关键信息
- 根据需求生成简洁明了的摘要保持原文核心意义不变
### 技能 2信息检索与关联
- ${document1}中高效检索特定信息或关键词
- 能够识别并链接到文档内部或外部的相关内容增强信息的连贯性和深度
## 限制
- 所有操作均需基于${document1}的内容不可超出此范围创造信息
- 在处理敏感或机密信息时需遵守严格的隐私和安全规定
- 确保所有生成或改编的内容逻辑连贯无误导性信息
请注意上述技能执行时将直接利用并参考${document1}的具体内容以确保所有产出紧密相关且高质量
"""
def extract_matching_keys(json_data):
    """Collect leaf entries whose value points at another clause
    (e.g. "符合第...款 规定").

    Accepts either a dict or a JSON string; returns a list of one-item dicts
    keyed by the dot-joined path of each matching leaf.
    """
    # Parse a JSON string into a dict when necessary.
    data = json.loads(json_data) if isinstance(json_data, str) else json_data
    # Clause-reference patterns that select a value.
    include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目")]
    additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
    # Key/value fragments that disqualify an entry.
    exclude_patterns = ["投标文件格式", "权利义务", "技术标准", "工程量清单"]
    additional_exclude_patterns = ["按要求", "按规定"]
    final_matching = []

    def _leaf_matches(key, value):
        # Hard excludes first, then the clause-reference patterns, then the
        # softer exclude/include refinements.
        if any(word in key or word in value for word in exclude_patterns):
            return False
        if not any(p.search(value) for p in include_patterns):
            return False
        if any(word in key or word in value for word in additional_exclude_patterns):
            return False
        return any(p.search(value) for p in additional_include_patterns)

    def _walk(node, path):
        if isinstance(node, dict):
            for key, value in node.items():
                if isinstance(value, (dict, list)):
                    _walk(value, path + [key])
                elif _leaf_matches(key, str(value)):
                    # Record with a dot-joined path for nested keys.
                    final_matching.append({".".join(path + [key]): str(value)})
        elif isinstance(node, list):
            for item in node:
                _walk(item, path)

    _walk(data, [])
    return final_matching
def reformat_questions(match_keys):
    """Split matched review entries into clause references and LLM questions.

    Entries whose value carries a clause number like 3.7.4 or 3.7.4(5) (with
    full- or half-width parentheses) are returned as {path: number}; the rest
    are turned into formatted question strings for the LLM.

    Returns:
        (entries_with_numbers, formatted_questions)
    """
    entries_with_numbers = []
    formatted_questions = []
    # Clause number with an optional parenthesised sub-item; accepts both
    # half-width () and full-width （） brackets.  The previous pattern
    # contained an unterminated character class ("[\(\](...") and raised
    # re.error as soon as it was compiled.
    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[(（](\d+)[)）])?')
    for entry in match_keys:
        key, value = next(iter(entry.items()))
        match = pattern.search(value)
        if match:
            # Normalise to "3.7.4" or "3.7.4(5)".
            num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
            entries_with_numbers.append({key: num})
        else:
            # No clause number: strip "符合" and build a question for the LLM.
            revised_standard = re.sub(r'符合', '', value)
            formatted_entry = f"关于‘{key}’{revised_standard}的内容是怎样的请按json格式给我提供信息键名为'{key}',如果存在未知信息,请在对应键值处填'未知'"
            formatted_questions.append(formatted_entry)
    return entries_with_numbers, formatted_questions
def update_json_data(original_data, updates, second_response_list):
    """
    Overwrite values in a nested JSON dict from flat, dot-notation updates.

    参数:
    - original_data: dict, the original JSON data (mutated in place)
    - updates: dict, dotted-key -> value pairs to apply
    - second_response_list: list of dicts with further dotted-key updates
    返回:
    - the same dict, after all updates have been applied
    """
    def _apply(target, dotted_key, value):
        # Walk/create the intermediate dicts named by the dotted key.
        *parents, leaf = dotted_key.split('.')
        for part in parents:
            target = target.setdefault(part, {})
        existing = target.get(leaf, None)
        if isinstance(value, dict) and isinstance(existing, dict):
            # Merge dict-valued updates instead of replacing wholesale.
            target[leaf] = {**existing, **value}
        else:
            target[leaf] = value

    for dotted_key, value in updates.items():
        _apply(original_data, dotted_key, value)
    # Each response dict from the LLM is merged the same way.
    for response_dict in second_response_list:
        for dotted_key, value in response_dict.items():
            _apply(original_data, dotted_key, value)
    return original_data
def process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_json_path):
    """Resolve the '符合...规定' placeholders of a review dict.

    Entries that reference a numbered clause are resolved from the local
    clause JSON files; the remaining entries are answered by the knowledge
    base, and everything is merged back into the original dict.
    """
    # e.g. [{'形式评审标准.投标文件签字盖章': '符合第二章"投标人须知"第 3.7.34目 规定'}, ...]
    matched_keys = extract_matching_keys(original_dict_data)
    entries_with_numbers, formatted_questions = reformat_questions(matched_keys)
    # Entries without a clause number are asked to the LLM directly.
    results_2 = multi_threading(formatted_questions, knowledge_name, True)
    second_response_list = []
    for query_index, response in results_2:
        try:
            # A usable response has at least two elements; [1] holds the answer.
            if response and len(response) > 1:
                second_response_list.append(extract_content_from_json(response[1]))
            else:
                print(f"Warning: Missing or incomplete response data for query index {query_index}.")
        except Exception as e:
            print(f"Error processing response for query index {query_index}: {e}")
    # Requirements for numbered entries, extracted by the local script.
    combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path)
    return update_json_data(original_dict_data, combined_results, second_response_list)
if __name__ == "__main__":
    start_time = time.time()
    knowledge_name = "zbfile"
    truncate_tobidders_table_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output\\clause.json"
    original_dict_data = {'营业执照': '具备有效的营业执照', '资质条件': '符合第二章“投标人须知”第 1.4.1项规定', '财务状况': '符合第二章“投标人须知”第 1.4.1项规定', '类似业绩': '符合第二章“投标人须知”第 1.4.1项规定', '信誉': '符合第二章“投标人须知”第 1.4.1项规定', '项目经理资格': '符合第二章“投标人须知”第 1.4.1项规定', '设计负责人资格': '符合第二章“投标人须知”第 1.4.1项规定', '施工负责人资格': '符合第二章“投标人须知”第 1.4.1项规定', '施工机械设备': '符合第二章“投标人须知”第 1.4.1项规定', '项目管理机构及人员': '符合第二章“投标人须知”第 1.4.1项规定', '其他要求': '符合第二章“投标人须知”第 1.4.1项规定', '联合体投投人 (如有)': '符合第二章“投标人须知”第 1.4.2项规定', '不存在禁止投标的情形': '不存在第二章“投标人须知”第 1.4.3项规 定的任何一种情形'}
    formal_json = process_reviews(original_dict_data, knowledge_name, truncate_tobidders_table_json_path, clause_path)
    # BUGFIX: process_reviews returns a dict, not a JSON string, so the old
    # `json.loads(formal_json)` raised TypeError. Serialize for display instead.
    print(json.dumps(formal_json, ensure_ascii=False, indent=4))
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Function execution took {elapsed_time} seconds.")

149
flask_app/main/截取pdf.py Normal file
View File

@ -0,0 +1,149 @@
from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
def clean_page_numbers(text):
    """Strip page-number artifacts from one page of extracted PDF text."""
    # Leading page number: digits at the very start, kept only when NOT
    # followed by another digit so real numeric content is left alone.
    without_leading = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
    # Trailing page number at the end of the page.
    without_trailing = re.sub(r'\s+\d+\s*$', '', without_leading)
    # Inline "/129"-style page markers anywhere in the text.
    return re.sub(r'\s*\/\s*\d+\s*', '', without_trailing)
def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix):
    """Cut the page range delimited by chapter_pattern / end_phrase_pattern
    out of pdf_path and save it as <name>_<output_suffix>.pdf.

    Returns the output path, or None when either boundary cannot be found.
    """
    # Open the source PDF.
    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None
    qualification_pattern = re.compile(r'资格审查|资质条件|能力')
    # Walk every page looking for the start and end markers.
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_numbers(text)
            if re.search(chapter_pattern, cleaned_text) and i > begin_page:
                # For "invalid" extraction keep the FIRST matching page only:
                # the tender number may appear repeatedly and later matches
                # must not overwrite the recorded start page.
                if output_suffix == "invalid" and start_page:
                    continue
                if output_suffix == "qualification" and not re.search(qualification_pattern, cleaned_text):
                    # Qualification section requested but this page lacks the
                    # qualification keywords -- ignore it.
                    pass
                else:
                    start_page = i
            if start_page is not None and re.search(end_phrase_pattern, cleaned_text) and i > (start_page+1):
                # The "invalid" section is only allowed to end after page 30,
                # presumably to skip table-of-contents matches -- TODO confirm.
                is_invalid_condition = output_suffix == "invalid" and i > 30
                if is_invalid_condition or output_suffix != "invalid":
                    end_page = i
                    break
    # Both boundaries must have been located.
    if start_page is None or end_page is None:
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
        return None
    # Write the selected page range into a new PDF.
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # Get the base name without extension
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()
    # Copy pages start_page..end_page inclusive.
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    # Persist the new PDF.
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)
    print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
    return output_pdf_path
def process_input(input_path, output_folder, chapter_pattern, begin_page, end_phrases, output_suffix, selection):
    """Run extract_pages over a single PDF or every PDF in a folder.

    Returns the list of generated file paths (possibly empty); never None.
    """
    # Make sure the output folder exists.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Selections 3/4/5 pass raw regex fragments; the other selections pass
    # literal phrases and must be escaped before compilation.
    if selection in (3, 4, 5):
        end_phrase_pattern = re.compile('|'.join(end_phrases), re.MULTILINE)
    else:
        end_phrase_pattern = re.compile('|'.join(re.escape(phrase) for phrase in end_phrases))
    if os.path.isdir(input_path):
        generated_files = []
        # Process every PDF inside the folder.
        for file in os.listdir(input_path):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(input_path, file)
                output_pdf_path = extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix)
                if output_pdf_path and os.path.isfile(output_pdf_path):
                    generated_files.append(output_pdf_path)
        return generated_files
    elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
        # Single PDF file.
        output_pdf_path = extract_pages(input_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix)
        if output_pdf_path and os.path.isfile(output_pdf_path):
            return [output_pdf_path]  # Wrap in a list for a consistent return type.
        # BUGFIX: this branch previously fell through and returned None when
        # extraction failed; return [] like every other failure path.
        return []
    else:
        print("提供的路径既不是文件夹也不是PDF文件。")
        return []
def truncate_pdf_main(input_path, output_folder, selection):
    """Extract one named section of a tender PDF, chosen by `selection`.

    1=投标人须知前附表, 2=评标办法, 3=投标人须知正文, 4=招标公告..合同前(invalid),
    5=资格审查附录(qualification). Returns process_input's result, or None
    for an unknown selection.
    """
    # Shared by selection 5: appendix/attachment headings (both colon widths).
    appendix_pattern = r'^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]'
    # selection -> (chapter pattern, first page to consider, end phrases, suffix)
    configs = {
        1: (re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知'),
            3,
            ["投标人须知正文"],
            "tobidders_notice_table"),
        2: (re.compile(r'第[一二三四五六七八九十]+章\s*评标办法'),
            10,
            ["评标办法正文", "评标办法"],
            "evaluation_method"),
        3: (re.compile(r'投标人须知正文'),
            5,
            [
                r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
            ],
            "tobidders_notice"),
        4: (re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:'),
            0,
            [
                r'第[一二三四五六七八九十]+章\s*合同',
                r':清标报告',  # newly added end marker
                r':清标报告',
            ],
            "invalid"),
        5: (re.compile(appendix_pattern),
            5,
            [r'评标办法正文', r'评标办法', appendix_pattern],
            "qualification"),
    }
    if selection not in configs:
        print("无效的选择")
        return None
    pattern, begin_page, end_phrases, output_suffix = configs[selection]
    return process_input(input_path, output_folder, pattern, begin_page, end_phrases, output_suffix, selection)
def truncate_pdf_multiple(input_path, output_folder):
    """Run every section extraction (selections 1-5) on input_path and
    return the combined list of generated file paths."""
    truncate_files = []
    for selection in range(1, 6):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # BUGFIX: truncate_pdf_main (via process_input's single-file failure
        # path) could return None; extending with None raised TypeError.
        if files:
            truncate_files.extend(files)
    return truncate_files
if __name__ == "__main__":
    # Ad-hoc manual test: extract every section from one sample tender PDF.
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest5.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
    truncate_pdf_multiple(input_path,output_folder)
    # Single-section variant kept for reference:
    # selection = 5  # e.g. 1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文, 4 - 招标公告..合同条款前
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("生成的文件:", generated_files)

View File

@ -0,0 +1,120 @@
import json
import re
# 定义查找与目标值匹配的键的函数
def find_keys_by_value(target_value, json_data):
    """Return keys whose value equals target_value; when none match exactly,
    fall back to keys whose string value starts with target_value."""
    exact = [key for key, val in json_data.items() if val == target_value]
    if exact:
        return exact
    return [key for key, val in json_data.items()
            if isinstance(val, str) and val.startswith(target_value)]
# 定义查找以特定前缀开始的键的函数
def find_keys_with_prefix(key_prefix, json_data):
    """Return all keys starting with key_prefix (the heading itself plus its
    sub-headings), preserving the dict's insertion order."""
    return [key for key in json_data if key.startswith(key_prefix)]
# 从文件中读取JSON数据并提取特定内容
def extract_json(data, target_values):
    """Collect every heading matching one of target_values, together with all
    of its sub-headings (and their parent entries), from a flat clause dict."""
    results = {}
    for target in target_values:
        for matched_key in find_keys_by_value(target, data):
            for subkey in find_keys_with_prefix(matched_key, data):
                if "." not in subkey:
                    continue
                parent_key = subkey.rsplit('.', 1)[0]
                top_level_key = parent_key.split('.')[0] + '.'
                # Special-case 定标: record its top-level heading exactly once
                # so other keys are not duplicated.
                if target == "定标" and top_level_key not in results:
                    results[top_level_key] = "定标"
                # Bring in the parent heading (when present) before the child.
                if parent_key not in results and parent_key in data:
                    results[parent_key] = data[parent_key]
                results[subkey] = data[subkey]
    return results
# 转换结构化的JSON数据
def transform_json(data):
    """Convert a flat {'1.2.3': 'text'} clause dict into a nested structure.

    `temp` maps a heading depth to the container currently open at that
    depth; `result` is the depth-0 root. A key with ALL three number groups
    present is a leaf; a key with fewer groups opens a new sub-container.
    """
    result = {}
    temp = {0: result}  # depth -> currently open container at that depth
    for key, value in data.items():
        # Capture up to three numeric levels of the heading number.
        match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', key)
        if match:
            levels = [int(l) for l in match.groups() if l is not None]
            # Parent container is whatever is open one level up.
            parent = temp[len(levels) - 1]
            if len(levels) == len(match.groups()):
                # Deepest level (x.y.z): treat the value as a leaf.
                if isinstance(parent, list):
                    parent.append(value)
                else:
                    # No '\n' case: use the first space-separated word as key.
                    parent[value.split()[0]] = value
            else:
                new_key = value.split()[0]
                if '\n' in value and len(levels) == 2:
                    # Two-level heading whose value embeds a newline:
                    # split into key (first line) and value (rest).
                    new_key, new_value = value.split('\n', 1)
                    new_key = new_key.strip()
                    new_value = new_value.strip()
                    # Make sure the parent can take a key/value pair.
                    if isinstance(parent, list):
                        if len(parent) == 0 or not isinstance(parent[-1], dict):
                            parent.append({})
                        parent[-1][new_key] = new_value
                    else:
                        parent[new_key] = new_value
                else:
                    # Open a new sub-container under new_key and register it
                    # as the active container for the next depth.
                    if isinstance(parent, list):
                        if len(parent) == 0 or not isinstance(parent[-1], dict):
                            parent.append({})
                        parent = parent[-1]
                    if new_key not in parent:
                        parent[new_key] = []
                    temp[len(levels)] = parent[new_key]
    # Collapse single-item lists and turn empty lists into "".
    def remove_single_item_lists(node):
        if isinstance(node, dict):
            for key in list(node.keys()):
                node[key] = remove_single_item_lists(node[key])
                if isinstance(node[key], list) and not node[key]:
                    node[key] = ""  # empty list becomes empty string
        elif isinstance(node, list) and len(node) == 1:
            return remove_single_item_lists(node[0])
        return node
    return remove_single_item_lists(result)
# 读取JSON数据提取内容转换结构并打印结果
def extract_from_notice(file_path, type):
    """Load a flat clause JSON file and return the restructured sections for
    the requested category.

    :param file_path: path to the clause JSON (keys like "3.1" -> text).
    :param type: 1 = 投标文件/投标, 2 = 开标/评标/定标,
        3 = 重新招标/不再招标/终止招标.
    :raises ValueError: when type is not 1, 2 or 3.
    """
    if type == 1:
        target_values = ["投标文件", "投标"]
    elif type == 2:
        target_values = ["开标", "评标", "定标"]
    elif type == 3:
        target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
    else:
        # BUGFIX: the old message described types 1 and 2 the wrong way
        # round and omitted type 3 entirely.
        raise ValueError("Invalid type specified. Use 1 for '投标文件, 投标', 2 for '开标, 评标, 定标' or 3 for '重新招标, 不再招标, 终止招标'.")
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    extracted_data = extract_json(data, target_values)  # flat matching entries
    transformed_data = transform_json(extracted_data)   # nested structure
    return transformed_data
# 假设原始数据文件路径
if __name__ == "__main__":
    # Manual test: change the `type` argument to exercise the other cases.
    file_path = 'clause2.json'
    try:
        res = extract_from_notice(file_path, 3)
        print(res)
    except ValueError as e:
        print(e)

View File

@ -0,0 +1,152 @@
import json
import docx
import fitz
import re
import os
def extract_text_from_docx(file_path):
    """Return the text of every paragraph in the .docx file, newline-joined."""
    paragraphs = docx.Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
def clean_page_numbers(text):
    """Remove page-number artifacts from one page of extracted text."""
    # Leading page number (digits followed by whitespace at the start).
    cleaned = re.sub(r'^\s*\d+\s+', '', text)
    # Trailing page number (whitespace then digits at the end).
    cleaned = re.sub(r'\s+\d+\s*$', '', cleaned)
    # "/123" or "123/" style markers anywhere in the text.
    return re.sub(r'\s*/\s*\d+\s*|\s*\d+\s*/\s*', '', cleaned)
def extract_text_from_pdf(file_path):
    """Concatenate the page-number-cleaned text of every page in the PDF."""
    document = fitz.open(file_path)
    return "".join(clean_page_numbers(page.get_text()) for page in document)
def extract_section(text, start_keyword, end_phrases):
    """Return the slice of text from start_keyword up to (excluding) the
    first end phrase found after it; "" when start_keyword is absent."""
    begin = text.find(start_keyword)
    if begin == -1:
        return ""
    finish = len(text)
    tail = text[begin:]
    for phrase in end_phrases:
        hit = re.search(phrase, tail)
        if hit:
            # End offsets are relative to `tail`; shift back to `text`.
            finish = begin + hit.start()
            break
    return text[begin:finish]
def compare_headings(current, new):
    """Return True when `new` is a later (or deeper) section number than
    `current`, comparing the dotted numeric parts position by position."""
    def _parts(heading):
        # Keep only the purely numeric segments of the dotted heading.
        return [int(piece) for piece in heading.split('.') if piece.isdigit()]

    current_nums = _parts(current)
    new_nums = _parts(new)
    for old_part, new_part in zip(current_nums, new_nums):
        if new_part != old_part:
            return new_part > old_part
    # All shared positions equal: a longer heading is a new sub-section.
    return len(new_nums) > len(current_nums)
def should_add_newline(content, keywords, max_length=20):
    """True when the accumulated content mentions any keyword or is short
    (at most max_length characters after joining and stripping)."""
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    return any(keyword in joined for keyword in keywords)
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append line_content to current_content; when the append_newline flag
    is set and the content qualifies, insert a '\\n' separator first.
    Returns the cleared flag value."""
    if append_newline and should_add_newline(current_content, keywords):
        current_content.append('\n')  # keep the heading separated from its body
    current_content.append(line_content)
    # The pending-newline flag is consumed either way.
    return False if append_newline else append_newline
#对二级标题如x.x进行额外处理如果当前处理内容包含keywords中的内容则必须保留换行符/如果当前内容字数大于20不保留换行。
def parse_text_by_heading(text):
    """Split extracted document text into a {heading-number: body} dict.

    Lines beginning with a dotted number ("1.1", "2.2.3") open a new entry
    when they advance past the current heading; all other lines are appended
    to the current entry via handle_content_append.
    """
    keywords = ['包含', '以下']  # content mentioning these keeps its newline
    data = {}
    current_key = None
    current_content = []
    append_newline = False
    lines = text.split('\n')
    for i, line in enumerate(lines):
        line_stripped = line.strip()
        # Dotted heading like '1.1' or '2.2.3', not preceded by a letter or '('.
        match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            # Fallback: single-level heading like '3.'.
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('.')
            # Open a new entry only when the heading advances.
            # NOTE(review): `current_content[-1][-1] != ''` is always True for
            # a non-empty last element (a single character never equals ''),
            # and raises IndexError when the last element is '' — this looks
            # like it was meant to test for '\n'; confirm intent.
            if current_key is None or (compare_headings(current_key, new_key) and (
                    len(current_content) == 0 or current_content[-1][-1] != '')):
                if current_key is not None:
                    # Flush the previous entry; spaces are stripped out.
                    content_string = ''.join(current_content).strip()
                    data[current_key] = content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                # Only two-level headings (e.g. 1.1) arm the newline flag.
                append_newline = len(new_key.split('.')) == 2
            else:
                append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
        else:
            if line_stripped:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
    if current_key is not None:
        # Flush the final entry.
        content_string = ''.join(current_content).strip()
        data[current_key] = content_string.replace(' ', '')
    return data
def convert_to_json(file_path, start_word, end_phrases):
    """Extract the section [start_word, first end phrase) from a .docx or
    .pdf file and parse it into a {heading-number: text} dict.

    :raises ValueError: for any other file extension.
    """
    if file_path.endswith('.docx'):
        raw_text = extract_text_from_docx(file_path)
    elif file_path.endswith('.pdf'):
        raw_text = extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file format")
    # Narrow the text to the requested section before parsing headings.
    section = extract_section(raw_text, start_word, end_phrases)
    return parse_text_by_heading(section)
def convert_clause_to_json(input_path, output_folder):
    """Parse the '投标人须知正文' section of input_path and dump the clause
    dict to <output_folder>/clause.json; returns the output path."""
    start_word = "投标人须知正文"
    # End markers listed in both full-width and half-width colon variants.
    end_phrases = [
        r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
        r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
    ]
    parsed = convert_to_json(input_path, start_word, end_phrases)
    output_path = os.path.join(output_folder, "clause.json")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed, f, indent=4, ensure_ascii=False)
    return output_path
if __name__ == "__main__":
    # Manual test against one extracted "投标人须知" PDF.
    file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\48e650ff-70eb-48df-874c-66c8abbcd89d\\ztbfile_tobidders_notice.pdf'
    # NOTE(review): start_word / end_phrases below are unused — the same
    # values are hard-coded inside convert_clause_to_json; confirm before removing.
    start_word = "投标人须知正文"
    end_phrases = [
        r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
        r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
    ]
    output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        print("Error:", e)

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,30 @@
import PyPDF2
import re # 导入正则表达式库
def clean_page_numbers(text):
    """Delete page numbers at the start or end of a page, and inline
    "/129"-style markers."""
    stripped = re.sub(r'^\s*\d+\s+', '', text)        # leading page number
    stripped = re.sub(r'\s+\d+\s*$', '', stripped)    # trailing page number
    return re.sub(r'\s*\/\s*\d+\s*', '', stripped)    # "/129" markers
def extract_text_by_page(file_path):
    """Return the concatenated, page-number-cleaned text of every PDF page;
    pages without extractable text are reported and skipped."""
    collected = ""
    with open(file_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        for page_num in range(len(reader.pages)):
            text = reader.pages[page_num].extract_text()
            if text:
                collected += clean_page_numbers(text)
            else:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
    return collected
if __name__ == '__main__':
    # Manual smoke test on one extracted notice-table PDF.
    file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile_tobidders_notice_table.pdf"
    extract_text_by_page(file_path)

View File

@ -0,0 +1,83 @@
import re
import PyPDF2
import json
def extract_key_value_pairs(text):
    """Pull "key <mark> value" selections out of one page of a bid-notice
    table, where <mark> is a checked-box character (\\x01, \\x02, ☑, √).

    Two passes: (1) inline "1.2 key☑value" pairs via findall; (2) lines that
    START with a check mark, whose key comes from the most recent line that
    carried a serial number.
    Returns a dict of cleaned key -> value (value cut at the first
    punctuation mark).
    """
    # (Removed a redundant function-local `import re`; re is imported at
    # module level.)
    # Pass 1: serial number, key, check mark, value on one run of text.
    pattern = r'\d+\.\d+\s*([\w\s\u4e00-\u9fff]+)[\x01\x02☑]([\w\s\u4e00-\u9fff]+)'
    matches = re.findall(pattern, text)
    results = {}
    for key, value in matches:
        # Drop any leading serial number from the key.
        key = re.sub(r'^\d+(\.\d+)*\s+', '', key)
        # Collapse whitespace and strip stray characters.
        # NOTE(review): the replace() targets below appear to be unprintable
        # characters lost in rendering — verify against the original file.
        cleaned_key = re.sub(r'\s+', '', key).replace('', '').replace('', '')
        # Cut the value at the first list/sentence punctuation mark.
        cleaned_value = re.split(r'[,。、,]', value)[0].strip()
        results[cleaned_key] = cleaned_value
    # Pass 2: check marks at the start of a line.
    lines = text.split('\n')
    previous_lines = []
    last_serial_key = ""
    for line in lines:
        if re.match(r'^[\x01\x02☑√]', line):
            # The value is this line minus the mark, cut at punctuation.
            value = re.sub(r'^[\x01\x02☑√]\s*', '', line)
            cleaned_value = re.split(r'[,。、,]', value)[0].strip()
            # The key comes from the last serial-numbered line seen.
            if last_serial_key:
                key = re.sub(r'^\d+(\.\d+)*\s+', '', last_serial_key)
                key = re.split(r'[\x01\x02□]', key)[0].strip()  # stop at the first box mark
                # Disambiguate duplicate keys by appending spaces.
                original_key = key
                count = 1
                while key in results:
                    key = f"{original_key}{' ' * count}"
                    count += 1
                results[key] = cleaned_value
        else:
            # Remember the latest serial-numbered line as the key candidate.
            if re.search(r'\d+\.\d+', line):
                last_serial_key = line
            previous_lines.append(line)
            if len(previous_lines) > 10:
                previous_lines.pop(0)
    return results
def read_pdf_and_judge_main(file_path, output_json_path):
    """Extract checked-box key/value pairs from every page of the PDF and
    write the merged result to output_json_path as UTF-8 JSON."""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        print(f"Total pages: {num_pages}")
        all_data = {}
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            # NOTE(review): extract_text() is called twice per page here;
            # caching the first call would halve the extraction work.
            text = page.extract_text() if page.extract_text() else ""
            # Drop a leading page number of the form "digits + whitespace".
            cleaned_text = re.sub(r'^\d+\s+', '', text)
            key_value_pairs = extract_key_value_pairs( cleaned_text)
            # Later pages overwrite duplicate keys from earlier pages.
            all_data.update(key_value_pairs)
    # Persist everything as one JSON file.
    with open(output_json_path, "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)
    print(f"Data extraction complete and saved to '{output_json_path}'.")
if __name__ == "__main__":
    # Example invocation against one extracted notice-table PDF.
    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_tobidders_notice_table.pdf'
    output_json_path = 'judge_exist.json'
    read_pdf_and_judge_main(file_path, output_json_path)

View File

@ -0,0 +1,350 @@
# -*- coding: utf-8 -*-
import os
import shutil
from PyPDF2 import PdfReader, PdfWriter
def validate_pdf(file_path):
    """Return True when file_path opens as a PDF with at least one page;
    any read/parse failure is reported and yields False."""
    try:
        with open(file_path, "rb") as handle:
            document = PdfReader(handle)
            return len(document.pages) > 0
    except Exception as exc:
        print(f"Error reading PDF {file_path}: {str(exc)}")
        return False
def truncate_pdf(source_path, target_path, max_pages=15):
    """Copy at most the first max_pages pages of source_path to target_path;
    failures are reported rather than raised."""
    try:
        with open(source_path, "rb") as infile:
            reader = PdfReader(infile)
            writer = PdfWriter()
            page_count = min(max_pages, len(reader.pages))
            for index in range(page_count):
                writer.add_page(reader.pages[index])
            with open(target_path, "wb") as outfile:
                writer.write(outfile)
    except Exception as exc:
        print(f"Error processing PDF {source_path}: {str(exc)}")
def copy_file(src, dest):
    """Copy a single file to dest (with metadata), creating dest's parent
    directory first when needed."""
    parent = os.path.dirname(dest)
    os.makedirs(parent, exist_ok=True)
    shutil.copy2(src, dest)
def copy_directory(source, destination):
    """Recursively mirror `source` into `destination`; files are copied with
    metadata, subdirectories are descended into."""
    os.makedirs(destination, exist_ok=True)
    for entry in os.listdir(source):
        entry_src = os.path.join(source, entry)
        entry_dest = os.path.join(destination, entry)
        if os.path.isfile(entry_src):
            copy_file(entry_src, entry_dest)
        else:
            copy_directory(entry_src, entry_dest)
def unique_file_name(base_path, file_name):
    """Return file_name unchanged, or "name_N.ext" for the smallest N >= 1
    that does not collide with an existing file in base_path."""
    stem, extension = os.path.splitext(file_name)
    candidate = file_name
    suffix = 1
    # Bump the numeric suffix until the name is free.
    while os.path.exists(os.path.join(base_path, candidate)):
        candidate = f"{stem}_{suffix}{extension}"
        suffix += 1
    return candidate
def process_pdf_folders(source_dir, target_dir):
    """Copy every valid PDF under source_dir into the mirrored location under
    target_dir; bottom-up traversal so emptied directories can be removed."""
    for root, dirs, files in os.walk(source_dir, topdown=False):
        for file in files:
            if file.lower().endswith('.pdf'):
                source_file_path = os.path.join(root, file)
                if validate_pdf(source_file_path):
                    # Preserve the directory structure relative to source_dir.
                    relative_path = os.path.relpath(root, source_dir)
                    target_file_dir = os.path.join(target_dir, relative_path)
                    target_file_path = os.path.join(target_file_dir, file)
                    copy_file(source_file_path, target_file_path)
                else:
                    # NOTE(review): the message claims deletion but the corrupt
                    # file is only skipped, never removed — confirm intent.
                    print(f"Deleted corrupt file: {source_file_path}")
        # Remove directories that are (now) empty.
        if not os.listdir(root):
            os.rmdir(root)
def classify_folders(source_dir, target_dir, project_index):
    """Classify the sub-folders of one project: the folder containing zb.pdf
    is the tender, "输出文件" is the evaluation output, and everything else is
    treated as bid folders; source_dir is removed afterwards."""
    temp_dir = os.path.join(source_dir, 'temp')
    os.makedirs(temp_dir, exist_ok=True)
    target_project_dir = None
    # BUGFIX: target_zb_name was previously unbound (UnboundLocalError) when
    # no folder contained zb.pdf; initialise both outputs of the tender step.
    target_zb_name = None
    processed_dirs = set()  # directories already handled below
    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue
        files = [f.lower() for f in os.listdir(subdir_path)]
        if 'zb.pdf' in files:
            target_project_dir, project_index, target_zb_name = process_tender_files(subdir_path, temp_dir, target_dir, project_index)
            processed_dirs.add(subdir_path)
        elif subdir.lower() == "输出文件":
            process_evaluation_files(subdir_path, target_project_dir)
            processed_dirs.add(subdir_path)
    # Remaining folders hold bid files; skip the ones already processed.
    process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name)
    # temp_dir is removed together with source_dir below.
    if os.path.exists(source_dir):
        shutil.rmtree(source_dir)
def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
    """Processes tender files and returns the target project directory and updated index.

    The first 30 pages of zb.pdf are parsed by the LLM to recover project
    name/number/buyer/category; the folder is then renamed and copied into
    <target_dir>/<category>/<name>_<number>/招标文件夹.
    Returns (target_project_dir, project_index, tender unit name).
    """
    zb_path = os.path.join(subdir_path, "zb.pdf")
    truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
    truncate_pdf(zb_path, truncated_zb_path, 30)  # Truncate to the first 30 pages
    bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1)
    project_index += 1
    zb_response = extract_answer(bot_response[1], 1)
    # NOTE(review): extract_answer returns [] when the reply does not match;
    # the indexing below would then raise IndexError — confirm upstream guarantees.
    zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') # '-' replaces slashes: directory names may not contain them
    target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]
    new_zb_path = os.path.join(subdir_path, target_zb_name)
    os.rename(zb_path, new_zb_path)
    target_category_dir = os.path.join(target_dir, category)
    target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
    copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))
    # Clean up the truncated temp file and the original folder.
    os.remove(truncated_zb_path)
    shutil.rmtree(subdir_path)
    return target_project_dir, project_index,zb_response[2]
def process_evaluation_files(subdir_path, target_project_dir):
    """Copy an evaluation ("输出文件") folder into the project's 评标文件夹
    and remove the source folder."""
    destination = os.path.join(target_project_dir, "评标文件夹")
    copy_directory(subdir_path, destination)
    shutil.rmtree(subdir_path)
def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
    """Processes remaining folders containing bid files.

    Each unprocessed folder is assumed to hold one bidder's PDFs: every file
    is truncated, classified by the LLM (bidder name + document category),
    renamed after its category, and the folder is copied into
    <project>/投标文件夹/<bidder name>.
    """
    target_tb_dir = os.path.join(target_project_dir, "投标文件夹")
    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
            continue  # Skip processed directories
        target_tbcom_dir = None  # Initialize outside the file loop
        for item in os.listdir(subdir_path):
            item_src_path = os.path.join(subdir_path, item)
            new_name = "truncate_" + item
            truncated_tb_path = os.path.join(temp_dir, new_name)
            truncate_pdf(item_src_path, truncated_tb_path)
            bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2)
            project_index += 1
            tb_response = extract_answer(bot_response[1], 2)
            if not tb_response:
                continue  # If there's no response, skip this file
            # The bidder directory is derived once, from the first usable file.
            # NOTE(review): when that first name equals the tender unit's name
            # or is "未知", target_tbcom_dir stays None and the folder is never
            # copied — confirm that is intended.
            if not target_tbcom_dir:
                target_tb_name, _ = tb_response
                if(target_tb_name != target_zb_name and target_tb_name != "未知"):
                    target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
                    print(target_tbcom_dir)
                    os.makedirs(target_tbcom_dir, exist_ok=True)
            # Rename the file after its category (商务文件/技术文件/报价文件).
            tb_section = tb_response[1] + ".pdf"
            new_tb_path = os.path.join(subdir_path, tb_section)
            # Resolve name collisions with a numeric suffix.
            new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
            os.rename(item_src_path, new_tb_path)
            # Remove temporary truncated file
            os.remove(truncated_tb_path)
        # Copy the whole directory at once after all files have been processed and renamed
        if target_tbcom_dir:
            copy_directory(subdir_path, target_tbcom_dir)
            shutil.rmtree(subdir_path)  # Optionally remove the original directory after copying
import re
import re
def extract_answer(input_string, type):
    """Parse an LLM reply into a list of fields.

    type 1 expects 项目名称/项目编号/招标人/类别 -> 4 items; type 2 expects
    投标人/类别 -> 2 items, normalising any "投标函" category to "商务文件".
    Returns [] when the reply does not match the expected layout.
    """
    # Pattern 1: project name / number / buyer / category, each value
    # followed by \s* to tolerate trailing whitespace.
    pattern1 = r"项目名称[:]\s*(.*?)[;]\s*项目编号[:]\s*(.*?)[;]\s*招标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
    # Pattern 2: bidder / category.
    pattern2 = r"投标人[:]\s*(.*?)[;]\s*类别[:]\s*([^。;]*).*"
    if type == 1:
        hit = re.search(pattern1, input_string)
        if not hit:
            print("No match found for type 1.")
            return []
        fields = [hit.group(idx).strip() for idx in range(1, 5)]
        print(f"1: {fields[0]} 2: {fields[1]} 3: {fields[2]} 4: {fields[3]}")
        return fields
    elif type == 2:
        hit = re.search(pattern2, input_string)
        if not hit:
            print("No match found for type 2.")
            return []
        category = hit.group(2).strip()
        # A "投标函" reply is filed as the commercial document.
        if "投标函" in category:
            category = "商务文件"
        return [hit.group(1).strip(), category]
from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from dashscope import Assistants, Messages, Runs, Threads
def send_message(assistant, index,documents,message='百炼是什么?'):
    """Run one query against the DashScope assistant and return every reply
    text (oldest first); the temporary documents are removed from the
    knowledge index afterwards."""
    print(f"Query: {message}")
    # Create a conversation thread.
    thread = Threads.create()
    print(thread)
    # Post the user message into the thread.
    message = Messages.create(thread.id, content=message)
    # Start a run for this assistant.
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print("run:" + str(run))
    # Poll the run status.
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # Block until the run completes or requires action.
    run_status = Runs.wait(run.id, thread_id=thread.id)
    # print(run_status)
    # Re-fetch after waiting (would be the place to submit tool results).
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # verify_status_code(run_status)
    # Fetch the thread's messages.
    msgs = Messages.list(thread.id)
    # print(msgs)
    # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))
    ans = []
    print("运行结果:")
    # Messages arrive newest-first; reverse for chronological order.
    for message in msgs['data'][::-1]:
        ans.append(message['content'][0]['text']['value'])
        print("content: ", message['content'][0]['text']['value'])
        print("\n")
    # Drop the temporary documents from the knowledge index.
    deleteFileFromKnowledge(index,documents)
    return ans
def file_parse(filepath, knowledge_index, type):
    """Upload one truncated PDF into the temporary knowledge base and ask the
    LLM to classify it.

    :param filepath: path of the truncated PDF to parse.
    :param knowledge_index: unique label for this upload (e.g. "file_3").
    :param type: 1 = tender document questions, 2 = bid document questions.
    :return: list of reply texts from send_message.
    """
    # Parse the document via DashScope DocMind and index it.
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    index = DashScopeCloudIndex("文件分类临时知识库")
    index._insert(documents)
    retriever = index.as_retriever()
    pipeline_id = str(retriever.pipeline_id)
    # Assistant wired to the RAG pipeline of the temporary knowledge base.
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }
                        }
                    }
                }
            }]
    )
    # Prompt for tender documents (type 1): name, number, buyer, category.
    questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方包括原材料产品设备电能和固态液态气态物体等服务类招标又称为服务采购指的是除了工程和货之外的其他招标投标活动物招标投标范围包含建设工程的勘察设计监理工程咨询评估科技项目科研课题物业管理金融保险服务等\
请按下列格式给我提供信息,避免无关内容:项目名称:XXX;项目编号:XXX;招标人:XXX;类别XXX"
    # Prompt for bid documents (type 2): bidder name and document category.
    questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX"
    if (type == 1):
        questions = questions1
    elif (type == 2):
        questions = questions2
    return send_message(assistant,index,documents, message=questions)
def deleteFileFromKnowledge(index, documents):
    """Remove the given documents from the knowledge-base index, identified
    by their `id_` attributes."""
    file_ids = []
    # Only a non-empty list of documents yields ids to collect.
    if isinstance(documents, list) and documents:
        for document in documents:
            # getattr guards against documents that lack an id_ attribute.
            doc_id = getattr(document, 'id_', None)
            if doc_id:
                file_ids.append(doc_id)
    index.delete_ref_doc(file_ids)
def process_directory(base_directory, intermediate_directory, final_directory):
    """Process every project folder under base_directory one at a time,
    staging each through intermediate_directory into final_directory."""
    for project_name in os.listdir(base_directory):
        project_path = os.path.join(base_directory, project_name)
        if not os.path.isdir(project_path):
            continue
        # Validate/copy PDFs, then classify them into the final layout.
        process_pdf_folders(project_path, intermediate_directory)
        classify_folders(intermediate_directory, final_directory, 0)
        # Reset the scratch directory for the next project.
        if os.path.exists(intermediate_directory):
            shutil.rmtree(intermediate_directory)
def main(base_directory, intermediate_directory, final_directory):
    """Ensure the scratch and output directories exist, then process every
    project under base_directory."""
    for directory in (intermediate_directory, final_directory):
        os.makedirs(directory, exist_ok=True)
    process_directory(base_directory, intermediate_directory, final_directory)
    # TODO(owner): parallelise the per-project processing with threads later.
if __name__ == "__main__":
    # Batch entry point: classify a multi-level directory of tender projects.
    base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
    intermediate_directory = 'D:\\tmp'
    final_directory = 'D:\\output'
    main(base_directory, intermediate_directory, final_directory)  # multi-level directory tree
    # Single-level variant kept for reference:
    # process_pdf_folders(base_directory, intermediate_directory)
    # classify_folders(intermediate_directory, final_directory, 0)

View File

@ -0,0 +1,302 @@
# -*- coding: utf-8 -*-
import json
import os.path
import time
import re
from json_utils import combine_json_results, nest_json_under_key
from 通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
from 禁止投标情形 import find_forbidden
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    """Collect keyword-anchored text blocks from a .docx file.

    A paragraph matching one of `keywords` starts a block. If it also
    matches a follow-up keyword, subsequent paragraphs are collected until a
    paragraph with the same section-number style appears: when the current
    paragraph has a number, that style is derived from it; otherwise it is
    derived from the next numbered paragraph (and those paragraphs are
    collected eagerly in a look-ahead loop).
    Returns an OrderedDict of anchor paragraph -> list of collected texts.
    """
    from collections import OrderedDict
    from docx import Document
    import re
    doc = Document(doc_path)
    extracted_paragraphs = OrderedDict()  # preserves document order
    continue_collecting = False           # inside a follow-up block?
    current_section_pattern = None        # regex that terminates the block
    active_key = None                     # anchor text currently collecting
    def match_keywords(text, patterns):
        """True when any pattern matches the text (case-insensitive)."""
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
    def extract_from_text(text, index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":  # Skip empty lines
            return
        if continue_collecting:
            # A paragraph matching the tracked numbering style closes the block.
            if current_section_pattern and re.match(current_section_pattern, text):
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
        if match_keywords(text, keywords):
            # This paragraph becomes the anchor of a new block.
            active_key = text
            extracted_paragraphs[active_key] = [text]
            # Follow-up keywords trigger multi-paragraph collection.
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                # Derive the terminating numbering style from this paragraph.
                section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text)
                if section_number:
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')
                    # Same dotted depth, e.g. "\d+\.\d+" for "3.7".
                    pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b'
                    current_section_pattern = re.compile(pattern)
                else:
                    # No number on the anchor: look ahead for the next
                    # numbered paragraph and eagerly collect its peers.
                    # NOTE(review): this inner loop appends matching
                    # paragraphs immediately, so they may be seen again by
                    # the outer loop — confirm duplicates are acceptable.
                    found_next_number = False
                    current_section_pattern = None
                    for next_index in range(index + 1, len(doc.paragraphs)):
                        next_text = doc.paragraphs[next_index].text.strip()
                        if not found_next_number:
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                            if next_section_number:
                                found_next_number = True
                                section_parts = next_section_number.group(1).split('.')
                                dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)  # keep collecting
    for index, para in enumerate(doc.paragraphs):
        extract_from_text(para.text.strip(), index)
    return extracted_paragraphs  # anchor keyword -> list of related texts
def clean_dict_datas(extracted_contents, keywords, excludes):
    """
    Normalise the blocks returned by extract_text_with_keywords.

    Single-paragraph blocks are trimmed to the sentence containing the
    keyword and returned in all_texts1; multi-paragraph enumerations are
    joined verbatim and returned in all_texts2.

    :param extracted_contents: dict of key -> list of paragraph texts
    :param keywords: regex used to locate the keyword inside a paragraph
    :param excludes: substrings whose presence discards a single paragraph
    :return: (all_texts1, all_texts2)
    """
    all_texts1 = []
    all_texts2 = []
    # Sentence delimiter: full-width and ASCII terminal punctuation.
    split_pattern = r'(?<=[。!?\!\?])'
    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # Drop paragraphs containing any excluded marker.
                if any(exclude in data for exclude in excludes):
                    continue
                # Strip the leading enumerator, e.g. "(1)", "A1.", "2、".
                pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # Keep everything up to the keyword plus the first
                    # sentence that starts at the keyword.
                    start_pos = keyword_match.start()
                    substring = data[start_pos:]
                    sentences = re.split(split_pattern, substring, 1)
                    if len(sentences) > 0 and sentences[0]:
                        cleaned_text = data[:start_pos] + sentences[0]
                    else:
                        cleaned_text = data  # no terminal punctuation: keep all
                else:
                    cleaned_text = data  # keyword vanished after trimming: keep all
                all_texts1.append(cleaned_text)
        else:
            # Multi-paragraph enumeration: strip the enumerator from the
            # heading only, then join the whole block with newlines.
            pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
            data = re.sub(pattern, '', text_list[0]).strip()
            text_list[0] = data  # NOTE: mutates the caller's list in place
            joined_text = "\n".join(text_list)
            all_texts2.append(joined_text)
    return all_texts1, all_texts2
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
    """
    Recursively walk *data* (dict/list/str) and collect sentences that
    match *keywords*.

    Sentences that additionally match a follow-up phrase ("one of the
    following situations" etc.) are extended with the sentences after them,
    up to the next "x.y[.z]" section number.

    :param data: nested JSON-like structure whose leaves are strings
    :param keywords: regex matched against each sentence
    :param follow_up_keywords: regexes marking an enumeration to capture
    :return: (sentences1, sentences2) — plain matches vs. enumerations
    """
    sentences1 = []  # matches without a follow-up enumeration
    sentences2 = []  # matches with a follow-up enumeration
    if isinstance(data, dict):
        for value in data.values():
            result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, list):
        for item in data:
            result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, str):
        # Split on sentence-ending punctuation, keeping the delimiter.
        split_sentences = re.split(r'(?<=[。!?\!\?])', data)
        i = 0
        while i < len(split_sentences):
            sentence = split_sentences[i]
            if re.search(keywords, sentence, re.IGNORECASE):
                follow_up_present = any(
                    re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords)
                if follow_up_present:
                    # Capture everything until the next numbered section.
                    start_index = i
                    end_index = start_index
                    found_next_section = False
                    for j in range(start_index + 1, len(split_sentences)):
                        if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()):
                            end_index = j
                            found_next_section = True
                            break
                    if found_next_section:
                        full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                    else:
                        full_text = ' '.join(split_sentences[start_index:]).strip()
                    # Strip the leading enumerator, e.g. "(1)", "A1.", "2、".
                    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', full_text)
                    sentences2.append(data)  # enumeration captured
                    # Jump past the consumed sentences.
                    i = end_index if found_next_section else len(split_sentences)
                else:
                    pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                    data = re.sub(pattern, '', sentence).replace('\n', '').strip()
                    sentences1.append(data)  # plain single-sentence match
                    i += 1
            else:
                i += 1
    return sentences1, sentences2
def extract_sentences_from_json(json_path, keywords, follow_up_keywords):
    """Load the JSON file at *json_path* and extract keyword sentences from it."""
    with open(json_path, 'r', encoding='utf-8') as handle:
        loaded = json.load(handle)
    return find_sentences_with_keywords(loaded, keywords, follow_up_keywords)
#处理无效投标
def extract_values_if_contains(data, includes):
    """
    Recursively collect every string value in *data* that contains any
    keyword from *includes*.

    Fix: the original dict branch only recursed into dict values and
    inspected string values, silently skipping list values nested under a
    dict key (e.g. {"a": ["x", ...]}); the traversal is now uniform, so
    strings are found at any depth.

    :param data: dict/list/str structure, e.g. parsed from JSON
    :param includes: list of keywords to look for
    :return: list of matching string values, in traversal order
    """
    included_values = []  # result accumulator

    def recursive_search(current_data):
        if isinstance(current_data, dict):
            for value in current_data.values():
                recursive_search(value)
        elif isinstance(current_data, list):
            for item in current_data:
                recursive_search(item)
        elif isinstance(current_data, str):
            # Leaf string: keep it when any keyword occurs in it.
            if any(include in current_data for include in includes):
                included_values.append(current_data)

    recursive_search(data)
    return included_values
#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。
#以上是原文内容,文本内的信息以'...............'分割请你根据该信息回答否决投标或拒绝投标或无效投标或使投标失效的情况有哪些文本中可能存在无关的信息请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。",
def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path):
    """
    Extract candidate "invalid/rejected bid" snippets from the docx and the
    truncated-table JSON, write the numbered candidates to *output_file*,
    ask qianwen-long to pick the relevant numbers, and return the selected
    snippets (plus all enumeration blocks) under *result_key*.

    :param file_path: source .docx tender document
    :param user_query: prompt asking the model for matching item numbers
    :param output_file: scratch text file uploaded to the model
    :param result_key: key of the returned one-entry dict
    :param keywords: regex for the clause type being collected
    :param truncate_json_path: JSON with table contents to scan as well
    :return: {result_key: list-of-snippets} or {result_key: ""} when empty
    """
    excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
    # NOTE(review): these patterns look truncated (e.g. "形之一" for
    # "情形之一") — possibly full-width characters lost in transcoding; confirm.
    follow_up_keywords = [r'\s*形\s*之\s*一', r'\s*况\s*之\s*一', r'\s*列', r'\s*下']
    extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)  # dict of blocks
    all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
    all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)
    qianwen_txt = all_texts1 + all_tables1  # single-sentence candidates for the model
    # Proceed only if there is content to write.
    if qianwen_txt:
        with open(output_file, 'w', encoding='utf-8') as file:
            counter = 1  # 1-based numbering the model refers back to
            for content in qianwen_txt:
                file.write("..............."+'\n')  # separator the prompt describes
                file.write(f"{counter}. {content}\n")
                counter += 1
        file_id = upload_file(output_file)
        print("starting qianwen-long...")
        qianwen_ans = qianwen_long(file_id, user_query)
        selected_contents = []
        # The model answers with a JSON list of numbers, e.g. [1, 3, 4].
        num_list = json.loads(qianwen_ans)
        print(num_list)
        for index in num_list:
            if index - 1 < len(qianwen_txt):  # guard against out-of-range numbers
                content = qianwen_txt[index - 1]  # 1-based number -> 0-based index
                selected_contents.append(content)
        # Enumeration blocks are always kept, without asking the model.
        selected_contents += all_texts2
        selected_contents += all_tables2
        res = {result_key: selected_contents}
        # os.remove(output_file)  # Remove the file after use
        # print(f"Deleted temporary file: {output_file}")
    else:
        res = {result_key: ""}  # nothing was extracted
    return res
def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate4):
    """
    Run the "invalid bid" and "void bid" extraction queries in parallel,
    append the "forbidden situations" result and nest everything under
    one top-level key.

    :param file_path: source .docx tender document
    :param output_dir: directory for the scratch files sent to qianwen
    :param truncate_json_path: truncated-table JSON of the tender
    :param clause_path: clause JSON of the tender
    :param truncate4: qualification-review PDF
    :return: {"无效标与废标项": merged results}
    """
    print("starting无效标与废标...")
    # NOTE(review): the leading characters of these keyword regexes look
    # truncated (e.g. "\s*决" for "否\s*决", "\s*标" for "废\s*标") —
    # possibly lost in transcoding; confirm against the original file.
    queries = [
        (r'\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。",
         os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
        (r'\s*标',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号。",
         os.path.join(output_dir, "temp2.txt"), "废标项")
    ]
    results = []
    # Run both handle_query calls concurrently.
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords,
                                     truncate_json_path)
            futures.append(future)
            time.sleep(1)  # stagger submissions by one second (API pacing)
        for future in futures:
            results.append(future.result())
    # Add the "situations a bidder must not be in" extraction.
    forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate4)
    results.append(forbidden_res)
    combined_dict = {}
    for d in results:
        combined_dict.update(d)
    print("无效标与废标done...")
    return nest_json_under_key(combined_dict, "无效标与废标项")
#TODO:1.运行时间约80s如果成为短板需要优化多线程 2.没有提取评标办法前附表中的表格 3.提取表格时根据中文的句号分割 4.qianwen-long存在bug
if __name__ == '__main__':
    # Local smoke test: run the full invalid/void-bid extraction and time it.
    start_time = time.time()
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'
    results = combine_find_invalid(doc_path, output_dir, truncate_json_path, clause_path, truncate4)
    end_time = time.time()
    print("Elapsed time:", str(end_time - start_time))
    print("Results:", results)

View File

@ -0,0 +1,133 @@
import json
def load_json(file_path):
    """Read a JSON file and normalise half-width brackets to full-width ones."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = json.load(handle)
    return standardize_brackets_in_json(raw)
def standardize_brackets_in_json(data):
    """Recursively rebuild *data*, converting brackets inside every string."""
    if isinstance(data, dict):
        return {key: standardize_brackets_in_json(val) for key, val in data.items()}
    if isinstance(data, list):
        return [standardize_brackets_in_json(item) for item in data]
    if isinstance(data, str):
        return standardize_brackets(data)
    return data  # numbers, booleans, None pass through untouched
def convert_dict_to_str(d):
    """Render a dict as 'key: value' lines; any other value via str()."""
    if not isinstance(d, dict):
        return str(d)
    return "\n".join(f"{key}: {val}" for key, val in d.items())
def find_entries_in_jsons(entries, json_primary, json_secondary):
    """
    Resolve each {label: clause-reference} entry against the two clause
    JSONs, preferring *json_primary* and falling back to *json_secondary*.

    :return: dict mapping label -> newline-joined resolved text (labels with
             no match are omitted)
    """
    results = {}
    for entry in entries:
        key, value = next(iter(entry.items()))
        collected = []
        # Primary source first; only consult the secondary on a miss.
        if not process_json_with_subentries(json_primary, value, collected):
            process_json_with_subentries(json_secondary, value, collected)
        if collected:
            results[key] = "\n".join(collected)
    return results
def process_json_with_subentries(json_data, value, combined_value):
    """
    Resolve a clause reference *value* against *json_data*, appending the
    matched text to *combined_value*.

    References carrying a bracketed sub-item (e.g. "3.7.3（3）") are split
    into the base clause and the sub-item; plain references are delegated
    to process_json.

    :return: True when something was appended, falsy otherwise
    """
    value = standardize_brackets(value)
    # NOTE(review): the bracket literals below appear empty in this copy —
    # presumably full-width '（'/'）' lost in transcoding. As written,
    # "" in value is always True and value.split("") raises ValueError;
    # confirm against the original file.
    if "" in value and "" in value:
        base_key, subentry_key = value.split("")
        subentry_key = "" + subentry_key
        content = json_data.get(base_key.strip())
        if content:
            if isinstance(content, str):
                extracted_content = extract_specific_subentry(content, subentry_key)
                if extracted_content:
                    combined_value.append(extracted_content)
                    return True
        else:
            return False  # base clause not present at all
    else:
        return process_json(json_data, value, combined_value)
def process_json(json_data, value, combined_value):
    """
    Collect numbered sub-entries of *value* if any exist, otherwise the
    entry stored under *value* itself.

    :return: True when anything was collected, False otherwise
    """
    if check_and_collect_subentries(json_data, value, combined_value):
        return True
    content = json_data.get(value, "")
    if not content:
        return False
    combined_value.append(get_values_only(content))
    return True
def check_and_collect_subentries(json_data, value, combined_value):
    """
    Append every entry whose key extends *value* with ".suffix", numbering
    them "1.", "2.", ... in iteration order.

    :return: True when at least one sub-entry key matched
    """
    matched = False
    counter = 1
    prefix = value + "."
    for key in json_data:
        if key.startswith(prefix):
            combined_value.append(f"{counter}. {get_values_only(json_data[key])}")
            counter += 1
            matched = True
    return matched
def extract_specific_subentry(content, subentry_key):
    """
    Extract one bracketed sub-item from a clause text, i.e. the part
    between "（N）" and "（N+1）".

    :param content: full clause text
    :param subentry_key: bracketed index such as "（3）"
    :return: the sub-item text, or "" when absent or unparsable
    """
    # NOTE(review): the replace targets and f-string brackets below appear
    # empty in this copy — presumably full-width '（'/'）' lost in
    # transcoding; confirm against the original file.
    subentry_index = subentry_key.replace("", "").replace("", "")
    try:
        idx = int(subentry_index)
        bracket_pattern = f"{idx}"
        parts = content.split(bracket_pattern)
        if len(parts) > 1:
            # Cut at the start of the next sub-item, if present.
            next_bracket_pattern = f"{idx+1}"
            next_part = parts[1].split(next_bracket_pattern, 1)[0]
            return next_part.strip()
    except ValueError:
        return ""  # sub-entry index was not numeric
    return ""
def get_values_only(content):
    """Flatten a dict to its values joined by ' / '; pass other values through."""
    if not isinstance(content, dict):
        return content
    return " / ".join(content.values())
def standardize_brackets(value):
    """
    Convert half-width parentheses in *value* to full-width ones.

    NOTE(review): the replacement literals appear empty in this copy —
    presumably the full-width '（'/'）' characters were lost in transcoding;
    as written this strips '(' and ')' instead of converting them. Confirm
    against the original file.
    """
    return value.replace('(', '').replace(')', '')
def process_and_merge_entries(entries_with_numbers, primary_json_path, secondary_json_path):
    """Load both clause JSON files and resolve every entry against them."""
    primary = load_json(primary_json_path)
    secondary = load_json(secondary_json_path)
    return find_entries_in_jsons(entries_with_numbers, primary, secondary)
if __name__ == "__main__":
    # Hypothetical entries and file paths for testing
    # entries_with_numbers = [{'形式评审标准.投标文件签字盖章': '3.7.3(3)'}, {'形式评审标准.多标段投标': '10.1'}, {'形式评审标准.“技术暗标”': '3.7.4(5)'}, {'响应性评审标准.投标内容': '1.3.1'}, {'响应性评审标准.工期': '1.3.2'}, {'响应性评审标准.工程质量': '1.3.3'}, {'响应性评审标准.投标有效期': '3.3.1'}, {'响应性评审标准.投标保证金': '3.4.1'}, {'响应性评审标准.分包计划': '1.11'}]
    entries_with_numbers = [{'xxx': '3.7.45'}]  # minimal single-entry probe
    primary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\truncate_output3.json'
    secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\clause3.json'
    # Since this is just a test block, make sure these paths point to actual
    # JSON files with the appropriate structure.
    try:
        combined_results = process_and_merge_entries(entries_with_numbers, primary_json_path, secondary_json_path)
        print("Combined Results:", json.dumps(combined_results, indent=4, ensure_ascii=False))
    except FileNotFoundError:
        print("One or more JSON files were not found. Please check the file paths.")
    except json.JSONDecodeError:
        print("One or more files could not be decoded. Please check the file content.")

View File

@ -0,0 +1,57 @@
import os
import uuid
from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from 删除知识库 import delete_index, create_client
def addfileToKnowledge(filepath, knowledge_name):
    """
    Parse *filepath* with DashScope DocMind and build a new cloud knowledge
    base named *knowledge_name* from the parsed documents.

    :return: the created DashScopeCloudIndex
    """
    parser = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    parsed_docs = parser.load_data(file_path=filepath)
    index = DashScopeCloudIndex.from_documents(
        parsed_docs,
        knowledge_name,
        verbose=True,
    )
    # Alternative: append to an existing index instead of creating one:
    # index = DashScopeCloudIndex(knowledge_name)
    # index._insert(documents)
    # return index, documents
    return index
def deleteKnowledge(index):
    """Delete the whole DashScope cloud knowledge base behind *index*."""
    pipeline_id = str(index.as_retriever().pipeline_id)
    workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID')
    client = create_client()
    delete_index(client, workspace_id, pipeline_id)
def deleteFileFromKnowledge(index, documents):
    """
    Remove the given parsed documents from the knowledge base *index*.

    Fixes: the original printed "deleted successfully" BEFORE calling
    delete_ref_doc (and even when no ids were collected), and invoked the
    delete with a possibly empty id list. The success message is now
    emitted only after a non-empty delete call returns.

    :param documents: list of document objects carrying an ``id_`` attribute
    """
    file_ids = []
    # Only a non-empty list of documents is worth iterating.
    if isinstance(documents, list) and documents:
        for document in documents:
            # getattr guards against document objects lacking an id_ attribute.
            file_id = getattr(document, 'id_', None)
            if file_id:
                file_ids.append(file_id)
    if file_ids:
        index.delete_ref_doc(file_ids)
        print("deleted successfully")  # report success only after the call returns
    else:
        print("no document ids found; nothing deleted")
# Example usage
if __name__ == "__main__":
    filepath = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf"
    unique_id = str(uuid.uuid4())
    knowledge_name = "招标解析" + unique_id  # unique per run to avoid name clashes
    # index = addfileToKnowledge(filepath,knowledge_name)
    # Attach to an existing cloud index by its (previously generated) name.
    index = DashScopeCloudIndex("招标解析e8cc45f4-cd41-47cf-a5e6-2b7885debfff")
    # Delete individual files:
    # deleteFileFromKnowledge(index, document)
    deleteKnowledge(index)

View File

@ -0,0 +1,151 @@
import ast
import json
import os
import re
from PyPDF2 import PdfWriter, PdfReader
from 通义千问long import upload_file, qianwen_long
from json_utils import clean_json_string
def extract_and_format_from_paths(json_paths, includes):
    """
    Read several JSON files and collect every value containing any keyword
    from *includes*.

    Top-level string values are kept verbatim; values inside a one-level
    nested dict are formatted as "sub_key: sub_value".

    :param json_paths: list of JSON file paths to scan
    :param includes: keywords to look for
    :return: list of formatted matching strings, in file/iteration order
    """
    all_formatted_results = []
    for path in json_paths:
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                json_data = json.load(handle)
        except FileNotFoundError:
            print(f"Error: The file '{path}' does not exist.")
            continue
        except json.JSONDecodeError:
            print(f"Error: The file '{path}' contains invalid JSON.")
            continue
        for key, value in json_data.items():
            if isinstance(value, dict):
                # One level of nesting: report matches as "sub_key: sub_value".
                for sub_key, sub_value in value.items():
                    if any(include in sub_value for include in includes):
                        all_formatted_results.append(f"{sub_key}: {sub_value}")
            elif isinstance(value, str):
                if any(include in value for include in includes):
                    all_formatted_results.append(value)
    return all_formatted_results
def extract_unique_items_from_texts(texts):
    """
    Split each text into enumerated items, strip numbering and trailing
    punctuation, and deduplicate while preserving first-seen order.

    NOTE(review): the '\\d+\|' alternative in the enumerator regex matches a
    literal backslash, 'd', plus-digits and '|'; it looks like a mangled
    enumerator form (e.g. a full-width separator lost in transcoding).
    Kept byte-identical to preserve behaviour — confirm against the
    original file.
    """
    pattern = re.compile(r'(?:\d+\.|\\d+\|\(\d+\)|\d+\))\s*')
    intro_pattern = re.compile(r'^.*[:]')          # drop any leading "...:" intro
    punctuation_pattern = re.compile(r'[;。,、..,:;!?]+$')  # trailing punctuation
    unique_items = []
    seen_items = set()
    for text in texts:
        body = intro_pattern.sub('', text)
        for fragment in pattern.split(body):
            item = fragment.strip()
            if not item:
                continue
            item = pattern.sub('', item)
            item = punctuation_pattern.sub('', item).strip()
            if item and item not in seen_items:
                seen_items.add(item)
                unique_items.append(item)
    return unique_items
def merge_pdfs(paths, output_filename):
    """
    Concatenate the given PDFs into one file written next to the first input.

    :param paths: ordered list of source PDF paths
    :param output_filename: bare file name for the merged result
    :return: full path of the merged PDF, or None when *paths* is empty
    """
    writer = PdfWriter()
    merged_path = None
    for source in paths:
        reader = PdfReader(source)
        for page_index in range(len(reader.pages)):
            writer.add_page(reader.pages[page_index])
        if merged_path is None:
            # Place the output alongside the first source file.
            merged_path = os.path.join(os.path.dirname(source), output_filename)
    if merged_path:
        with open(merged_path, 'wb') as out:
            writer.write(out)
        print(f"Merged PDF saved to {merged_path}")
    else:
        print("No files to merge.")
    return merged_path
def process_string_list(string_list):
    """
    Parse a model answer such as "result: [a, b, c]" into a list of strings.

    Takes the first [...] span, wraps each comma-separated item in quotes
    and evaluates the rebuilt literal with ast.literal_eval.

    Fix: ast.literal_eval raises ValueError as well as SyntaxError on
    malformed input; only SyntaxError was caught before, so a malformed
    answer could crash the caller instead of yielding [].

    :param string_list: raw answer text possibly containing a bracketed list
    :return: list of stripped item strings; [] when absent, empty or unparsable
    """
    match = re.search(r'\[(.*?)\]', string_list)
    if not match:
        return []  # no bracketed list in the answer
    content_inside_brackets = match.group(1)
    if not content_inside_brackets:
        return []  # "[]" -> empty list
    # Quote each item so the whole thing becomes a valid Python literal.
    formatted_list = '[' + ', '.join(
        f"'{item.strip()}'" for item in content_inside_brackets.split(',') if item.strip()
    ) + ']'
    try:
        return ast.literal_eval(formatted_list)
    except (SyntaxError, ValueError) as e:
        # e.g. items containing quote characters break the rebuilt literal
        print(f"Error parsing list: {e}")
        return []
def find_forbidden(truncate_json_path, clause_path, truncate4):
    """
    Gather the "situations a bidder must not be in" clauses from the
    qualification PDF (via qianwen-long) and from the local clause JSONs,
    then merge and deduplicate them under one key.

    :param truncate_json_path: bidder-notice front-table JSON
    :param clause_path: clause JSON
    :param truncate4: qualification-review PDF to upload
    :return: {'不得存在的其他情形': merged list, order-preserving, deduped}
    """
    # output_filename="merged.pdf"
    # paths=[truncate1,truncate4]
    # merged_filepath=merge_pdfs(paths,output_filename)  # shelved: the scoring front table is already covered by the "rejected bid" extraction
    file_id = upload_file(truncate4)
    #user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些请按json列表格式给我提供信息键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
    user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
    qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
    print(qianwen_forbidden_str)
    actual_list = process_string_list(qianwen_forbidden_str)  # parse "[x, y]" answer into a list
    print(actual_list)
    includes = ["不得存在", "禁止投标"]
    forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes)
    processed_results = extract_unique_items_from_texts(forbidden_results)
    print(processed_results)
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
    forbidden_dict = {'不得存在的其他情形': merged_forbidden_list}
    return forbidden_dict
if __name__ == '__main__':
    # Local smoke test with pre-truncated artefacts of one tender document.
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"  # not used by find_forbidden
    doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'  # not used by find_forbidden
    find_forbidden(truncate_json_path, clause_path, truncate4)

View File

@ -0,0 +1,18 @@
from docx import Document
def read_docx(file_path):
    """Print every paragraph of a .docx file; report and bail out on open errors."""
    try:
        document = Document(file_path)
    except Exception as e:
        # Broad catch kept deliberately: any open/parse failure is reported,
        # not propagated.
        print(f"Error opening file: {e}")
        return
    for paragraph in document.paragraphs:
        print(paragraph.text)
if __name__ == "__main__":
    # Smoke test: dump a sample tender docx to stdout.
    file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx"
    read_docx(file_path)

View File

@ -0,0 +1,35 @@
import json
import os
from 投标人须知正文条款提取成json文件 import convert_clause_to_json
from json_utils import nest_json_under_key, extract_content_from_json
from 形式响应评审 import process_reviews
from 资格评审 import process_qualification
from 通义千问long import upload_file, qianwen_long
def combine_review_standards(truncate1, truncate4, knowledge_name, truncate0_jsonpath, clause_path):
    """
    Build the qualification-review ("资格审查") section: ask qianwen-long for
    the formal / responsiveness / qualification review standards from the
    evaluation-method front table, process the qualification standard
    separately, and merge everything under one key.

    :param truncate1: PDF of the evaluation-method front table
    :param truncate4: qualification-review PDF
    :param knowledge_name: knowledge base name for follow-up queries
    :param truncate0_jsonpath: truncated-table JSON path
    :param clause_path: clause JSON path
    :return: {"资格审查": merged review dict}
    """
    # Formal / responsiveness review via qianwen.
    print("starting形式响应评审...")
    file_id = upload_file(truncate1)  # evaluation-method front table
    user_query_1 = "根据该文档中的评标办法前附表请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准请以json格式返回外层键名为'形式评审标准''响应性评审标准''资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
    results = qianwen_long(file_id, user_query_1)
    original_dict_data = extract_content_from_json(results)
    # Pull out the qualification standard; handled by its own pipeline.
    # NOTE(review): the pop default is the literal placeholder string
    # '默认值或None' — downstream presumably expects a dict; confirm.
    qualification_review = original_dict_data.pop('资格评审标准', '默认值或None')
    final_qualify_json = process_qualification(qualification_review, truncate4, knowledge_name)
    form_response_dict = process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_path)
    print("形式响应评审done")
    form_response_dict.update(final_qualify_json)
    return nest_json_under_key(form_response_dict, "资格审查")
if __name__ == "__main__":
    # NOTE(review): '\D' is not a recognised escape, so the single backslash
    # before Desktop survives at runtime, but the path should be normalised.
    output_folder = "C:\\Users\\Administrator\Desktop\\fsdownload\\temp1"
    truncate1 = os.path.join(output_folder, "ztbfile_tobidders_notice_table.pdf")
    knowledge_name = "zbfile"
    truncate2 = os.path.join(output_folder, "ztbfile_evaluation_method.pdf")
    truncate4 = os.path.join(output_folder, "ztbfile_qualification.pdf")
    clause_path = convert_clause_to_json(truncate2, output_folder)
    truncate1_jsonpath = os.path.join(output_folder, "truncate_output.json")
    # NOTE(review): truncate2 (evaluation method) is passed where the
    # front-table PDF is expected and truncate1 is never used — confirm
    # this is intentional.
    res = combine_review_standards(truncate2, truncate4, knowledge_name, truncate1_jsonpath, clause_path)
    print(res)

View File

@ -0,0 +1,87 @@
#资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的加入matching_keys列表否则保留原字典
import re
from json_utils import clean_json_string, combine_json_results, add_keys_to_json
from 多线程提问 import multi_threading, read_questions_from_file
from 通义千问long import upload_file
def merge_dictionaries_under_common_key(dicts, common_key):
    """
    Merge the sub-dicts stored under *common_key* across *dicts*.

    Dicts lacking *common_key* are skipped with a warning; on key collision
    later entries overwrite earlier ones.

    :return: {common_key: merged sub-dict}
    """
    merged = {common_key: {}}
    for candidate in dicts:
        if common_key not in candidate:
            print(f"Warning: Dictionary does not contain the key {common_key}")
            continue
        merged[common_key].update(candidate[common_key])
    return merged
def generate_qual_question(matching_keys_list):
    """
    Build the two qianwen prompts for qualification review: one asking for
    the listed review fields, one asking for personnel requirements.

    :param matching_keys_list: review-factor keys to enumerate in prompt 1
    :return: [question1, question2]
    """
    # Render the keys as a single-quoted, comma-separated enumeration.
    keys_string = ", ".join(f"'{key}'" for key in matching_keys_list)
    question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string}"
                 "请你以json格式返回结果外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。")
    question2 = "该招标文件中资格评审中有关人员资格的要求是怎样的请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料如具体的社保证明、技能证书等若有时间要求请注明时间范围、在岗要求、备注若相关要求不存在则以“未知”填充。请你以json格式返回结果外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。"
    return [question1, question2]
def extract_matching_keys_qual(dict_data):
    """
    Split qualification-review criteria into keys whose value references a
    clause elsewhere (to be resolved by a follow-up query) and plain
    key/value pairs kept as-is.

    Keys containing an excluded topic (joint venture, forbidden situations,
    etc.) are dropped entirely.

    :return: (matching_keys, non_matching_dict) — e.g.
             (['资质条件', '财务状况'], {'营业执照': '具备有效的营业执照'})
    """
    # Values matching one of these patterns point at another clause.
    include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"), re.compile(r"符合")]
    excludes = ['联合体', '禁止投标', '不存在', '不得存在', '资格', '管理机构', '负责人']
    matching_keys = []
    non_matching_keys = {}
    for key, value in dict_data.items():
        # Skip excluded topics altogether.
        if any(word in key for word in excludes):
            continue
        if any(pattern.search(value) for pattern in include_patterns):
            matching_keys.append(key)
        else:
            non_matching_keys[key] = value
    return matching_keys, non_matching_keys
def process_qualification(qualification_review, truncate4, knowledge_name):
    """
    Produce the final qualification-review dict from three sources:
    clause-reference keys answered against the qualification PDF, canned
    questions answered against the knowledge base, and pass-through
    key/value pairs.

    :param qualification_review: dict of review factor -> standard text
    :param truncate4: qualification-review PDF to upload
    :param knowledge_name: knowledge base for the canned questions
    :return: merged qualification-review dict
    """
    matching_keys_list, non_matching_dict = extract_matching_keys_qual(qualification_review)
    user_querys = generate_qual_question(matching_keys_list)  # prompts against the qualification PDF
    file_id2 = upload_file(truncate4)
    results2 = multi_threading(user_querys, "", file_id2, 2)  # qualification-review table
    res_list = []
    if not results2:
        print("errror!")
    else:
        # All of these answers come from the qualification-review table.
        for question, response in results2:
            cleaned_res = clean_json_string(response)
            res_list.append(cleaned_res)
    merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
    qualify_list = []
    # qualification_review_file_path = '../static/提示词/资格评审问题.txt'
    qualification_review_file_path = 'static/提示词/资格评审问题.txt'
    qualification_review_questions = read_questions_from_file(qualification_review_file_path)  # e.g. joint-venture bidding
    results1 = multi_threading(qualification_review_questions, knowledge_name)
    for _, response in results1:  # '_' is the question; response[1] is the answer
        try:
            if response and len(response) > 1:  # response exists with at least two elements
                qualify_list.append(response[1])
            else:
                print(f"Warning: Missing or incomplete response data for query index {_}.")
        except Exception as e:
            print(f"Error processing response for query index {_}: {e}")
    qualify_combined_dict = combine_json_results(qualify_list)
    updated_qualify_json = add_keys_to_json(merged_dict, qualify_combined_dict)  # merge the dicts
    final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict)
    return final_qualify_json

View File

@ -0,0 +1,29 @@
from 按页读取pdf import extract_text_by_page
def check_strings_in_pdf(file_path, judge_list=None):
    """
    Build a follow-up question for each probe phrase that appears in the
    PDF's text.

    Fix/generalisation: the __main__ block below calls this with a custom
    probe list, but the signature took only the path (TypeError). The list
    is now an optional parameter defaulting to the original built-in pair,
    and any number of probes is supported instead of exactly two.

    :param file_path: path of the PDF to scan
    :param judge_list: phrases to probe for; defaults to the standard two
    :return: list of question strings, or None when nothing matched
    """
    if judge_list is None:
        judge_list = ['施工机械设备', '企业信息登记']
    # Flatten the per-page text and strip whitespace so phrase search is robust.
    text = extract_text_by_page(file_path)
    full_text = ''.join(text).replace('\n', '').replace(' ', '')
    ques_list = [
        f"该招标文件对于'{term}'的要求是怎样的请按json格式给我提供信息键名为'{term}',若存在未知信息,在对应的键值中填'未知'"
        for term in judge_list
        if term in full_text
    ]
    if not ques_list:
        return None
    return ques_list
# Test cases or example usage
if __name__ == '__main__':
    file_path = 'C:/Users/Administrator/Desktop/zbtest18_39-45.pdf'  # Replace with your actual PDF file path
    # Fix: check_strings_in_pdf takes the file path only (its probe list is
    # built in); passing judge_list as a second argument raised a TypeError.
    questions = check_strings_in_pdf(file_path)
    # Fix: the function returns None when no probe phrase is present, which
    # made the unguarded for-loop raise a TypeError.
    if questions:
        for question in questions:
            print(question)
    else:
        print("No matching strings were found in the PDF.")

View File

View File

@ -0,0 +1,30 @@
# filename: check_status.py
import hashlib
import email.utils
import http.client
import json
def get_download_url(task_id):
    """
    Query the WPS conversion task *task_id* and return its download URL.

    The GET request is signed per the WPS open-platform scheme: MD5 of the
    URI, then SHA-1 over app_key + md5 + content-type + date.

    Fix: the HTTPS connection was never closed, leaking a socket per call
    (this function is polled in a loop elsewhere).

    :raises KeyError: when the response does not (yet) carry a download_url
    """
    app_id = 'SX20240723LAKILA'
    app_key = 'mIwDAgJZIZEUsOZatRrCvhtMkaxGdWbq'
    get_uri = f"/api/developer/v1/tasks/convert/to/docx/{task_id}"
    current_time = email.utils.formatdate(usegmt=True)
    content_md5 = hashlib.md5(get_uri.encode('utf-8')).hexdigest()
    data = app_key + content_md5 + "application/json" + current_time
    sha1_hex = hashlib.sha1(data.encode('utf-8')).hexdigest()
    authorization_header = f"WPS-2:{app_id}:{sha1_hex}"
    conn = http.client.HTTPSConnection("solution.wps.cn")
    try:
        headers = {
            'Date': current_time,
            'Content-Md5': content_md5,
            'Content-Type': "application/json",
            'Authorization': authorization_header
        }
        conn.request("GET", get_uri, headers=headers)
        res = conn.getresponse()
        data = res.read()
    finally:
        conn.close()  # always release the socket, even when the request fails
    response_json = json.loads(data.decode("utf-8"))
    return response_json['data']['download_url']

View File

@ -0,0 +1,41 @@
import requests
import mimetypes
import os
def download_file(url, local_filename):
    """
    Stream *url* to disk, appending an extension guessed from Content-Type.

    :param url: remote file URL
    :param local_filename: base local path WITHOUT extension
    :return: full saved path, or None on any error
    """
    try:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()  # raise on HTTP error statuses
            content_type = response.headers.get('Content-Type')
            # Fall back to .bin when the MIME type is unknown.
            extension = mimetypes.guess_extension(content_type, strict=False) or '.bin'
            full_filename = local_filename + extension
            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            print(f"File downloaded successfully and saved as {full_filename}")
            return full_filename
    except requests.HTTPError as e:
        print(f"HTTP Error: {e}")
        return None
    except requests.RequestException as e:
        print(f"Error downloading the file: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
if __name__ == '__main__':
    # Pre-signed OSS download URL for testing (will have expired by now).
    test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/28f7c0af7c7041bbbdf88ce6848e8a38.pdf?Expires=1722165340&OSSAccessKeyId=TMP.3KfNYFQchGtZWbjd2M1jR6y7PPqYTq1QLZ4pzbfEwkz3LwGLepVvr9371bndcRoMhHFhohaUJxrhiL63jKoAZk6VWQfwh4&Signature=RmktXAOwEbP1BBrkSfARfHtuXh8%3D"
    # Base local file name, without extension (download_file appends one).
    local_file_name = 'C:\\Users\\zhangsan\\Desktop\\temp\\downloaded_file'
    file_path = download_file(test_url, local_file_name)
    if file_path:
        print(f"Downloaded file path: {file_path}")

View File

@ -0,0 +1,39 @@
import time
from submit_conversion import submit_conversion_task
from check_status import get_download_url
from download import download_file
def download_pdf_convert_docx(url, downloaded_filename):
    """
    Download a PDF from a URL, convert it to DOCX via the WPS service and
    save it locally.

    Fixes: the polling loop was unbounded (a stuck task hung the caller
    forever), and the very first status query typically raised (the
    response carries no download_url while conversion is still running),
    crashing the loop instead of retrying.

    Args:
        url (str): The URL of the PDF to be downloaded.
        downloaded_filename (str): The filename to save the converted DOCX as.
    """
    task_id = submit_conversion_task(url)
    if not task_id:
        print("Failed to submit conversion task.")
        return
    download_url = None
    # Poll every 0.5s, bounded to ~2 minutes so a stuck task cannot hang us.
    for _ in range(240):
        time.sleep(0.5)
        try:
            download_url = get_download_url(task_id)
        except Exception:
            # Task usually still converting: the status payload has no
            # download_url yet. Keep polling.
            download_url = None
        if download_url:
            break
    if download_url:
        download_file(download_url, downloaded_filename)
        print(f'File downloaded and saved as {downloaded_filename}')
    else:
        print("Failed to get download URL.")
if __name__ == "__main__":
    # Pre-signed OSS URL of the source PDF (will have expired by now).
    pdf_url = "https://temp-pdf2docx.oss-cn-wuhan-lr.aliyuncs.com/pdf/02cf0a7a8cda432a8ba7a929862510eb.pdf?Expires=1724035295&OSSAccessKeyId=TMP.3Kj9nRWk3bspYRpZJJeKSSDjuoiSsd1SYBnHtac62JciczGbftutcSUcM5RpLTQNQXeANRNbdSxK2VnX9cQZ9bUgR3dWDv&Signature=MJfXEZe1fy5CEIoJ1IxhliSv0Ss%3D"
    # pdf_url="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile.pdf"  # NOTE(review): a local path will not work — the WPS API needs a reachable URL
    downloaded_filename = 'downloaded_document.docx'
    download_pdf_convert_docx(pdf_url, downloaded_filename)

View File

@ -0,0 +1,48 @@
import requests
import os
def convert_pdf_to_word(file_path, output_dir, output_format='docx'):
    """
    Converts a PDF file to a Word document using a specified API.

    :param file_path: Path to the PDF file to convert.
    :param output_dir: Directory to save the converted Word document.
    :param output_format: Format of the output Word document ('docx' or 'doc').
    :return: None
    """
    url = 'http://192.168.0.40:5000/api/v1/convert/pdf/word'
    # Send the PDF as multipart/form-data together with the desired format.
    with open(file_path, 'rb') as file_handle:
        upload = {
            'fileInput': (os.path.basename(file_path), file_handle, 'application/pdf')
        }
        payload = {'outputFormat': output_format}
        response = requests.post(url, files=upload, data=payload)
    if response.status_code != 200:
        print("Failed to make request:", response.status_code, response.text)
        return
    print("Request was successful.")
    # Name the output after the input file, with the requested extension.
    output_filename = os.path.splitext(os.path.basename(file_path))[0] + '.' + output_format
    output_path = os.path.join(output_dir, output_filename)
    with open(output_path, 'wb') as out_file:
        out_file.write(response.content)
    print(f"Output file saved to: {output_path}")
# Example usage:
if __name__ == '__main__':
    file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
    convert_pdf_to_word(file_path, output_dir, 'doc')  # request .doc output

View File

@ -0,0 +1,37 @@
# filename: submit_conversion.py
import http.client
import email.utils
import hashlib
import json
def submit_conversion_task(file_url):
    """
    Submit a PDF->DOCX conversion task to the WPS open API.

    :param file_url: Publicly reachable HTTP(S) URL of the source PDF.
    :return: The task id assigned by the service.
    :raises KeyError: if the response payload lacks data.task_id
        (e.g. authentication failure).
    """
    app_id = 'SX20240723LAKILA'
    app_key = 'mIwDAgJZIZEUsOZatRrCvhtMkaxGdWbq'
    # RFC 1123 GMT date; the same string is signed and sent in the Date header.
    current_time = email.utils.formatdate(usegmt=True)
    payload = json.dumps({"url": file_url})
    # Content-Md5 covers the exact request body bytes.
    md5_hasher = hashlib.md5()
    md5_hasher.update(payload.encode('utf-8'))
    content_md5 = md5_hasher.hexdigest()
    content_type = "application/json"
    # WPS-2 signature: sha1(app_key + md5(body) + content-type + date).
    signing_string = app_key + content_md5 + content_type + current_time
    hasher = hashlib.sha1()
    hasher.update(signing_string.encode('utf-8'))
    signature = hasher.hexdigest()
    authorization_header = f"WPS-2:{app_id}:{signature}"
    conn = http.client.HTTPSConnection("solution.wps.cn")
    try:
        headers = {
            'Date': current_time,
            'Content-Md5': content_md5,
            'Content-Type': content_type,
            'Authorization': authorization_header
        }
        conn.request("POST", "/api/developer/v1/office/pdf/convert/to/docx", payload, headers)
        res = conn.getresponse()
        data = res.read()
    finally:
        # Fix: release the socket even when the request or read fails.
        conn.close()
    response_json = json.loads(data.decode("utf-8"))
    return response_json['data']['task_id']
if __name__ == "__main__":
    # NOTE(review): the WPS API expects a publicly reachable HTTP(S) URL in
    # the "url" field; this local Windows path will be rejected — replace it
    # with a real URL before running.
    file_url="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile.pdf"
    submit_conversion_task(file_url)

View File

@ -0,0 +1,48 @@
import json
import random
from http import HTTPStatus
from dashscope import Generation
def call_with_messages(messages):
    """Send *messages* to the qwen-max model and return the reply text.

    Raises Exception with the request id and error details when the service
    responds with a non-OK status.
    """
    response = Generation.call(
        model="qwen-max",
        messages=messages,
        seed=random.randint(1, 10000),
        temperature=0.5,
        top_p=0.5,
        top_k=50,
        result_format='message',
    )
    # Guard clause: surface service errors before touching the payload.
    if response.status_code != HTTPStatus.OK:
        raise Exception(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}')
    return response.output['choices'][0]['message']['content']
def prepare_question_from_json(json_path, prompt):
    """Load the JSON file at *json_path* and append *prompt* to its compact
    (non-ASCII-preserving) text form, producing one question string."""
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return json.dumps(payload, ensure_ascii=False) + prompt
# Dedicated to yes/no style judgements over extracted tender facts.
def qianwen_ask(json_path, prompt):
    """Combine the JSON file's content with *prompt* and ask the model."""
    question = prepare_question_from_json(json_path, prompt)
    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': question},
    ]
    return call_with_messages(conversation)
# General-purpose question entry point.
def qianwen_ask2(questions):
    """Ask the model an arbitrary question string."""
    conversation = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': questions},
    ]
    return call_with_messages(conversation)
if __name__ == '__main__':
    # Demo: feed the pre-extracted facts in judge_exist.json to qwen-max and
    # ask a battery of yes/no questions about the tender (site visit, bid
    # bond, subcontracting, ...), answered in the fixed JSON schema below.
    json_path = 'judge_exist.json'
    prompt = "请你依据以上信息回答,是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 是否需要递交投标保证金是否有履约保证金履约担保是否有招标代理服务费请按json格式给我提供信息键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否需要递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'','','未知',若存在未知或者矛盾信息,请回答'未知'"
    try:
        content = qianwen_ask(json_path, prompt)
        print(content)
    except Exception as e:
        # Script boundary: surface the failure instead of a raw traceback.
        print(f"An error occurred: {e}")

View File

@ -0,0 +1,67 @@
import time
from pathlib import Path
from openai import OpenAI
import os
def upload_file(file_path):
    """Upload *file_path* to DashScope for text extraction and return the
    file id assigned by the service."""
    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    uploaded = client.files.create(file=Path(file_path), purpose="file-extract")
    return uploaded.id
def qianwen_long(file_id, user_query):
    """
    Ask qwen-long *user_query* against a previously uploaded file.

    The file is referenced through the special ``fileid://`` system message.

    :param file_id: Id returned by upload_file().
    :param user_query: Natural-language question about the document.
    :return: The model's reply text.
    """
    # Fix: the original placed this docstring AFTER the print() call, turning
    # it into a discarded string expression rather than the docstring.
    print("call qianwen-long...")
    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    # Generate a response based on the uploaded file's id.
    completion = client.chat.completions.create(
        model="qwen-long",
        top_p=0.5,
        temperature=0.5,
        messages=[
            {
                'role': 'system',
                'content': f'fileid://{file_id}'
            },
            {
                'role': 'user',
                'content': user_query
            }
        ],
        stream=False
    )
    # Return the response content only.
    return completion.choices[0].message.content
if __name__ == "__main__":
    # Example file path - replace with your actual file path
    file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf"
    file_id = upload_file(file_path)
    # Query 1: extract the technical/commercial/price scoring criteria from
    # the evaluation-method chapter as JSON; query 2 (disabled below) asks
    # about qualification review.
    user_query1 = ("根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在嵌套键如'技术标'中新增键名'备注'存放该信息。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
    user_query2 = ("请提供文件中关于资格审查的具体内容和标准。")
    start_time=time.time()
    # First query
    print("starting qianwen-long...")
    result1 = qianwen_long(file_id, user_query1)
    print("First Query Result:", result1)
    # # Second query
    # print("starting qianwen-long...")
    # result2 = qianwen_long(file_id, user_query2)
    # print("Second Query Result:", result2)
    # end_time=time.time()
    # print("elapsed time:"+str(end_time-start_time))

View File

@ -0,0 +1,31 @@
1.该招标文件的工程名称项目名称招标编号是招标人是招标代理机构是请按json格式给我提供信息键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。
#该招标文件的工程概况(或项目概况)是?招标范围是?招标控制价(可指代投标限价、投资概算金额、工程概算金额、合同估算价,但非监理费用)是?该项目的计划工期(监理服务期)是该项目是否接受联合体投标请按json格式给我提供信息键名分别为'工程概况','招标范围','招标控制价','计划工期','是否接受联合体投标',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知''是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。
2.该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。
3.该招标文件的招标控制价可指代投标限价、投资概算金额、工程概算金额、合同估算价但非监理费用该项目是否接受联合体投标请按json格式给我提供信息键名分别为'招标控制价''是否接受联合体投标',若存在未知信息,在对应的键值中填'未知''是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。
4.投标文件递交截止日期是递交方式是请按json格式给我提供信息键名分别是'投标文件递交截止日期','递交方式',若存在未知信息,在对应的键值中填'未知'。
5.招标人和招标代理机构的联系方式是请按json格式给我提供信息键名分别是'招标人联系方式''招标代理机构联系方式',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。
##8.该项目的开标时间和地点是请按json格式给我提供信息键名为'开标时间'和'开标地点',若存在未知信息,在对应的键值中填'未知'。
##(三个问题分开问)根据第二章投标人须知的内容,该招标文件是否允许分包? 是否需要递交投标保证金是否有履约保证金履约担保是否有招标代理服务费你需要留意☑后的内容。请按json格式给我提供信息键名分别为'是否允许分包','是否递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知'。可以一起问设置摘取分段为8仍存在问题pdf转word文件打勾符号可能会无法正常显示解决思路1根据原pdf进行提取
6.该招标文件的评标结果定标候选人公示媒介在哪请按json格式给我提供信息键名是'评标结果公示媒介',若存在未知信息,在对应的键值中填'未知'。
7.该招标文件的投标竞争下浮率是多少请按json格式给我提供信息键名是'投标竞争下浮率',若存在未知信息,在对应的键值中填'未知'。
#11.该招标文件的投标竞争下浮率是多少若请按json格式给我提供信息键名是'投标竞争下浮率',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。
8.该项目的投标有效期是什么请按json格式给我提供信息键名是'投标有效期',若存在未知信息,在对应的键值中填'未知'。
#该招标中对于实质性要求(废标项)的内容有哪些规定投标人不得存在的情形有哪些文件中提及的否决和无效投标情形有哪些请以json格式返回结果键名分别'实质性要求','不得存在的情形','否决和无效投标情形',若存在未知信息,请在对应键值中填'未知',你的回答一切以原文内容为准,不可改动。
#8.该招标文件的电子招标文件获取方式是请按原文段落全部完整内容回答以json的格式给我提供信息键名是'电子招标文件获取方式',若存在未知信息,在对应的键值中填'未知'。
9.该招标文件对投标人准备和参加投标活动发生的费用是如何规定的请以json的格式给我提供信息键名是'费用承担',若存在未知信息,在对应的键值中填'未知'。
10.要求澄清招标文件的截止时间是请以json的格式给我提供信息键名是'投标人要求澄清招标文件的截止时间',若存在未知信息,在对应的键值中填'未知'。
11.该文档要求扣留的质量保证金百分比是多少请以json格式给我提供信息键名为'质量保证金',如果没有则以'未知'填充。

View File

@ -0,0 +1,10 @@
#pdf提取之后的提示词调用普通通译千问
#请你依据以上信息回答,是否允许分包? 是否需要递交投标保证金是否有履约保证金履约担保是否有招标代理服务费请按json格式给我提供信息键名分别为'是否允许分包','是否递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知'。
1.该招标文件对于分包的要求是怎样的请按json格式给我提供信息键名为'分包'。
2.根据招标文件第二章投标人须知该项目投标保证金需要缴纳金额是多少到账截止时间是缴纳形式是请按json格式给我提供信息外层键名为'投标保证金',嵌套键名分别为'缴纳金额','到账截止时间','缴纳形式',若存在多种缴纳形式,则在'缴纳形式'下以各种缴纳形式作为嵌套键名,再在对应的缴纳形式下嵌套写出缴纳步骤或要求或账户信息,请详细回答,不要遗漏原文信息。
3.该招标文件对于投标保证金的退还相关的规章办法是怎样的请按json格式给我提供信息键名为'退还投标保证金',若存在嵌套信息,嵌套内容键名以文档中对应字段命名。
4.根据投标人须知前附表,该项目对于履约保证金(担保)的要求中它的履约担保形式是怎样的它的履约担保金额是多少请按json格式给我提供信息外层键名为'履约保证金',嵌套键名分别是'履约担保形式','担保金额',若存在多种履约担保形式,则在'履约担保形式'下以各种履约担保形式作为嵌套键名,若存在未知信息,在对应的键值中填'未知'。
5.本项目的招标代理服务费由谁支付支付标准是什么支付方式是什么支付时间是什么请按json格式给我提供信息外层键名为'招标代理服务费',嵌套键名分别是'支付人','支付标准','支付方式','支付时间',若存在未知信息,在对应的键值中填'未知'。
6.该招标文件对于踏勘现场是怎样的踏勘时间和踏勘集中地点是请以json格式给我提供信息外层键名为'踏勘现场',嵌套键名分别是'踏勘时间','踏勘地点',若存在其他信息,新增嵌套键名'备注',填入其中,若存在未知信息,在对应的键值中填'未知'。
7.该招标文件对于投标预备会内容是怎样的召开时间和召开地点是请以json格式给我提供信息外层键名为'投标预备会',嵌套键名分别是'召开时间','召开地方',若存在其他信息,新增嵌套键名'备注',填入其中,若存在未知信息,在对应的键值中填'未知'。
8.本项目可偏离的项目和范围是怎样的请以json格式给我提供信息外层键名为'偏离'。

View File

@ -0,0 +1,111 @@
资格评审-》商务文件:
资质要求、财务要求、业绩要求、主要人员要求、信誉要求
3. 投标人资格要求
3.1 资格要求:
1本次招标要求投标申请人必须是在中华人民共和国境内注册并具有独立法人资格的有
效营业执照、组织机构代码证、税务登记证(或多证合一)。
2投标人必须具备国家行政主管部门核发的工程监理综合资质或房屋建筑工程监理资质甲
级资质。
3投标人拟派总监理工程师须具备房屋建筑专业注册监理工程师执业资格、工程类相关专
业的高级及以上职称,并在本单位注册,须提供无在监项目承诺函。
3.2 业绩要求
1 投标人 2017 年 12 月 1 日至今至少承接过 1 项施工合同投资金额达 1 亿元及以上的房
屋建筑工程监理业绩 ,并在人员、设备、资金等方面具有相应的监理能力。
2拟派项目总监理工程师 2017 年 12 月 1 日至今至少承接 1 项施工合同投资金额达 1 亿元
及以上的房屋建筑工程监理业绩。
3.3 财务能力要求
1提供近三年2019、2020、2021 年)财务审计报告(新成立的公司需提供自成立之日起
相应年度的财务审计报告),且近 3 年均无亏损(新成立的公司自成立之日起年度起)。
3.5 本次招标(接受或不接受)联合体投标:不接受。
资质条件:见本章附件
主要人员要求:见本章附件
信誉要求:见本章附件
其他要求:见本章附件
1.4 投标人资格要求(适用于未进行资格预审的)
1.4.1 投标人应具备承担本标段监理的资质条件、能力和信誉。
1资质条件见投标人须知前附表
2主要人员要求见投标人须知前附表
3信誉要求见投标人须知前附表;
4其他要求见投标人须知前附表。
基本信息:
工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?投标文件(递交)截止日期是?招标范围是?招标限价(工程、投资概算)是多少?投标保证金是多少?
请以键值对的形式给出回答,要求简洁准确,不多余
##资格评审:(在第二章附件:投标人资质条件、能力和信誉(资格审查标准)表中)
营业执照:具备有效的营业执照
安全生产许可证:具备有效的安全生产许可证
资质要求:
该招标文件对于投标人的资质条件等级是怎样的要求给出完整资质要求内容、需要提交的证明材料并按json格式给我提供信息外层键名为'资质条件',对于各个'资质条件',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答有关业绩要求、财务要求、主要人员要求、信誉要求的内容。
业绩要求:
该招标文件对于投标人的业绩要求是怎样的,至少需要包含业绩时间范围、业绩内容(如有)、金额要求(如有)、需要提交的证明材料、备注(其他关于业绩要求的内容,如有),请按json格式给我提供信息,最外层键名为'业绩要求',若相关要求不存在,则在对应的键值中填'未知'。
财务要求:
该招标文件对于投标人的财务要求是怎样的要求给出财务报告的时间范围、营收若利润要求、需要提交的证明材料、备注其他关于财务要求的内容如有请按json格式给我提供信息,最外层键名为'财务要求',若相关要求不存在,则在对应的键值中填'未知'。
信誉要求:
该招标文件对于投标人的信誉要求是怎样的。请按json格式给我提供信息键名为'信誉要求',对于各个'信誉要求',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答无关信誉要求的内容。
(存在问题)主要人员要求:
该招标文件对于投标人的项目经理监理和技术负责人的要求是怎样的请依次给出需要的数量、资格要求、需要提交的证明材料如具体的社保证明、技能证书等若有时间要求请注明时间范围、在岗要求、备注除上述要求外文档中提及的其他关于项目经理监理和技术负责人要求的信息以json的形式给出若相关要求不存在则以“未知”填充。
该招标文件对于项目管理机构中除项目经理和技术负责人外的其他人员要求是怎样的请依次给出所需的岗位、对应的数量、资格要求、需要提交的证明材料如具体的社保证明、技能证书等若有时间要求请注明时间范围、在岗要求、备注除上述要求外文档中提及的其他关于人员要求的信息以json的形式给出若相关要求不存在则以“未知”填充。
是否有'施工机械设备'和'企业信息登记'
该招标文件对于'施工机械设备'和'企业信息登记'的要求是怎样的请按json格式给我提供信息若存在未知信息在对应的键值中填'未知'。
(需要与第一章对应)联合体投标:
该招标文件是否接受联合体投标?
该招标文件对于联合体投标的要求是怎样的请按json格式给我提供信息外层键名为'联合体投标要求'。
(需跳转)禁止投标的情形:
在资格审查中该招标文件规定的投标人不得存在的情形有哪些请按json格式给我提供信息键名为'禁止投标的情形'。
根据该招标文件的第三章评标办法前附表对投标文件的评分分值构成是请以json的格式返回。如果没有评分分值构成请回答我该文件对投标文件评分标准是这里的标准不需要具体展开。
##形式评审:
该招标文件的形式评审标准是怎样的以n行3列表格形式给出表头为序号、评审因素和评审标准不要回答有关资格评审标准、响应性评审标准相关的内容尽量用原文内容进行表述。
## 响应性评审
该招标文件的响应性评审标准是怎样的以n行3列表格形式给出表头为序号、评审因素和评审标准不要回答有关资格评审标准、形式评审标准相关的内容尽量用原文内容进行表述。
prompt="""
# 角色
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
## 技能
### 技能 1文档解析与摘要
- 深入理解并分析${documents}的内容,提取关键信息。
- 根据需求生成简洁明了的摘要,保持原文核心意义不变。
### 技能 2信息检索与关联
- 在${documents}中高效检索特定信息或关键词。
- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。
## 限制
- 所有操作均需基于${documents}的内容,不可超出此范围创造信息。
- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。
- 确保所有生成或改编的内容逻辑连贯,无误导性信息。
请注意,上述技能执行时将直接利用并参考${documents}的具体内容,以确保所有产出紧密相关且高质量。
"""
投标内容==招标范围、监理服务期、监理工作范围、投标有效期、投标保证金、算术错误修正、投标价格(报价)、其他、工期、工程质量==(质量,质量标准)、权利义务、已标价工程量清单、技术标准和要求、招标人不能接受的条件、分包计划、重大偏差

View File

@ -0,0 +1,35 @@
#资质要求:
#1.该招标文件对于投标人的资质条件等级是怎样的要求给出完整资质要求内容、需要提交的证明材料并按json格式给我提供信息外层键名为'资质条件',对于各个'资质条件',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答有关业绩要求、财务要求、主要人员要求、信誉要求的内容。
#业绩要求:
#2.该招标文件对于投标人的业绩要求是怎样的,至少需要包含业绩时间范围、业绩内容(如有)、金额要求(如有)、需要提交的证明材料、备注(其他关于业绩要求的内容,如有),请按json格式给我提供信息,最外层键名为'业绩要求',若相关要求不存在,则在对应的键值中填'未知'。
#财务要求:
#3.该招标文件对于投标人的财务要求是怎样的要求给出财务报告的时间范围、营收若利润要求、需要提交的证明材料、备注其他关于财务要求的内容如有请按json格式给我提供信息,最外层键名为'财务要求',若相关要求不存在,则在对应的键值中填'未知'。
#信誉要求:
#4.该招标文件对于投标人的信誉要求是怎样的请按json格式给我提供信息键名为'信誉要求',对于各个'信誉要求',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答无关信誉要求的内容。
#(存在问题)主要人员要求:
#5.该招标文件对于投标人的项目经理监理和技术负责人的要求是怎样的请依次给出需要的数量、资格要求、需要提交的证明材料如具体的社保证明、技能证书等若有时间要求请注明时间范围、在岗要求、备注除上述要求外文档中提及的其他关于项目经理监理和技术负责人要求的信息以json的形式给出键名分别是"项目经理"和"技术负责人",若相关要求不存在,则以“未知”填充。
#6.该招标文件对于项目管理机构中除项目经理和技术负责人外的其他人员要求是怎样的请依次给出所需的岗位、对应的数量、资格要求、需要提交的证明材料如具体的社保证明、技能证书等若有时间要求请注明时间范围、在岗要求、备注除上述要求外文档中提及的其他关于人员要求的信息以json的形式给出最外层键名为"其他人员",嵌套的键名为为具体的岗位名称,若相关要求不存在,则以“未知”填充。
#(需要与第一章对应)联合体投标:
#该招标文件是否接受联合体投标?
7.该招标文件对于联合体投标的要求是怎样的请按json格式给我提供信息外层键名为'联合体投标要求(如有)'。
#(需跳转)禁止投标的情形:
#8.该招标文件规定的投标人不得存在的其他情形有哪些请按json格式给我提供信息键名为'不得存在的其他情形',请你不要回答有关"信誉要求"的内容。
#9.在资格评审标准中,除了'资质要求','业绩要求','财务要求','信誉要求','人员要求','联合体投标要求','禁止投标的情形',还有其他要求吗请按json格式给我提供信息最外层键名为'其他要求',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。注意不要回答'形式评审标准'和'响应性评审标准'的内容。
#施工机械设备、企业信息登录
#9.该招标文件对于'施工机械设备'的要求是怎样的请按json格式给我提供信息键名为'施工机械设备',若存在未知信息,在对应的键值中填'未知'。
#10.该招标文件对于'企业信息登录'的要求是怎样的请按json格式给我提供信息键名为'企业信息登录',若存在未知信息,在对应的键值中填'未知'。
#该招标文件中资格评审的内容是怎样的?具体内容包括'资质条件','财务状况','类似业绩','信誉','施工机械设备','其他要求'请你以json格式返回结果外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。
#该招标文件中资格评审中有关'项目经理资格','设计负责人资格','施工负责人资格','项目管理机构及人员'的要求是怎样的请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料如具体的社保证明、技能证书等若有时间要求请注明时间范围、在岗要求、备注若相关要求不存在则以“未知”填充。请你以json格式返回结果外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。
#该招标文件中资格评审中有关人员资格的要求是怎样的请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料如具体的社保证明、技能证书等若有时间要求请注明时间范围、在岗要求、备注若相关要求不存在则以“未知”填充。请你以json格式返回结果外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。

View File

View File

@ -0,0 +1,36 @@
import os
import sys
from 货物标截取pdf import truncate_pdf_main
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
# from ..main.format_change import docx2pdf
# from ..main.多线程提问 import multi_threading
from ..main.通义千问long import upload_file,qianwen_long
from ..main.json_utils import clean_json_string
def generate_key_paths(data, parent_key=''):
    """Flatten a nested dict into dotted key paths, one per leaf value.

    Example: {'a': {'b': 1}, 'c': 2} -> ['a.b', 'c'].
    """
    paths = []
    for key, value in data.items():
        full_key = key if not parent_key else f"{parent_key}.{key}"
        if isinstance(value, dict):
            # Recurse into nested mappings, carrying the accumulated prefix.
            paths += generate_key_paths(value, full_key)
        else:
            # Leaf value: record the finished path.
            paths.append(full_key)
    return paths
# Fetch the procurement list from a goods-tender document.
def fetch_purchasing_list(file_path):
    """
    Truncate the procurement-requirements section of a goods tender PDF, ask
    qwen-long to extract the hierarchical list of purchased systems/goods,
    and return the flattened key paths.

    :param file_path: Path to the tender PDF.
    :return: List of dotted key paths under '采购需求' (also printed).
        Fix: the original only printed the list and returned None.
    """
    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output"
    # file_path = docx2pdf(file_path)
    # NOTE(review): truncate_pdf_main in the sibling module appears to return
    # a LIST of paths — confirm upload_file() receives a single path here.
    truncate_path=truncate_pdf_main(file_path,output_folder,1)
    user_query="这是一份货物标中采购要求部分的内容你需要摘取出需要采购的系统货物一个大系统大项中可能包含多个小系统小项你需要保留这种层次关系给出货物名称请以json格式返回外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\""
    file_id=upload_file(truncate_path)
    res=qianwen_long(file_id,user_query)
    cleaned_res=clean_json_string(res)
    keys_list=generate_key_paths(cleaned_res['采购需求'])
    print(keys_list)
    return keys_list
if __name__ == "__main__":
    # Smoke test against a locally stored, pre-truncated tender PDF.
    sample_pdf = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招招招标文件一中多媒体报告厅教学设备_20240829101650_tobidders_notice_table.pdf"
    fetch_purchasing_list(sample_pdf)

View File

@ -0,0 +1,105 @@
from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
def clean_page_numbers(text):
    """Strip page-number artifacts from one page's extracted text.

    Removes, in order: a bare number at the very start of the text (when
    followed by a non-digit), a trailing number, and any "/N" style markers
    (e.g. "/129") together with surrounding whitespace.
    """
    # Order matters: leading first, then trailing, then the slash form.
    without_leading = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
    without_trailing = re.sub(r'\s+\d+\s*$', '', without_leading)
    return re.sub(r'\s*\/\s*\d+\s*', '', without_trailing)
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Copy the page range bounded by begin_pattern/end_pattern into a new PDF.

    :param pdf_path: Source PDF path.
    :param output_folder: Directory for the truncated copy.
    :param begin_pattern: Regex marking the section's first page.
    :param begin_page: Pages with index <= begin_page are never accepted as
        the start (presumably to skip a table of contents — confirm).
    :param end_pattern: Regex marking the closing page; that page is included.
    :param output_suffix: Appended to the source base name for the output file.
    :return: Path of the written PDF, or None when either bound is not found.
    """
    # Open the source PDF.
    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None
    # Scan every page for the begin/end phrases.
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text()
        if text:
            # Strip page-number artifacts so they don't break the regex match.
            cleaned_text = clean_page_numbers(text)
            if re.search(begin_pattern, cleaned_text) and i > begin_page:
                # NOTE(review): a later begin match (before the end is found)
                # overwrites start_page — confirm this "last begin before the
                # end" behavior is intended.
                start_page = i
            if start_page is not None and re.search(end_pattern, cleaned_text) and i > (start_page+1):
                end_page = i
                break
    # Both bounds must have been found.
    if start_page is None or end_page is None:
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
        return None
    # Build the output path next to the requested folder.
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # Get the base name without extension
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()
    # Copy pages start_page..end_page inclusive.
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    # Persist the truncated document.
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)
    print(f"已截取并保存页面从 {start_page}到 {end_page}为 {output_pdf_path}")
    return output_pdf_path
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Truncate one PDF file, or every PDF directly inside a folder, via
    extract_pages().

    :param input_path: A PDF file path or a directory containing PDFs.
    :param output_folder: Destination directory (created if missing).
    :param begin_pattern: Compiled regex bounding the section's start.
    :param begin_page: Minimum page index allowed for the section start.
    :param end_pattern: Compiled regex bounding the section's end.
    :param output_suffix: Appended to each generated file's base name.
    :return: List of generated PDF paths — always a list. (Fix: the original
        fell off the end and returned None when a single file's extraction
        failed.)
    """
    # Make sure the destination exists before any extraction runs.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    if os.path.isdir(input_path):
        generated_files = []
        # Process every PDF directly inside the folder (non-recursive).
        for file in os.listdir(input_path):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(input_path, file)
                output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
                if output_pdf_path and os.path.isfile(output_pdf_path):
                    generated_files.append(output_pdf_path)
        return generated_files
    if os.path.isfile(input_path) and input_path.endswith(".pdf"):
        # Single PDF file.
        output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        if output_pdf_path and os.path.isfile(output_pdf_path):
            return [output_pdf_path]  # list form keeps the return type uniform
        return []  # fix: was an implicit None when extraction failed
    print("提供的路径既不是文件夹也不是PDF文件。")
    return []
def truncate_pdf_main(input_path, output_folder, selection):
    """Truncate a tender PDF according to *selection*.

    Only selection 1 (the 项目/服务/商务 requirements chapter, ending at the
    qualification/evaluation chapter) is configured; any other value is
    rejected.

    :return: List of generated paths, or None for an unknown selection.
    """
    # Guard clause: reject anything but the single configured selection.
    if selection != 1:
        print("无效的选择")
        return None
    # Section bounds for the requirements chapter.
    begin_pattern = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:项目|服务|商务).*?要求')
    end_pattern = re.compile(r'第[一二三四五六七八九十百千]+章\s*(资格审查|评标方法|评审办法)')
    # begin_page=5 skips early pages (e.g. table of contents) from matching.
    return process_input(input_path, output_folder, begin_pattern, 5, end_pattern, "tobidders_notice_table")
def truncate_pdf_multiple(input_path, output_folder):
    """
    Run every configured truncation selection (currently only selection 1)
    over *input_path* and collect the generated file paths.

    :return: Flat list of generated PDF paths.
    """
    truncate_files = []
    for selection in range(1, 2):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # Fix: truncate_pdf_main returns None for an invalid selection (and
        # may on failure); guard so extend() cannot raise TypeError.
        if files:
            truncate_files.extend(files)
    return truncate_files
if __name__ == "__main__":
    # Manual smoke test: cut the requirements section out of one local tender
    # PDF using selection 1.
    input_path = "C:\\Users\\Administrator\\WPSDrive\\292826853\\WPS云盘\\应用\\输出为PDF\\公安局音视频监控系统设备采购项目(定稿_20240829133603.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    # truncate_pdf_multiple(input_path,output_folder)
    selection = 1  # e.g. 1 - bidders notice front table, 2 - evaluation method, 3 - bidders notice body, 4 - announcement..pre-contract terms
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("生成的文件:", generated_files)

17
requirements.txt Normal file
View File

@ -0,0 +1,17 @@
requests==2.32.3
PyPDF2==3.0.1
Flask==3.0.3
python-docx==1.1.2
llama-index-core==0.10.44
llama-index-embeddings-dashscope==0.1.3
llama-index-indices-managed-dashscope-custom==0.1.1
llama-index-llms-dashscope==0.1.2
llama-index-node-parser-dashscope-custom==0.1.2
llama-index-readers-dashscope-custom==0.1.2
llama-index-readers-file==0.1.23
llamaindex-py-client==0.1.19
dashscope==1.19.2
PyMuPDF==1.24.1
openai==1.33.0
pathlib==1.0.1
alibabacloud_bailian20231229==1.7.0