11.8 评标修改 技术参数修改

This commit is contained in:
zy123 2024-11-11 17:12:38 +08:00
parent f6db131384
commit 906476ad2c
23 changed files with 692 additions and 642 deletions

View File

@ -199,7 +199,7 @@ def concatenate_keys_values(section_content):
concatenated.append(f"{key} {value}") concatenated.append(f"{key} {value}")
return concatenated return concatenated
#生成无结构的数据工程标 #生成无结构的数据工程标,对提取出的若干键值对生成外键为target_value值为列表的新键值对
def extract_sections(data, target_values): def extract_sections(data, target_values):
""" """
Extracts sections from the input dictionary where the top-level keys' values Extracts sections from the input dictionary where the top-level keys' values

View File

@ -1,90 +1,8 @@
import json
import docx
import re import re
import os
import fitz
from PyPDF2 import PdfReader from PyPDF2 import PdfReader
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
def extract_text_from_docx(file_path):
    """Read a .docx file and return the text of all paragraphs joined by newlines."""
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return '\n'.join(paragraph_texts)
#PYPDF2版本
def extract_text_from_pdf(file_path, start_word, end_pattern):
# 从PDF文件中提取文本
common_header = extract_common_header(file_path)
pdf_document = PdfReader(file_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
all_pages_text = []
start_index = None
# 处理所有页面
for i, page in enumerate(pdf_document.pages):
page_text = page.extract_text() if page.extract_text() else ""
cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# 在第一页查找开始位置
if i == 0 and start_index is None:
start_match = re.search(start_word, cleaned_text, re.MULTILINE)
if start_match:
start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
# 在最后一页查找结束位置
if i == len(pdf_document.pages) - 1:
matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
all_pages_text.append(cleaned_text)
# 合并所有页面的文本
full_text = "\n".join(all_pages_text)
return full_text
#fitz库版本
# def extract_text_from_pdf(file_path, start_word, end_pattern):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# doc = fitz.open(file_path)
# all_pages_text = []
# start_index = None
#
# # 处理所有页面
# for i in range(len(doc)):
# page = doc[i]
# page_text = page.get_text()
# cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# print("yes")
# # 在第一页查找开始位置
# if i == 0 and start_index is None:
# start_match = re.search(start_word, cleaned_text, re.MULTILINE)
# if start_match:
# start_index = start_match.start()
# cleaned_text = cleaned_text[start_index:]
#
# # 在最后一页查找结束位置
# if i == len(doc) - 1:
# for pattern in end_pattern:
# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
# if matches:
# end_index = matches[-1].start()
# cleaned_text = cleaned_text[:end_index]
# break
#
# all_pages_text.append(cleaned_text)
#
# # 合并所有页面的文本
# full_text = "\n".join(all_pages_text)
# # 关闭文档
# doc.close()
#
# return full_text
def compare_headings(current, new): def compare_headings(current, new):
# 使用过滤来确保只处理非空且为数字的部分 # 使用过滤来确保只处理非空且为数字的部分
current_nums = [int(num) for num in current.split('.') if num.isdigit()] current_nums = [int(num) for num in current.split('.') if num.isdigit()]
@ -115,16 +33,6 @@ def handle_content_append(current_content, line_content, append_newline, keyword
current_content.append('\n') current_content.append('\n')
return append_newline return append_newline
"""
保存换行符的具体逻辑
对于二级标题 1.1如果其后的内容包含关键词或内容较短<=20字符会在内容前添加一个换行符
这个换行符会被保留在 current_content 列表中
当处理下一个标题时之前的内容包括可能存在的换行符会被合并并保存到 data 字典中
解决了''''这类标题出现在正文中的情况但是目前的逻辑是如果''已有了就加入正文否则''作为新的标题
"""
#提取json主函数
def parse_text_by_heading(text): def parse_text_by_heading(text):
keywords = ['包含', '以下'] keywords = ['包含', '以下']
data = {} data = {}
@ -369,145 +277,34 @@ def parse_text_by_heading(text):
return data return data
#type=2时提取货物标的第一章招标公告时采用该逻辑 def extract_text_from_pdf(file_path, start_word, end_pattern):
def parse_text_to_dict(text): # 从PDF文件中提取文本
""" common_header = extract_common_header(file_path)
解析文本根据大标题划分内容生成字典 pdf_document = PdfReader(file_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') #仅匹配第一页和最后一页不需要exclusion_pattern
all_pages_text = []
start_index = None
# 处理所有页面
for i, page in enumerate(pdf_document.pages):
page_text = page.extract_text() if page.extract_text() else ""
cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# 在第一页查找开始位置
if i == 0 and start_index is None:
start_match = re.search(start_word, cleaned_text, re.MULTILINE)
if start_match:
start_index = start_match.start()
cleaned_text = cleaned_text[start_index:]
参数: # 在最后一页查找结束位置
text (str): 要解析的文本 if i == len(pdf_document.pages) - 1:
matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
if matches:
end_index = matches[-1].start()
cleaned_text = cleaned_text[:end_index]
返回: all_pages_text.append(cleaned_text)
dict: 大标题作为键内容作为值的字典
"""
# 正则表达式模式:匹配以一至十的汉字数字开头,后跟顿号和任意字符,且位于行首
pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE)
# 使用 re.finditer 找到所有大标题的位置 # 合并所有页面的文本
matches = list(pattern.finditer(text)) full_text = "\n".join(all_pages_text)
return full_text
result = {}
for i, match in enumerate(matches):
title = match.group(1).strip() # 获取大标题文本
start = match.end() # 内容的起始位置
if i + 1 < len(matches):
end = matches[i + 1].start() # 下一个大标题的起始位置
else:
end = len(text) # 最后一个大标题,内容到文本末尾
content = text[start:end].strip() # 获取内容并去除前后空白
# 规范化换行符,并移除每行开头和结尾的空白
content = content.replace('\r\n', '\n') # 统一换行符
content = re.sub(r'[ \t]+\n', '\n', content) # 移除行尾空白
content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE) # 移除行首和行尾空白
content = clean_content(content) # 处理内容中的换行符
result[title] = content
return result
def clean_content(content):
    """
    Normalize the newlines inside a section body.

    A newline is kept only when it precedes a sub-item number (e.g. "1.1",
    "2、", "(5)") or a colon-terminated label; every other newline is removed
    without leaving extra spaces.

    Args:
        content (str): raw section text.

    Returns:
        str: text with only the meaningful newlines retained.
    """
    # Sub-item numbering forms: 1.1 / 2、 / .3 / 1) / (5) / 1.
    numbering_pattern = r'(?:\d+[.]\d+(?:[.]\d+)*|\d+、|[.]\d+|\d+[)]|\(\d+\)|\d+|\d+[.])'
    # A newline survives when followed by a numbering token or a label ending in ':'.
    keep_before = re.compile(r'\n(?=(?:' + numbering_pattern + r'|[^:\n\r\f\v]+[:]))')
    marker = "___PLACEHOLDER___"
    # Protect the newlines we want to keep, drop the rest, then restore.
    protected = keep_before.sub(marker, content)
    return protected.replace('\n', '').replace(marker, '\n')
# Returns "" when file_path does not exist.
def convert_clause_to_json(file_path, output_folder, type=1, suffix_counter="1.json"):
    """
    Parse a truncated bidding-document PDF and persist the result as JSON.

    type == 1 extracts the bidder-instructions body and parses it into
    numbered clauses (clause1.json); any other value extracts the first
    chapter / announcement and parses it into titled sections (clause2.json).

    Returns:
        str: path of the written JSON file, or "" when file_path is missing.
    """
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
        start_word = r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
        parse = parse_text_by_heading
    else:
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        parse = parse_text_to_dict
    raw_text = extract_text_from_pdf(file_path, start_word, end_pattern)
    result = parse(raw_text)
    # Create the output folder on first use.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
def process_folder(input_folder, output_folder):
    """Convert every '*part2.pdf' file in input_folder to clause JSON in output_folder."""
    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)
        # Only regular files whose name ends with 'part2.pdf' are processed.
        if not (os.path.isfile(file_path) and file_name.endswith('part2.pdf')):
            continue
        stem = os.path.splitext(file_name)[0]
        try:
            # The file stem doubles as the JSON suffix counter.
            output_path = convert_clause_to_json(file_path, output_folder, 1, stem)
            print(f"Processed file: {file_name}, JSON saved to: {output_path}")
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
# Manual smoke-test entry point: parse one truncated "bidder instructions"
# PDF into clause JSON.  Paths below are developer-machine samples.
if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
    file_path=r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\tmp'
    try:
        # type=1: parse the instructions body into numbered clauses.
        output_path = convert_clause_to_json(file_path,output_folder,1)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        print("Error:", e)
    # input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
    # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
    #
    # # Call process_folder to convert every file in the folder instead:
    # process_folder(input_folder, output_folder)

View File

@ -17,6 +17,8 @@ file_data = json.loads(file_stk.model_dump_json())
# 提取所有文件的 id # 提取所有文件的 id
file_ids = [file["id"] for file in file_data["data"]] file_ids = [file["id"] for file in file_data["data"]]
# num=len(file_ids)
# print(num)
# 循环删除每个文件 # 循环删除每个文件
for file_id in file_ids: for file_id in file_ids:
file_object = client.files.delete(file_id) file_object = client.files.delete(file_id)

View File

@ -130,7 +130,7 @@ def process_all_part1_pdfs(folder_path, output_folder):
extract_tables_main(file_path, subfolder_path) extract_tables_main(file_path, subfolder_path)
if __name__ == "__main__": if __name__ == "__main__":
path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_tobidders_notice_table.docx' path =r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_evaluation_method.pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" # 前附表json文件 output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" # 前附表json文件
res=extract_tables_main("", output_folder) res=extract_tables_main("", output_folder)
print(res) print(res)

View File

@ -9,7 +9,7 @@ from flask_app.general.投标人须知正文提取指定内容 import get_requir
from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.general.merge_pdfs import merge_pdfs from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values from flask_app.general.json_utils import transform_json_values
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
@ -54,9 +54,8 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
# 处理各个部分 # 处理各个部分
tobidders_notice_table=truncate_files[0] tobidders_notice_table=truncate_files[0]
truncate0_docpath = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx # tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx
# truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_folder) # 投标人须知前附表docx->json
truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json
tobidders_notice = truncate_files[1] #投标人须知正文 tobidders_notice = truncate_files[1] #投标人须知正文
@ -86,7 +85,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id):
'tobidders_notice': tobidders_notice, 'tobidders_notice': tobidders_notice,
'evaluation_method':evaluation_method, 'evaluation_method':evaluation_method,
'qualification': qualification, 'qualification': qualification,
'truncate0_jsonpath': truncate_jsonpath,
'merged_baseinfo_path':merged_baseinfo_path, 'merged_baseinfo_path':merged_baseinfo_path,
'merged_baseinfo_path_more':merged_baseinfo_path_more, 'merged_baseinfo_path_more':merged_baseinfo_path_more,
'clause_path': clause_path, 'clause_path': clause_path,
@ -112,14 +110,14 @@ def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo
# 形式、响应、资格评审 # 形式、响应、资格评审
def fetch_qualification_review(evaluation_method, qualification, output_folder, truncate0_jsonpath, clause_path, invalid_path, merged_baseinfo_path): def fetch_qualification_review(evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_path, merged_baseinfo_path):
logger.info("starting 资格审查...") logger.info("starting 资格审查...")
start_time = time.time() start_time = time.time()
if not evaluation_method: if not evaluation_method:
evaluation_method = invalid_path evaluation_method = invalid_path
if not merged_baseinfo_path: if not merged_baseinfo_path:
merged_baseinfo_path = invalid_path merged_baseinfo_path = invalid_path
review_standards_res = combine_review_standards(evaluation_method, qualification, output_folder, truncate0_jsonpath, clause_path, invalid_path, merged_baseinfo_path) review_standards_res = combine_review_standards(evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_path, merged_baseinfo_path)
end_time = time.time() end_time = time.time()
logger.info(f"资格审查 done耗时{end_time - start_time:.2f}") logger.info(f"资格审查 done耗时{end_time - start_time:.2f}")
return review_standards_res return review_standards_res
@ -143,10 +141,10 @@ def fetch_evaluation_standards(invalid_path, evaluation_method):
# 无效、废标项解析 # 无效、废标项解析
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, qualification): def fetch_invalid_requirements(invalid_docpath, output_folder, tobidders_notice_table, clause_path, qualification):
logger.info("starting 无效标与废标...") logger.info("starting 无效标与废标...")
start_time = time.time() start_time = time.time()
find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, qualification) find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, tobidders_notice_table, clause_path, qualification)
end_time = time.time() end_time = time.time()
logger.info(f"无效标与废标 done耗时{end_time - start_time:.2f}") logger.info(f"无效标与废标 done耗时{end_time - start_time:.2f}")
return find_invalid_res return find_invalid_res
@ -194,12 +192,12 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
processed_data['tobidders_notice'], processed_data['clause_path']), processed_data['tobidders_notice'], processed_data['clause_path']),
'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'], 'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'],
processed_data['qualification'], output_folder, processed_data['qualification'], output_folder,
processed_data['truncate0_jsonpath'], processed_data['tobidders_notice_table'],
processed_data['clause_path'], processed_data['invalid_path'], processed_data['clause_path'], processed_data['invalid_path'],
processed_data['merged_baseinfo_path']), processed_data['merged_baseinfo_path']),
'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['invalid_path'],processed_data['evaluation_method']), 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['invalid_path'],processed_data['evaluation_method']),
'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
output_folder, processed_data['truncate0_jsonpath'], output_folder, processed_data['tobidders_notice_table'],
processed_data['clause_path'], processed_data['qualification']), processed_data['clause_path'], processed_data['qualification']),
'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']), 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']),
'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path']) 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'])
@ -228,6 +226,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
logger.error(f"Error processing {key}: {exc}") logger.error(f"Error processing {key}: {exc}")
yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
#TODO:废标项,针对新文件作优化,统一成货物标的处理逻辑
if __name__ == "__main__": if __name__ == "__main__":
start_time = time.time() start_time = time.time()
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1" output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"

View File

@ -7,7 +7,7 @@ import time
from flask_app.general.多线程提问 import multi_threading from flask_app.general.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.general.json_utils import clean_json_string from flask_app.general.json_utils import clean_json_string
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.提取json工程标版 import convert_clause_to_json
from flask_app.general.通义千问long import upload_file from flask_app.general.通义千问long import upload_file
from flask_app.general.merge_pdfs import merge_pdfs from flask_app.general.merge_pdfs import merge_pdfs
prompt = """ prompt = """
@ -312,7 +312,7 @@ def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, claus
formatted_questions = formatted_questions1 + formatted_questions2 formatted_questions = formatted_questions1 + formatted_questions2
if formatted_questions: if formatted_questions:
output_path = fetch_specific_pdf(output_folder) output_path = fetch_specific_pdf(output_folder) #合并merged_info
if output_path: if output_path:
file_id = upload_file(output_path) file_id = upload_file(output_path)
results = multi_threading(formatted_questions, "", file_id, 2) results = multi_threading(formatted_questions, "", file_id, 2)

View File

@ -202,7 +202,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
text = page.extract_text() text = page.extract_text()
if text: if text:
cleaned_text = clean_page_content(text, common_header) cleaned_text = clean_page_content(text, common_header)
if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" # if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成"
if re.search(exclusion_pattern, cleaned_text):
continue continue
if re.search(begin_pattern, cleaned_text) and i >= begin_page: if re.search(begin_pattern, cleaned_text) and i >= begin_page:
if start_page and (output_suffix == "notice" or output_suffix == "invalid"): if start_page and (output_suffix == "notice" or output_suffix == "invalid"):
@ -233,8 +234,9 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
try: try:
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
if output_suffix == "qualification": if output_suffix == "qualification":
print("twice:qualificaiton!")
# 动态设置 include_keys # 动态设置 include_keys
include_keys = ["资格", "资质", "能力", "信誉"] include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查","能力","信誉"]
# 定义起始匹配模式,仅匹配“附录”、“附件”或“附表” # 定义起始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])' begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])'
# 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻) # 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻)
@ -586,18 +588,18 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
# 投标人须知前附表改为货物标一样的 # 投标人须知前附表改为货物标一样的
if __name__ == "__main__": if __name__ == "__main__":
start_time = time.time() start_time = time.time()
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹" input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp" output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
# files=truncate_pdf_multiple(input_path,output_folder) # files=truncate_pdf_multiple(input_path,output_folder)
# selections = [4, 1] # 仅处理 selection 4、1 # selections = [4, 1] # 仅处理 selection 4、1
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections) # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
# print(files) # print(files)
selection = 2 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 selection = 3 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
generated_files = truncate_pdf_main(input_path, output_folder, selection) generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files) print(generated_files)
# print("生成的文件:", generated_files) # print("生成的文件:", generated_files)
end_time = time.time() end_time = time.time()
print("耗时:" + str(end_time - start_time)) print("耗时:" + str(end_time - start_time))

View File

@ -1,6 +1,7 @@
import json import json
import re import re
from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt,extract_sections from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt, \
extract_sections, concatenate_keys_values
# 对于每个target_value元素如果有完美匹配json_data中的键那就加入这个完美匹配的键名否则把全部模糊匹配到的键名都加入 # 对于每个target_value元素如果有完美匹配json_data中的键那就加入这个完美匹配的键名否则把全部模糊匹配到的键名都加入
@ -17,40 +18,7 @@ def find_keys_with_prefix(key_prefix, json_data):
subheadings = [k for k in json_data if k.startswith(key_prefix)] subheadings = [k for k in json_data if k.startswith(key_prefix)]
return subheadings return subheadings
#将 top_level_key 的值设为 target_value。
# 从完整的json文件中读取所需数据eg:投标、评标
# def extract_json(data, target_values):
# results = {}
#
# # 遍历所有目标值
# for target_value in target_values:
# # 找到所有与目标值匹配的键
# matched_keys = find_keys_by_value(target_value, data)
#
# for key in matched_keys:
# # 查找所有以该键为前缀的子键,限制只提取直接子项
# key_and_subheadings = find_keys_with_prefix(key, data)
#
# for subkey in key_and_subheadings:
# # 如果子键有多级结构(比如 '7.2.1'),并且是直接子项
# if "." in subkey:
# parent_key = subkey.rsplit('.', 1)[0]
# top_level_key = parent_key.split('.')[0] + '.'
#
# # 确保顶级键不会重复添加
# if top_level_key not in results:
# results[top_level_key] = data[top_level_key]
#
# # 添加或更新父级键
# if parent_key not in results:
# if parent_key in data:
# results[parent_key] = data[parent_key]
#
# # 添加当前子键和它的值
# if subkey in data:
# results[subkey] = data[subkey]
#
# return results
def extract_json(data, target_values): def extract_json(data, target_values):
results = {} results = {}
for target_value in target_values: for target_value in target_values:
@ -72,6 +40,39 @@ def extract_json(data, target_values):
results[subkey] = data[subkey] results[subkey] = data[subkey]
return results return results
def extract_between_sections(data, target_values):
    """
    Collect the key/value pairs that follow a matching chapter heading.

    Keys consisting solely of Chinese numerals ("一", "二", ...) are treated
    as chapter headings. When a heading's value contains any of the
    target_values keywords, every subsequent pair up to the next heading is
    grouped under that heading's value.

    Args:
        data (dict): ordered mapping of clause keys to text.
        target_values (iterable): keywords that select chapters of interest.

    Returns:
        dict: {chapter title: {key: value, ...}} for each matched chapter.
    """
    heading_re = re.compile(r'^[一二三四五六七八九十]+$')
    results = {}
    capturing = False
    title = ""
    block = {}
    for key, value in data.items():
        if heading_re.match(key):
            # A new chapter begins: flush any block we were collecting.
            if capturing:
                if block:
                    results[title] = block
                block = {}
                capturing = False
            # Start capturing when the heading text mentions a target keyword.
            if any(keyword in value for keyword in target_values):
                capturing = True
                title = value
        elif capturing:
            block[key] = value
    # Flush the final chapter, if any.
    if block:
        results[title] = block
    return results
def sort_clean_data_keys(data): def sort_clean_data_keys(data):
# 预处理:删除键名中的空格 # 预处理:删除键名中的空格
def preprocess_key(key): def preprocess_key(key):
@ -114,13 +115,20 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
"Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'") "Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'")
with open(clause_path, 'r', encoding='utf-8') as file: with open(clause_path, 'r', encoding='utf-8') as file:
data = json.load(file) data = json.load(file)
extracted_data = extract_json(data, target_values) # 读取json extracted_data = extract_between_sections(data, target_values)
if not extracted_data:
extracted_data = extract_json(data, target_values) # 提取需要的数据
if not extracted_data: if not extracted_data:
final_result = get_requirements_with_gpt(merged_baseinfo_path, type) # 万一没用正则匹配到,那就调用大模型 final_result = get_requirements_with_gpt(merged_baseinfo_path, type) # 万一没用正则匹配到,那就调用大模型
return final_result return final_result
# print(json.dumps(extracted_data,ensure_ascii=False,indent=4)) final_result=extract_sections(extracted_data,target_values) #后处理,生成键名
final_result=extract_sections(extracted_data,target_values)
return final_result return final_result
else:
extracted_data_concatenated = {
section: concatenate_keys_values(content)
for section, content in extracted_data.items()
}
return extracted_data_concatenated
# print(json.dumps(res, ensure_ascii=False, indent=4)) # print(json.dumps(res, ensure_ascii=False, indent=4))
# sorted_data = sort_clean_data_keys(extracted_data) # 对输入的字典 data 的键进行预处理和排序 # sorted_data = sort_clean_data_keys(extracted_data) # 对输入的字典 data 的键进行预处理和排序
# transformed_data = transform_json(sorted_data) # transformed_data = transform_json(sorted_data)
@ -130,10 +138,10 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
if __name__ == "__main__": if __name__ == "__main__":
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json' # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\4e5bc6c2-c2b8-4c0b-8e57-81a498b982f6\\ztbfile_tobidders_notice.pdf" merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\merged_baseinfo_path_more.pdf"
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\4e5bc6c2-c2b8-4c0b-8e57-81a498b982f6\\clause1.json" clause_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\tmp\clause1.json"
try: try:
res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景 res = extract_from_notice(merged_baseinfo_path,clause_path, 3) # 可以改变此处的 type 参数测试不同的场景
res2 = json.dumps(res, ensure_ascii=False, indent=4) res2 = json.dumps(res, ensure_ascii=False, indent=4)
print(res2) print(res2)
except ValueError as e: except ValueError as e:

View File

@ -1,287 +0,0 @@
import json
import docx
import re
import os
from PyPDF2 import PdfReader
from flask_app.main.截取pdf import clean_page_content,extract_common_header
def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a .docx file, joined by newlines."""
    doc = docx.Document(file_path)
    return '\n'.join([para.text for para in doc.paragraphs])
# def extract_text_from_pdf(file_path):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# pdf_document = PdfReader(file_path)
# text = ""
# # 遍历每一页
# for page in pdf_document.pages:
# # 提取当前页面的文本
# page_text = page.extract_text() if page.extract_text() else ""
# # 清洗页面文本
# page_text = clean_page_content(page_text, common_header)
# # 将清洗后的文本添加到总文本中
# text += page_text+"\n"
# return text
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """
    Extract PDF text between a start pattern and an end pattern.

    ``start_word`` may be a single regex or a list of alternatives; the first
    alternative that matches on page 1 fixes the starting offset.
    ``end_pattern`` is searched on the last page and its final match
    truncates the text. The header shared across pages is stripped first.

    Returns:
        str: cleaned per-page texts joined with newlines.
    """
    common_header = extract_common_header(file_path)
    reader = PdfReader(file_path)
    page_count = len(reader.pages)
    start_patterns = start_word if isinstance(start_word, list) else [start_word]
    pages_text = []
    start_found = False
    for index, page in enumerate(reader.pages):
        raw = page.extract_text() or ""
        cleaned = clean_page_content(raw, common_header)
        # On the first page, drop everything before the first matching start pattern.
        if index == 0 and not start_found:
            for pattern in start_patterns:
                hit = re.search(pattern, cleaned)
                if hit:
                    start_found = True
                    cleaned = cleaned[hit.start():]
                    break  # stop at the first alternative that matches
        # On the last page, cut at the final occurrence of the end pattern.
        if index == page_count - 1:
            tail_hits = list(re.finditer(end_pattern, cleaned))
            if tail_hits:
                cleaned = cleaned[:tail_hits[-1].start()]
        pages_text.append(cleaned)
    return "\n".join(pages_text)
def extract_section(text, start_pattern, end_phrases):
    """
    Return the slice of ``text`` after ``start_pattern`` and before the first
    matching end phrase.

    The end phrases are tried in list order; the first phrase that matches
    (MULTILINE) marks the cut-off point. Returns "" when ``start_pattern`` is
    absent, and everything after the start when no end phrase matches.
    """
    head = re.search(start_pattern, text)
    if head is None:
        return ""  # no section start found
    tail = text[head.end():]
    for phrase in end_phrases:
        stop = re.search(phrase, tail, flags=re.MULTILINE)
        if stop is not None:
            # First phrase (by list order) that matches wins.
            return tail[:stop.start()]
    return tail
def compare_headings(current, new):
    """
    Decide whether heading number ``new`` comes after heading ``current``.

    Both are dotted numeric strings like "3.1.2"; non-numeric segments are
    ignored. Returns True when ``new`` is larger at the first differing
    level, or when it only adds deeper levels to an equal prefix (a new
    sub-section); False otherwise.
    """
    old_parts = [int(part) for part in current.split('.') if part.isdigit()]
    new_parts = [int(part) for part in new.split('.') if part.isdigit()]
    for old_val, new_val in zip(old_parts, new_parts):
        if new_val != old_val:
            return new_val > old_val
    # Equal common prefix: a longer new heading means a deeper sub-section.
    return len(new_parts) > len(old_parts)
def should_add_newline(content, keywords, max_length=20):
    """Return True when the joined content holds a keyword or is at most max_length characters."""
    text = ''.join(content).strip()
    if len(text) <= max_length:
        return True
    return any(keyword in text for keyword in keywords)
def handle_content_append(current_content, line_content, append_newline, keywords):
    """
    Append one line to the accumulated content, optionally inserting a
    pending newline first.

    The newline is only inserted when should_add_newline approves (keyword
    present or content still short); the pending flag is cleared either way.

    Returns:
        bool: the updated append_newline flag (False once consumed).
    """
    if append_newline:
        if should_add_newline(current_content, keywords):
            current_content.append('\n')  # keep one deliberate line break
        append_newline = False
    current_content.append(line_content)
    return append_newline
#对二级标题如x.x进行额外处理如果当前处理内容包含keywords中的内容则必须保留换行符/如果当前内容字数大于20不保留换行。
def parse_text_by_heading(text):
    """Split clause text into a dict mapping heading numbers to their content.

    Recognized headings: dotted numbers ('1.1', '2.2.3'), single-level
    numbers ('1.') and bare numbers ('27', normalized to '27.').  Lines that
    are not headings are appended to the entry currently being built; at most
    one newline may be preserved right after a one- or two-level heading
    (see handle_content_append), all other line breaks are dropped.
    """
    # Keywords that force the pending newline of an entry to be kept even
    # when the accumulated content exceeds the length threshold.
    keywords = ['包含', '以下']
    data = {}
    current_key = None      # heading number currently being accumulated
    current_content = []    # content pieces collected for current_key
    append_newline = False  # True while one newline may still be inserted
    lines = text.split('\n')

    def check_year_pattern(line):
        # True for a line that begins with a 4-digit year, so it is treated
        # as continuation of the previous entry instead of a new heading.
        # NOTE(review): `line_no_spaces[4:5] == ''` only holds when the line
        # is exactly 4 characters long; the right-hand literal looks like a
        # CJK character (probably '年') dropped in an encoding round-trip —
        # confirm against the upstream source.
        line_no_spaces = line.replace(' ', '')
        return re.match(r'^\d{4}', line_no_spaces) and line_no_spaces[4:5] == ''

    for i, line in enumerate(lines):  # processed line by line, so stored pieces carry no '\n'
        # NOTE(review): replace('', '.') inserts '.' between every character;
        # the first argument was presumably a full-width punctuation mark
        # (e.g. '。') lost during text extraction — verify before relying on it.
        line_stripped = line.strip().replace('', '.')
        # A year-prefixed line belongs to the current entry, not a new heading.
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
            continue
        # Dotted heading such as '1.1' or '2.2.3', with no letter/paren before the digits.
        match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            # Single-level heading such as '1.'
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
            if not match:
                # Bare numeric heading such as '27'
                match = re.match(r'^(\d+)([^.\d].*)', line_stripped)
        if match:
            new_key, line_content = match.groups()
            # Normalize bare numbers to the dotted form ('27' -> '27.').
            if not new_key.endswith('.'):
                new_key += '.'
                line_content = line_content.lstrip('.')
            # Start a new entry only when the new heading is later/deeper than
            # the current one and the previous content does not look like an
            # unfinished sentence.
            # NOTE(review): `current_content[-1][-1] != ''` is always True for
            # a non-empty string (and raises IndexError on an empty piece);
            # the literal was probably a specific punctuation character lost
            # in transit — confirm.
            if current_key is None or (compare_headings(current_key, new_key) and (
                    len(current_content) == 0 or current_content[-1][-1] != '')):  # e.g. content spanning two lines
                if current_key is not None:
                    # Flush the previous entry: join pieces and drop spaces.
                    content_string = ''.join(current_content).strip()
                    data[current_key] = content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                # Allow one preserved newline only for one/two-level headings.
                # append_newline = len(new_key.split('.')) == 2
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        else:
            if line_stripped:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
    if current_key is not None:
        # Flush the final entry.
        content_string = ''.join(current_content).strip()
        data[current_key] = content_string.replace(' ', '')
    return data
def convert_to_json(file_path, start_word, end_phrases):
    """Read a .docx or .pdf file and parse its clause text into a dict.

    For PDFs the text is pre-trimmed between *start_word* and *end_phrases*
    by extract_text_from_pdf; DOCX files are read whole.  Raises ValueError
    for any other extension.
    """
    if file_path.endswith('.pdf'):
        raw_text = extract_text_from_pdf(file_path, start_word, end_phrases)
    elif file_path.endswith('.docx'):
        raw_text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file format")
    # Turn the flat text into a {heading-number: content} mapping.
    return parse_text_by_heading(raw_text)
def convert_clause_to_json(input_path, output_folder, type=1):
    """Extract a clause section from a bidding document and dump it as JSON.

    ``type == 1`` targets the bidder-instructions body (clause1.json); any
    other value targets the announcement/invitation chapter (clause2.json).
    Returns the path of the written JSON file, or "" when *input_path* does
    not exist.
    """
    if not os.path.exists(input_path):
        return ""
    is_notice_body = (type == 1)
    if is_notice_body:
        # Start at the '说明/总则' preamble or the bidder-instructions body;
        # stop at the next chapter heading or any appendix/attachment marker.
        start_word = [
            re.compile(
                r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
                r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)',
                re.MULTILINE
            )
        ]
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'^评标办法前附表|'
            r'^附录(?:一)?[:]|'
            r'^附件(?:一)?[:]|'
            r'^附表(?:一)?[:]',
            re.MULTILINE
        )
    else:
        # Start at the tender announcement / invitation chapter; stop at the
        # bidder-instructions chapter or its front table.
        start_word = [
            re.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
                re.MULTILINE
            ),
            re.compile(
                r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$',
                re.MULTILINE
            )
        ]
        end_pattern = re.compile(
            r'第[一二三四五六七八九十]+章\s*投标人须知|'
            r'投标人须知前附表',
            re.MULTILINE
        )
    # Trim the irrelevant parts of the document and parse by heading.
    parsed = convert_to_json(input_path, start_word, end_pattern)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    output_name = "clause1.json" if is_notice_body else "clause2.json"
    output_path = os.path.join(output_folder, output_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed, f, indent=4, ensure_ascii=False)
    return output_path
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
# # 读取 JSON 文件
# with open(json_file_path, 'r', encoding='utf-8') as file:
# data = json.load(file)
# processed_data = {}
# for key, value in data.items():
# # 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
# if re.match(r'^\d+\.\s*$', key) and '\n' in value:
# # 分割标题和正文
# title, content = value.split('\n', 1)
#
# # 添加原来的标题作为 '5.0',其值为原来标题的内容(即 title
# processed_data[key] = title.strip()
# sub_key = f"{key.rstrip('.')}." + "0" # 自动生成 '5.0',与 '5.' 一致,保证点号的存在
#
# processed_data[sub_key] = title.strip()
#
# # 初始化计数器
# sub_count = 1
#
# # 根据子序号 '1.' 或 '1、' 进行分割
# sub_sections = re.split(r'(\d+[\.\、])\s*', content)
#
# current_sub_content = ""
# for i in range(1, len(sub_sections), 2):
# sub_number = sub_sections[i].strip() # 获取子序号
# sub_content = sub_sections[i + 1].strip() # 获取内容
#
# # 生成三级标题,如 '5.0.1', '5.0.2'
# sub_key_with_number = f"{sub_key}.{sub_count}"
# processed_data[sub_key_with_number] = sub_content
# sub_count += 1
#
# else:
# # 如果没有分割需求,保留原数据
# processed_data[key] = value
# # 将修改后的数据重新写入到原来的 JSON 文件中
# with open(json_file_path, 'w', encoding='utf-8') as file:
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
# Manual smoke test: extract clauses from a local sample PDF and report the output path.
if __name__ == "__main__":
    file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
    # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件一中多媒体报告厅教学设备_tobidders_notice_part1.pdf'
    # start_word = "投标人须知正文"
    # end_phrases = [
    # r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
    # r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
    # ]
    output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
    try:
        # Uses the default type=1 (bidder-instructions body -> clause1.json).
        output_path = convert_clause_to_json(file_path,output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        # Raised for unsupported (non-pdf/docx) input files.
        print("Error:", e)

View File

@ -0,0 +1,100 @@
import json
import os
from flask_app.general.投标人须知正文条款提取成json文件 import parse_text_by_heading, extract_text_from_pdf
def convert_clause_to_json(file_path, output_folder, type=1):
    """Extract a clause section from a bidding PDF and write it to JSON.

    ``type == 1`` extracts the bidder-instructions body into clause1.json;
    any other value extracts the announcement/invitation chapter into
    clause2.json.  Returns the output JSON path, or "" when *file_path*
    does not exist.  Only PDF input is supported here.
    """
    if not os.path.exists(file_path):
        return ""
    if type == 1:
        # Start at the '说明/总则' preamble or the bidder-instructions body;
        # stop at the next chapter heading or any appendix/attachment marker.
        # NOTE(review): these uncompiled patterns rely on '^'/'$' line anchors,
        # so they only work as intended if extract_text_from_pdf applies
        # re.MULTILINE when searching — confirm against that helper.
        start_word = (
            r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        end_pattern = (
            r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'评标办法前附表|'
            r'附录(?:一)?[:]|'
            r'附件(?:一)?[:]|'
            r'附表(?:一)?[:])$'
        )
    else:
        # Start at the tender announcement / invitation chapter; stop at the
        # bidder-instructions chapter or its front table.
        start_word = (
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请)'
            r'|^第一卷'
            r'|^投标邀请书'
            r'|^投标邀请函'
            r'|^投标邀请'
            r'|.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$'
        )
        end_pattern = (
            r'^(?:第[一二三四五六七八九十]+章\s*投标人须知|投标人须知前附表)$'
        )
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
    else:
        raise ValueError("Unsupported file format")
    # Turn the trimmed text into a {heading-number: content} mapping.
    parsed_data = parse_text_by_heading(text)
    # Create the output folder on first use.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
    return output_path
# def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内容 zbtest20
# # 读取 JSON 文件
# with open(json_file_path, 'r', encoding='utf-8') as file:
# data = json.load(file)
# processed_data = {}
# for key, value in data.items():
# # 检查是否是一级标题(如 '5.'),并且其值包含 '\n'
# if re.match(r'^\d+\.\s*$', key) and '\n' in value:
# # 分割标题和正文
# title, content = value.split('\n', 1)
#
# # 添加原来的标题作为 '5.0',其值为原来标题的内容(即 title
# processed_data[key] = title.strip()
# sub_key = f"{key.rstrip('.')}." + "0" # 自动生成 '5.0',与 '5.' 一致,保证点号的存在
#
# processed_data[sub_key] = title.strip()
#
# # 初始化计数器
# sub_count = 1
#
# # 根据子序号 '1.' 或 '1、' 进行分割
# sub_sections = re.split(r'(\d+[\.\、])\s*', content)
#
# current_sub_content = ""
# for i in range(1, len(sub_sections), 2):
# sub_number = sub_sections[i].strip() # 获取子序号
# sub_content = sub_sections[i + 1].strip() # 获取内容
#
# # 生成三级标题,如 '5.0.1', '5.0.2'
# sub_key_with_number = f"{sub_key}.{sub_count}"
# processed_data[sub_key_with_number] = sub_content
# sub_count += 1
#
# else:
# # 如果没有分割需求,保留原数据
# processed_data[key] = value
# # 将修改后的数据重新写入到原来的 JSON 文件中
# with open(json_file_path, 'w', encoding='utf-8') as file:
# json.dump(processed_data, file, ensure_ascii=False, indent=4)
# Manual smoke test for the PDF-only clause extraction (default type=1).
if __name__ == "__main__":
    file_path='C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\ztbfile_tobidders_notice.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\dafa8e6f-319e-4547-abbd-65375bf82004\\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        # Raised for unsupported (non-pdf) input files.
        print("Error:", e)

View File

@ -3,8 +3,12 @@ import json
import os.path import os.path
import time import time
import re import re
from flask_app.general.format_change import pdf2docx
from flask_app.general.通义千问long import upload_file, qianwen_long,qianwen_long_text from flask_app.general.通义千问long import upload_file, qianwen_long,qianwen_long_text
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
@ -365,7 +369,9 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
print(f"handle_query 在处理 {result_key} 时发生异常: {e}") print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
return {result_key: ""} return {result_key: ""}
def combine_find_invalid(invalid_docpath, output_dir, truncate_json_path, clause_path, qualification): def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification):
tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx
truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir) # 投标人须知前附表docx->json
queries = [ queries = [
( (
r'\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', r'\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
@ -392,7 +398,7 @@ def combine_find_invalid(invalid_docpath, output_dir, truncate_json_path, clause
output_file, output_file,
result_key, result_key,
keywords, keywords,
truncate_json_path truncate_jsonpath
) )
futures.append((future, result_key)) futures.append((future, result_key))
time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 time.sleep(0.5) # 暂停0.5秒后再提交下一个任务
@ -408,7 +414,7 @@ def combine_find_invalid(invalid_docpath, output_dir, truncate_json_path, clause
# 禁止投标find_forbidden部分 # 禁止投标find_forbidden部分
try: try:
# print("starting不得存在的情形...") # print("starting不得存在的情形...")
forbidden_res = find_forbidden(truncate_json_path, clause_path, qualification) forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification)
except Exception as e: except Exception as e:
print(f"find_forbidden 处理时出错: {e}") print(f"find_forbidden 处理时出错: {e}")
forbidden_res = {'不得存在的其他情形': ""} forbidden_res = {'不得存在的其他情形': ""}
@ -422,13 +428,13 @@ def combine_find_invalid(invalid_docpath, output_dir, truncate_json_path, clause
if __name__ == '__main__': if __name__ == '__main__':
start_time = time.time() start_time = time.time()
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" tobidders_notice_table=""
clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf"
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout"
# doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx'
results = combine_find_invalid(invalid_docpath, output_dir,truncate_json_path,clause_path,qualification) results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification)
end_time = time.time() end_time = time.time()
print("Elapsed time:", str(end_time - start_time)) print("Elapsed time:", str(end_time - start_time))
print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) print("Results:", json.dumps(results,ensure_ascii=False,indent=4))

View File

@ -5,7 +5,7 @@ import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values from flask_app.general.json_utils import transform_json_values
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice

View File

@ -1,42 +1,151 @@
import json import json
import os.path
import time import time
from flask_app.general.json_utils import extract_content_from_json from flask_app.general.format_change import pdf2docx
from flask_app.general.json_utils import extract_content_from_json, clean_json_string
from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.main.形式响应评审 import process_reviews from flask_app.main.形式响应评审 import process_reviews
from flask_app.main.资格评审 import process_qualification from flask_app.main.资格评审 import process_qualification
from flask_app.general.通义千问long import upload_file, qianwen_long from flask_app.general.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from flask_app.货物标.资格审查main import combine_qualification_review
from flask_app.general.merge_pdfs import merge_pdfs
def process_notice(notice_path):
print("call notice_path")
try:
# 上传通知文件并获取文件ID
file_id1 = upload_file(notice_path)
# 定义用户查询,提取申请人资格要求
user_query1 = """
第一章招标公告投标邀请书中说明的申请人资格要求是怎样的请以json格式给出回答外键为'申请人资格要求'键值为字符串列表其中每个字符串对应原文中的一条要求你的回答与原文内容一致不要擅自总结删减输出格式示例如下
{
"申请人资格要求":[
"1.满足《中华人民共和国政府采购法》第二十二条规定;",
"1.1 法人或者其他组织的营业执照等证明文件,如供应商是自然人的提供身份证明材料;",
"2.未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、重大税收违法案件当事人名单;"
]
}
"""
# 执行查询并清洗结果
res1 = clean_json_string(qianwen_long(file_id1, user_query1))
# 提取申请人资格要求
requirements = res1.get("申请人资格要求", "未找到相关内容")
return {"申请人资格要求": requirements}
except Exception as e:
print(f"处理申请人资格要求时出错: {e}")
return {"申请人资格要求": "处理失败"}
def combine_review_standards(evaluation_method,qualification,output_folder,truncate0_jsonpath,clause_path,invalid_path,merged_baseinfo_path): #评标办法前附表 def combine_review_standards(evaluation_method, qualification_path, output_folder, tobidders_notice_table, clause_path,
# 形式评审、响应评审:千问 invalid_path, merged_baseinfo_path):
"""
结合评审标准包括形式评审响应评审资格评审及申请人资格要求
参数:
evaluation_method (str): 评标办法文件路径
qualification (str): 资格文件路径
output_folder (str): 输出文件夹路径
tobidders_notice_table (str): JSON截断路径
clause_path (str): 条款路径
invalid_path (str): 无效文件路径
merged_baseinfo_path (str): 合并基础信息路径
notice_path (str): 通知文件路径
返回:
dict: 包含资格审查和申请人资格要求的合并结果
"""
# 上传评标办法前附表并获取文件ID
file_id = upload_file(evaluation_method) # 评标办法前附表 file_id = upload_file(evaluation_method) # 评标办法前附表
user_query_1 = "根据该文档中的评标办法前附表请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准请以json格式返回外层键名为'形式评审标准''响应性评审标准''资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
first_query="""
该文档中是否说明了符合性审查标准说明了就回答''否则回答''请以json格式给我返回结果键名分别是'符合性审查'键值仅限于''''注意它与形式响应性评审是对立的也就是说只要文档中描述了形式响应性评审那么符合性审查的键值一定是''以下为输出示例
{
"符合性审查":""
}
"""
first_res=clean_json_string(qianwen_long(file_id,first_query))
if first_res.get("符合性审查") == "":
print("new 资格审查")
paths=[qualification_path,evaluation_method]
output_path=os.path.join(output_folder,"merged_qualification.pdf")
merge_pdfs(paths,output_path)
final_result=combine_qualification_review(invalid_path,output_path,merged_baseinfo_path)
else:
tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx
truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_folder) # 投标人须知前附表docx->json
# 定义用户查询,提取形式评审标准、响应性评审标准和资格评审标准
user_query_1 = """
根据该文档中的评标办法前附表请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准请以json格式返回外层键名为'形式评审标准''响应性评审标准''资格评审标准',嵌套键名为'评审因素'中的内容相应的键值为对应'评审标准'中的内容
"""
# 执行查询并提取内容
results = qianwen_long(file_id, user_query_1) results = qianwen_long(file_id, user_query_1)
original_dict_data = extract_content_from_json(results) original_dict_data = extract_content_from_json(results)
qualification_review = original_dict_data.pop('资格评审标准', {}) #qianwen-long有关资格评审的内容
with ThreadPoolExecutor() as executor: # 提取资格评审标准
# 创建Future对象 qualification_review = original_dict_data.pop('资格评审标准', {}) # qianwen_long有关资格评审的内容
future_qualification = executor.submit(process_qualification, qualification_review, qualification,invalid_path,merged_baseinfo_path)
future_form_response = executor.submit(process_reviews, original_dict_data,output_folder, truncate0_jsonpath, # 初始化 ThreadPoolExecutor设定最多三个线程以处理三个任务
clause_path) with ThreadPoolExecutor(max_workers=3) as executor:
# 等待执行结果 # 提交任务并建立任务名到Future的映射
final_qualify_json = future_qualification.result() futures = {
form_response_dict = future_form_response.result() "资格审查": executor.submit(
form_response_dict.update(final_qualify_json) process_qualification,
return {"资格审查":form_response_dict} qualification_review,
qualification_path,
invalid_path,
merged_baseinfo_path
),
"形式及响应性审查": executor.submit(
process_reviews,
original_dict_data,
output_folder,
truncate_jsonpath,
clause_path
),
"申请人资格要求": executor.submit(
process_notice,
merged_baseinfo_path
)
}
# 定义所需的顺序
desired_order = ["申请人资格要求", "资格审查", "形式及响应性审查"]
# 初始化结果字典
combined_results = {}
# 按指定顺序收集结果
for key in desired_order:
future = futures.get(key)
if future:
try:
result = future.result()
if isinstance(result, dict):
combined_results.update(result)
else:
combined_results[key] = result
except Exception as e:
print(f"处理 '{key}' 时出错: {e}")
combined_results[key] = "处理失败"
else:
combined_results[key] = "未提交任务"
# 将各部分结果合并到最终的资格审查字典中
final_result = {"资格审查": combined_results}
return final_result
if __name__ == "__main__": if __name__ == "__main__":
start_time = time.time() start_time = time.time()
evaluation_method = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_evaluation_method.pdf" evaluation_method = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_evaluation_method.pdf"
qualification="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" qualification_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_qualification.pdf"
output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21" output_folder = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp"
# knowledge_name="zbtest20" # knowledge_name="zbtest20"
clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\clause1.json" clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\clause1.json"
truncate0_jsonpath = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\truncate_output.json" tobidders_notice_table = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_tobidders_notice_table.pdf"
invalid_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf"
merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" invalid_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_invalid.pdf"
res=combine_review_standards(evaluation_method,qualification, output_folder,truncate0_jsonpath,clause_path,invalid_path,merged_baseinfo_path) merged_baseinfo_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_merged_baseinfo.pdf"
res = combine_review_standards(evaluation_method, qualification_path, output_folder, tobidders_notice_table, clause_path,
invalid_path, merged_baseinfo_path)
print(json.dumps(res, ensure_ascii=False, indent=4)) print(json.dumps(res, ensure_ascii=False, indent=4))
end_time = time.time() end_time = time.time()
print("elapsed time:" + str(end_time - start_time)) print("elapsed time:" + str(end_time - start_time))

View File

@ -123,17 +123,17 @@ def get_all_dict(invalid_path, ques=None):
return {'资格评审': qualification_combined_res} return {'资格评审': qualification_combined_res}
def process_qualification(qualification_review, qualification, invalid_path, merged_baseinfo_path): def process_qualification(qualification_review, qualification_path, invalid_path, merged_baseinfo_path):
# 资格评审 # 资格评审
matching_keys_list, non_matching_dict = extract_matching_keys_qual( matching_keys_list, non_matching_dict = extract_matching_keys_qual(
qualification_review) # matching_keys_list:['资质条件', '财务状况'] non_matching_dict:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} qualification_review) # matching_keys_list:['资质条件', '财务状况'] non_matching_dict:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'}
if not matching_keys_list: if not matching_keys_list:
if not non_matching_dict: # 古法提取 non_matching_dict和matching_keys_list都为空 if not non_matching_dict: # 古法提取 non_matching_dict和matching_keys_list都为空
if qualification != "": # 提取到资格审查附件的情况 if qualification_path != "": # 提取到资格审查附件的情况
print("资格评审: type1") print("资格评审: type1")
matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"] matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"]
ques = generate_qual_question(matching_keys_list) ques = generate_qual_question(matching_keys_list)
file_id2 = upload_file(qualification) file_id2 = upload_file(qualification_path)
results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表调用qianwen-long results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表调用qianwen-long
res_list = [clean_json_string(res) for _, res in results2] if results2 else [] res_list = [clean_json_string(res) for _, res in results2] if results2 else []
if res_list: if res_list:
@ -161,7 +161,7 @@ def process_qualification(qualification_review, qualification, invalid_path, mer
else: else:
return new_non_matching_json or {"资格评审": ""} return new_non_matching_json or {"资格评审": ""}
elif matching_keys_list and qualification == "": # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' elif matching_keys_list and qualification_path == "": # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
# print("资格评审: type4") # print("资格评审: type4")
# target=["资质","业绩","财务","信誉","人员","项目经理","负责人","联合体"] # target=["资质","业绩","财务","信誉","人员","项目经理","负责人","联合体"]
# question_template="该招标文件中{key}的内容是怎样的请你以json格式返回结果键名为{key},若存在嵌套内容,嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致,不要擅自总结、删减。" # question_template="该招标文件中{key}的内容是怎样的请你以json格式返回结果键名为{key},若存在嵌套内容,嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致,不要擅自总结、删减。"
@ -190,7 +190,7 @@ def process_qualification(qualification_review, qualification, invalid_path, mer
else: # 大多数情况 else: # 大多数情况
print("资格评审: type5") print("资格评审: type5")
user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’ user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’
file_id2 = upload_file(qualification) file_id2 = upload_file(qualification_path)
results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表调用qianwen-long results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表调用qianwen-long
res_list = [] res_list = []
if not results2: if not results2:
@ -212,12 +212,12 @@ if __name__ == "__main__":
# qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年2017 年至今须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'} # qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年2017 年至今须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'}
qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证', qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证',
'资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'} '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'}
qualification= "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" qualification_path= "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf"
# output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" # output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test"
invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf"
merged_baseinfo_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" merged_baseinfo_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf"
# knowledge_name = "招标解析word13" # knowledge_name = "招标解析word13"
res = process_qualification(qualification_review, qualification, invalid_path, merged_baseinfo_path) res = process_qualification(qualification_review, qualification_path, invalid_path, merged_baseinfo_path)
print(json.dumps(res, ensure_ascii=False, indent=4)) print(json.dumps(res, ensure_ascii=False, indent=4))
# 该招标文件中资格评审关于财务状况的内容是怎样的请你以json格式返回结果外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。 # 该招标文件中资格评审关于财务状况的内容是怎样的请你以json格式返回结果外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。

View File

@ -0,0 +1,120 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from flask_app.general.通义千问long import upload_file, qianwen_long
from flask_app.general.json_utils import clean_json_string
def combine_qualification_new(invalid_path, qualification_path,notice_path):
    """Assemble the qualification-review ('资格审查') result dict via two LLM lookups.

    Lookup 1 extracts the qualification-review criteria, from
    *qualification_path* when that document declares them, otherwise from
    *invalid_path*.  Lookup 2 extracts the applicant-eligibility requirements
    from *notice_path* (or *invalid_path* when no notice is given).  Both run
    concurrently; results are returned under the top-level key '资格审查'.
    """
    detailed_res = {}
    # File id for invalid_path, uploaded lazily only if needed as a fallback.
    invalid_file_id = None
    if qualification_path:
        # Upload the qualification document and get a file id for querying.
        qualification_file_id = upload_file(qualification_path)
        # First probe: does the document spell out qualification-review criteria?
        # NOTE(review): the empty string literals ('','' and "" below) look like
        # CJK characters (probably '是'/'否') dropped in an encoding round-trip —
        # confirm against the upstream source.
        first_query = """
该文档中是否有关于资格性审查标准的具体内容?请以json格式给出回答,外键为'资格性审查',键值仅限于'','',输出格式示例如下:
{
"资格性审查":""
}
"""
        # Run the probe and clean the returned JSON string.
        print("call first_query")
        first_res = clean_json_string(qianwen_long(qualification_file_id, first_query))
        # Use the qualification document only when the probe answered positively.
        zige_file_id = qualification_file_id if first_res.get("资格性审查") == "" else None
        # Fall back to the 'invalid' document when criteria were not found.
        if zige_file_id is None:
            if invalid_file_id is None:
                invalid_file_id = upload_file(invalid_path)
            zige_file_id = invalid_file_id
    else:
        # No qualification document given: query the 'invalid' document directly.
        zige_file_id = upload_file(invalid_path)
    # Second query set: only the qualification-review extraction itself.
    second_query = [
        {
            "key": "资格性审查",
            "query": "该招标文件中规定的资格性审查标准是怎样的请以json格式给出外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性审查的内容。"
        }
    ]
    # Worker: run one query against *file_id* and return (key, extracted value).
    def process_second_query(key, query, file_id):
        print("call second_query")
        try:
            res = qianwen_long(file_id, query)
            cleaned_res = clean_json_string(res)
            return key, cleaned_res.get(key, "未找到相关内容")
        except Exception as e:
            print(f"执行查询 '{key}' 时出错: {e}")
            return key, "查询失败"
    # Worker: extract the applicant-eligibility requirements from the notice.
    def process_notice(notice_path):
        print("call notice_path")
        try:
            # Upload the notice document and get a file id.
            file_id1 = upload_file(notice_path)
            # Query extracting the applicant qualification requirements verbatim.
            user_query1 = """
第一章招标公告投标邀请书中说明的申请人资格要求是怎样的请以json格式给出回答外键为'申请人资格要求'键值为字符串列表其中每个字符串对应原文中的一条要求你的回答与原文内容一致不要擅自总结删减输出格式示例如下
{
"申请人资格要求":[
"1.满足《中华人民共和国政府采购法》第二十二条规定;",
"1.1 法人或者其他组织的营业执照等证明文件,如供应商是自然人的提供身份证明材料;",
"2.未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、重大税收违法案件当事人名单;"
]
}
"""
            # Run the query and clean the result.
            res1 = clean_json_string(qianwen_long(file_id1, user_query1))
            # Pull out the requirement list (or a not-found marker).
            requirements = res1.get("申请人资格要求", "未找到相关内容")
            return "申请人资格要求", requirements
        except Exception as e:
            print(f"处理申请人资格要求时出错: {e}")
            return "申请人资格要求", "处理失败"
    # Fan the two lookups out on a small thread pool.
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_to_key = {}
        # Submit the qualification-review queries.
        for query_info in second_query:
            key = query_info["key"]
            query = query_info["query"]
            current_file_id = zige_file_id
            future = executor.submit(process_second_query, key, query, current_file_id)
            future_to_key[future] = key
        # Submit the notice lookup, falling back to invalid_path when absent.
        if notice_path:
            future = executor.submit(process_notice, notice_path)
            future_to_key[future] = "申请人资格要求"
        else:
            future = executor.submit(process_notice, invalid_path)
            future_to_key[future] = "申请人资格要求"
        # Collect results in completion order.
        for future in as_completed(future_to_key):
            key, result = future.result()
            detailed_res[key] = result
    # Present keys in a fixed display order.
    desired_order = ["申请人资格要求", "资格性审查"]
    # print(json.dumps(detailed_res,ensure_ascii=False,indent=4))
    # Build an ordered copy of the collected results.
    ordered_res = {}
    for key in desired_order:
        if key in detailed_res:
            ordered_res[key] = detailed_res[key]
    # Wrap under the top-level '资格审查' key.
    return {"资格审查": ordered_res}

View File

@ -7,7 +7,7 @@ from flask_app.general.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.general.json_utils import extract_content_from_json from flask_app.general.json_utils import extract_content_from_json
from flask_app.main.截取pdf import truncate_pdf_main from flask_app.main.截取pdf import truncate_pdf_main
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.提取json工程标版 import convert_clause_to_json
prompt = """ prompt = """
# 角色 # 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息 你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息

View File

@ -6,7 +6,7 @@ from concurrent.futures import ThreadPoolExecutor
from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.table_content_extraction import extract_tables_main
from flask_app.old_version.文档理解大模型版知识库处理.知识库操作 import addfileToKnowledge, deleteKnowledge from flask_app.old_version.文档理解大模型版知识库处理.知识库操作 import addfileToKnowledge, deleteKnowledge
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values from flask_app.general.json_utils import transform_json_values
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice

View File

@ -1,6 +1,6 @@
import os import os
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import extract_content_from_json from flask_app.general.json_utils import extract_content_from_json
from flask_app.old_version.形式响应评审old import process_reviews from flask_app.old_version.形式响应评审old import process_reviews
from flask_app.old_version.资格评审old import process_qualification from flask_app.old_version.资格评审old import process_qualification

View File

@ -240,12 +240,12 @@ def get_technical_requirements(file_id,invalid_path):
"""根据所有键是否已添加处理技术要求""" """根据所有键是否已添加处理技术要求"""
# 更新原始采购需求字典 # 更新原始采购需求字典
update_res=combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res) final_res=combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res)
# final_res = postprocess(cleaned_res) # final_res = postprocess(cleaned_res)
update_res["货物列表"] = good_list final_res["货物列表"] = good_list
# 输出最终的 JSON 字符串 # 输出最终的 JSON 字符串
return {"采购需求": update_res} return {"采购需求": final_res}
def test_all_files_in_folder(input_folder, output_folder): def test_all_files_in_folder(input_folder, output_folder):
# 确保输出文件夹存在 # 确保输出文件夹存在

View File

@ -99,7 +99,6 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
if clause_path: if clause_path:
with open(clause_path, 'r', encoding='utf-8') as file: with open(clause_path, 'r', encoding='utf-8') as file:
data = json.load(file) data = json.load(file)
# 提取目标部分 # 提取目标部分
extracted_data = extract_between_sections(data, target_values) # 读取json截取大标题之间的内容 extracted_data = extract_between_sections(data, target_values) # 读取json截取大标题之间的内容

View File

@ -0,0 +1,194 @@
import json
import docx
import re
import os
from flask_app.general.投标人须知正文条款提取成json文件 import parse_text_by_heading, extract_text_from_pdf
#fitz库版本
# def extract_text_from_pdf(file_path, start_word, end_pattern):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# doc = fitz.open(file_path)
# all_pages_text = []
# start_index = None
#
# # 处理所有页面
# for i in range(len(doc)):
# page = doc[i]
# page_text = page.get_text()
# cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# print("yes")
# # 在第一页查找开始位置
# if i == 0 and start_index is None:
# start_match = re.search(start_word, cleaned_text, re.MULTILINE)
# if start_match:
# start_index = start_match.start()
# cleaned_text = cleaned_text[start_index:]
#
# # 在最后一页查找结束位置
# if i == len(doc) - 1:
# for pattern in end_pattern:
# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
# if matches:
# end_index = matches[-1].start()
# cleaned_text = cleaned_text[:end_index]
# break
#
# all_pages_text.append(cleaned_text)
#
# # 合并所有页面的文本
# full_text = "\n".join(all_pages_text)
# # 关闭文档
# doc.close()
#
# return full_text
# Used for the type=2 path: extracting chapter one (the tender notice) of goods tenders.
def parse_text_to_dict(text):
    """
    Split a document into sections keyed by its top-level headings.

    A heading is a line that starts with a Chinese numeral (一..十)
    followed by a full-width enumeration comma, e.g. "一、项目概况".

    Args:
        text (str): The raw document text to split.

    Returns:
        dict: Maps each heading line (stripped) to its cleaned body text.
    """
    heading_re = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE)
    headings = list(heading_re.finditer(text))
    # Body of section i runs from the end of heading i to the start of
    # heading i+1 (or to the end of the text for the last section).
    boundaries = [m.start() for m in headings[1:]] + [len(text)]
    sections = {}
    for match, body_end in zip(headings, boundaries):
        body = text[match.end():body_end].strip()
        body = body.replace('\r\n', '\n')                                   # normalize line endings
        body = re.sub(r'[ \t]+\n', '\n', body)                              # drop trailing blanks per line
        body = re.sub(r'^[ \t]+|[ \t]+$', '', body, flags=re.MULTILINE)     # drop leading/trailing blanks per line
        sections[match.group(1).strip()] = clean_content(body)              # collapse insignificant newlines
    return sections
def clean_content(content):
    """
    Collapse line breaks inside *content* while keeping meaningful ones.

    A newline survives only when the following line starts with a sub-item
    number (e.g. "1.1", "2、", "(5)") or introduces a "label:" style line
    (a run of non-colon characters followed by a colon); every other
    newline is removed outright, without inserting a space.

    Args:
        content (str): Text whose line breaks should be normalized.

    Returns:
        str: The text with only the significant newlines retained.
    """
    # Sub-item numbering forms recognised at the start of a line:
    # dotted numbers (1.1, 1.1.1), "2、", ".3", "1)", "(5)", bare digits, "1.".
    numbering_pattern = r'(?:\d+[.]\d+(?:[.]\d+)*|\d+、|[.]\d+|\d+[)]|\(\d+\)|\d+|\d+[.])'
    # Keep a newline when what follows is a numbered item or a "label:" line.
    pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^:\n\r\f\v]+[:]))'
    placeholder = "___PLACEHOLDER___"
    # Shield the newlines we want to keep, drop the rest, then restore.
    shielded = re.sub(pattern_keep, placeholder, content)
    flattened = shielded.replace('\n', '')
    return flattened.replace(placeholder, '\n')
def convert_clause_to_json(file_path, output_folder, type=1, suffix_counter="1.json"):
    """
    Extract the clause section of a tender PDF and save it as JSON.

    Args:
        file_path (str): Path to the source PDF. If it does not exist,
            a notice is printed and "" is returned.
        output_folder (str): Directory for the resulting JSON file;
            created on demand.
        type (int): 1 extracts the bidder-instructions body (from
            "说明"/"总则" to the next chapter heading); any other value
            extracts the first-chapter notice (公告/邀请书) section.
        suffix_counter (str): Kept for backward compatibility; the output
            file name is currently fixed to clause1.json / clause2.json.

    Returns:
        str: Path of the written JSON file, or "" when the input is missing.

    Raises:
        ValueError: If *file_path* is not a PDF.
    """
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}")
        return ""
    # Choose the extraction boundaries according to the document type.
    if type == 1:
        start_word = r'^\s*(?:[(]?\s*[一二12]?\s*[)]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
    else:
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    if not file_path.endswith('.pdf'):
        raise ValueError("Unsupported file format")
    text = extract_text_from_pdf(file_path, start_word, end_pattern)
    parsed_data = parse_text_by_heading(text)
    # Make sure the destination folder exists before writing.
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    if type == 1:
        file_name = "clause1.json"
    else:
        file_name = "clause2.json"
    # file_name = f"clause{suffix_counter}.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed_data, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
# Batch conversion helper, intended for manual testing only.
def process_folder(input_folder, output_folder):
    """
    Convert every '*part2.pdf' file in *input_folder* to clause JSON.

    Each matching file is fed to convert_clause_to_json with type=1 and
    its extension-less name as the suffix; per-file ValueErrors are
    reported and do not stop the batch.
    """
    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)
        # Only regular files whose name ends with 'part2.pdf' are converted.
        if not os.path.isfile(file_path) or not file_name.endswith('part2.pdf'):
            continue
        base_name = os.path.splitext(file_name)[0]
        try:
            output_path = convert_clause_to_json(file_path, output_folder, 1, base_name)
            print(f"Processed file: {file_name}, JSON saved to: {output_path}")
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
if __name__ == "__main__":
    # Ad-hoc smoke test: convert a single tobidders-notice PDF into clause JSON
    # using hard-coded local paths.
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
    file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zb_tobidders_notice.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\output3\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder,1)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        print("Error:", e)
    # Batch mode alternative: process every matching file in a folder.
    # input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
    # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
    #
    # process_folder(input_folder, output_folder)

View File

@ -8,7 +8,7 @@ from flask_app.货物标.投标人须知正文提取指定内容货物标版 imp
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
import concurrent.futures import concurrent.futures
from flask_app.货物标.投标人须知正文条款提取json文件货物标版 import convert_clause_to_json from flask_app.货物标.提取json货物标版 import convert_clause_to_json
from flask_app.货物标.无效标和废标和禁止投标整合main import combine_find_invalid from flask_app.货物标.无效标和废标和禁止投标整合main import combine_find_invalid
from flask_app.货物标.资格审查main import combine_qualification_review from flask_app.货物标.资格审查main import combine_qualification_review
from flask_app.货物标.评分标准提取main import combine_evaluation_standards from flask_app.货物标.评分标准提取main import combine_evaluation_standards

View File

@ -8,7 +8,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from flask_app.general.通义千问long import upload_file, qianwen_long from flask_app.general.通义千问long import upload_file, qianwen_long
from flask_app.general.多线程提问 import multi_threading from flask_app.general.多线程提问 import multi_threading
from flask_app.general.json_utils import clean_json_string from flask_app.general.json_utils import clean_json_string
from flask_app.货物标.投标人须知正文条款提取json文件货物标版 import convert_clause_to_json from flask_app.货物标.提取json货物标版 import convert_clause_to_json
import copy import copy
import concurrent.futures import concurrent.futures
# 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除重新组织成一个字典格式的数据你可以考虑用字符串列表来保持部分平级的数据 # 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除重新组织成一个字典格式的数据你可以考虑用字符串列表来保持部分平级的数据
@ -445,7 +445,7 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
# 定义用户查询,提取申请人资格要求 # 定义用户查询,提取申请人资格要求
user_query1 = """ user_query1 = """
该文档中说明的申请人资格要求是怎样的请以json格式给出回答外键为'申请人资格要求'键值为字符串列表其中每个字符串对应原文中的一条要求你的回答与原文内容一致不要擅自总结删减输出格式示例如下 第一章招标公告投标邀请书中说明的申请人资格要求是怎样的请以json格式给出回答外键为'申请人资格要求'键值为字符串列表其中每个字符串对应原文中的一条要求你的回答与原文内容一致不要擅自总结删减输出格式示例如下
{ {
"申请人资格要求":[ "申请人资格要求":[
"1.满足《中华人民共和国政府采购法》第二十二条规定;", "1.满足《中华人民共和国政府采购法》第二十二条规定;",
@ -454,10 +454,8 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
] ]
} }
""" """
# 执行查询并清洗结果 # 执行查询并清洗结果
res1 = clean_json_string(qianwen_long(file_id1, user_query1)) res1 = clean_json_string(qianwen_long(file_id1, user_query1))
# 提取申请人资格要求 # 提取申请人资格要求
requirements = res1.get("申请人资格要求", "未找到相关内容") requirements = res1.get("申请人资格要求", "未找到相关内容")
return "申请人资格要求", requirements return "申请人资格要求", requirements
@ -478,9 +476,12 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
future_to_key[future] = key future_to_key[future] = key
# 有条件地提交通知处理 # 有条件地提交通知处理
if qualification_path and notice_path and first_res.get("资格性审查") == "": if notice_path:
future = executor.submit(process_notice, notice_path) future = executor.submit(process_notice, notice_path)
future_to_key[future] = "申请人资格要求" future_to_key[future] = "申请人资格要求"
else:
future=executor.submit(process_notice,invalid_path)
future_to_key[future] = "申请人资格要求"
# 收集结果(按完成顺序) # 收集结果(按完成顺序)
for future in as_completed(future_to_key): for future in as_completed(future_to_key):
@ -585,17 +586,17 @@ if __name__ == "__main__":
start_time=time.time() start_time=time.time()
# qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf" # qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
# output_folder = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89" # output_folder = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89"
output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5" output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp"
# qualification_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_qualification1.pdf" # qualification_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_qualification1.pdf"
# qualification_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_qualification2.pdf" # qualification_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_qualification2.pdf"
qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf" qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件统计局智能终端二次招标_qualification1.pdf"
# notice_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_notice.pdf" # notice_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_notice.pdf"
# notice_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_notice.pdf" # notice_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_notice.pdf"
notice_path="C:\\Users\\Administrator\\Desktop\\货物标\\output5\\6.2定版视频会议磋商文件_notice.pdf" notice_path="C:\\Users\\Administrator\\Desktop\\货物标\\output5\\2-招标文件(统计局智能终端二次招标)_notice.pdf"
# knowledge_name = "6.2视频会议docx" # knowledge_name = "6.2视频会议docx"
# invalid_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf" # invalid_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
# invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile.pdf" # invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile.pdf"
invalid_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf" invalid_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件(统计局智能终端二次招标).pdf"
res = combine_qualification_review(invalid_path, qualification_path, notice_path) res = combine_qualification_review(invalid_path, qualification_path, notice_path)
print(json.dumps(res, ensure_ascii=False, indent=4)) print(json.dumps(res, ensure_ascii=False, indent=4))
end_time=time.time() end_time=time.time()