10.18小解析

This commit is contained in:
zy123 2024-10-19 15:33:55 +08:00
parent 389d0f7fad
commit 91b4f1efc1
9 changed files with 128 additions and 102 deletions

View File

@ -62,9 +62,14 @@ def download_file(url, local_filename):
if __name__ == '__main__':
# 测试下载的URL
test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1728751377&OSSAccessKeyId=TMP.3KeYhAGeJr2LGiNctSPvSmdSwxhrU8pbaDYRJcNCCgv8ijyWN613QahKb3nhXydfAvHaqpw4nTHXMzq7hmTHmNnPA77DgL&Signature=mwPHW8v7dVmHP1udTDL%2ByzllwCE%3D"
test_url ="http://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/444d68db95c3438ca481f029500f4f9b.pdf?Expires=1729325467&OSSAccessKeyId=LTAI5tJmL4Nvg95VjgUNvVL6&Signature=2%2Bax895ik7DoFs%2FHVYeN29%2BEiTo%3D&x-oss-process=doc%2Fedit"
local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload'
file_path,file_type = download_file(test_url, local_file_name)
if file_path:
print(f"Downloaded file path: {file_path}")
downloaded = download_file(test_url, local_file_name)
if not downloaded:
print("下载文件失败或不支持的文件类型")
downloaded_filepath, file_type = downloaded
print(downloaded_filepath)
print(file_type)
# 检查文件类型
if file_type == 4:
print("error")

View File

@ -257,7 +257,7 @@ def process_and_stream(file_url, zb_type):
"""
logger = g.logger
unique_id = g.unique_id
output_folder = f"flask_app/static/output1/{unique_id}"
output_folder = f"flask_app/static/output/{unique_id}"
filename = "ztbfile"
downloaded_filename = os.path.join(output_folder, filename)

View File

@ -1,36 +1,43 @@
def second_query(baseinfo_list):
    """Return the keys whose dict value consists solely of "未知" entries.

    Args:
        baseinfo_list: Iterable of dicts. Each value may be any object; only
            values that are themselves dicts are inspected.

    Returns:
        A list of outer keys, in encounter order, whose dict value has every
        inner value equal to "未知". An empty inner dict also qualifies,
        because ``all()`` over an empty iterable is true.
    """
    return [
        field
        for entry in baseinfo_list
        for field, details in entry.items()
        # Only nested dicts are candidates; every innermost value must be "未知".
        if isinstance(details, dict)
        and all(inner == "未知" for inner in details.values())
    ]
import re
# Example usage: two records, each holding one all-"未知" dict and one
# partially filled dict, so exactly one key per record should be reported.
baseinfo_list = [
{
"招标人联系方式": {
"名称": "未知",
"地址": "未知",
},
"项目名称": {
"名称": "项目A",
"地址": "未知",
}
},
{
"供应商信息": {
"名称": "未知",
"联系方式": "未知",
},
"合同详情": {
"金额": "100万",
"期限": "未知",
}
}
]
result = second_query(baseinfo_list)
print(result) # Output: ['招标人联系方式', '供应商信息']
def read_questions_from_file(file_path):
    """Parse a numbered prompt file into a list of question strings.

    The file is read as UTF-8. Blank lines and lines starting with ``#`` are
    ignored. A line starting with digits followed by a dot (e.g. ``1.``)
    opens a new question; the number and dot are dropped. Every other line
    is appended to the question currently being assembled, separated by a
    newline.

    Note: text that appears before the first numbered line is collected as
    its own leading entry, since it is accumulated like any continuation
    line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of question strings in file order, each stripped of
        surrounding whitespace.
    """
    questions = []
    # Lines of the question being assembled; joined once when it is complete.
    current_parts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Skip blank lines and '#' comment lines.
            if not line or line.startswith('#'):
                continue
            # A leading "<number>." marks the start of a new question.
            if re.match(r'^(\d+)\.', line):
                # Flush the previous question, if any, before starting anew.
                if current_parts:
                    questions.append("\n".join(current_parts).strip())
                # Keep only the text after the first dot.
                current_parts = [line.split('.', 1)[1].strip()]
            else:
                # Continuation of the current question.
                current_parts.append(line)
    # Flush the final question, if one exists.
    if current_parts:
        questions.append("\n".join(current_parts).strip())
    return questions
# Ad-hoc check: load the qualification-review prompt file and print the
# parsed questions.
# NOTE(review): the path is a hard-coded absolute Windows location; this
# appears to run at module level — confirm it is not executed on import.
qualification_review_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\资格评审.txt'
ques=read_questions_from_file(qualification_review_file_path)
print(ques)

View File

@ -1,7 +1,8 @@
import copy
import json
import threading
import time
import concurrent.futures
from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge, merge_json_to_list
@ -113,20 +114,6 @@ def judge_consortium_bidding(baseinfo_list):
baseinfo_list[:] = updated_list
return accept_bidding
def generate_query(baseinfo_list):
    """Build one follow-up question per field whose content is still unknown.

    A field counts as unknown when its value equals "未知", or when the value
    is a dict whose every inner value equals "未知".

    Args:
        baseinfo_list: Iterable of dicts mapping field names to values.

    Returns:
        A list of prompt strings, one per unknown field, in encounter order.
    """
    def _is_unknown(candidate):
        # Either the literal marker itself, or a dict made up only of markers.
        if candidate == "未知":
            return True
        return isinstance(candidate, dict) and all(
            inner == "未知" for inner in candidate.values()
        )

    prompts = []
    for entry in baseinfo_list:
        for key, value in entry.items():
            if not _is_unknown(value):
                continue
            prompts.append(
                f"根据该招标文件中的信息,{key}的内容是怎样的?"
                f"请按json格式给我提供信息键名是'{key}'"
                f"若存在未知信息,在对应的键值中填'未知'"
            )
    return prompts
def update_baseinfo_lists(baseinfo_list1, baseinfo_list2):
# 创建一个字典,用于存储 baseinfo_list1 中的所有键值对
combined_dict = {}
@ -167,7 +154,18 @@ def process_judge_questions(judge_file_path, chosen_numbers, truncate0, baseinfo
for question, response in res2:
baseinfo_list1.append(clean_json_string(response))
def process_questions_list(questions_list, truncate2):
def process_baseinfo_list(baseinfo_list, truncate2):
questions_list = []
for item in baseinfo_list:
# print(json.dumps(item, ensure_ascii=False, indent=4))
for key, value in item.items():
if value == "未知" or (isinstance(value, dict) and all(v == "未知" for v in value.values())):
question = (
f"根据该招标文件中的信息,{key}的内容是怎样的?"
f"请按json格式给我提供信息键名是'{key}'"
f"若存在未知信息,在对应的键值中填'未知'"
)
questions_list.append(question)
if questions_list:
file_id = upload_file(truncate2)
baseinfo_results = multi_threading(questions_list, "", file_id, 2)
@ -200,35 +198,27 @@ def combine_basic_info(merged_baseinfo_path,truncate0,truncate2, clause_path):
baseinfo_list1 = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
chosen_numbers, merged = merge_json_to_list(baseinfo_list1.pop())
baseinfo_list1_copy = copy.deepcopy(baseinfo_list1)
baseinfo_list1.append(merged)
questions_list=generate_query(baseinfo_list1)
# judge_file_path = 'flask_app/static/提示词/是否相关问题qianwen-long.txt'
judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题qianwen-long.txt'
# 创建两个线程
thread1 = threading.Thread(target=process_judge_questions,
args=(judge_file_path, chosen_numbers, truncate0, baseinfo_list1))
thread2 = threading.Thread(target=process_questions_list, args=(questions_list, truncate2))
# 启动线程
thread1.start()
thread2.start()
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
# 提交两个任务
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, truncate0, baseinfo_list1)
future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, truncate2)
# 等待两个线程完成
thread1.join()
thread2.join()
# 处理结果
# baseinfo_list1 已经在 process_judge_questions 中被更新
baseinfo_list2 = process_questions_list(questions_list, truncate2)
# 等待两个任务完成并获取结果
future1.result() # process_judge_questions 直接修改 baseinfo_list1不需要返回值
baseinfo_list2 = future2.result()
# 如果需要,合并或处理 baseinfo_list1 和 baseinfo_list2
updated_list = update_baseinfo_lists(baseinfo_list1, baseinfo_list2)
rebidding_situation = extract_from_notice(clause_path, 3) # "重新招标, 不再招标和终止招标"需从投标人须知正文提取
rebidding_situation = extract_from_notice(clause_path, 3)
update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
updated_list.append(update_json)
aggregated_baseinfo = aggregate_basic_info_engineering(updated_list) # 现在是一个字典
aggregated_baseinfo = aggregate_basic_info_engineering(updated_list)
return {"基础信息": aggregated_baseinfo}
#TODO:先不带投标人须知正文,如果是未知,再直接问正文,

View File

@ -40,10 +40,14 @@ def read_questions_from_file(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
for line in file:
line = line.strip()
if not line: # 跳过空行
continue
# 检查是否是新的问题编号
if line.startswith('#'): # 跳过以#开头的行
continue
# 检查是否是新的问题编号,例如 "1."
match = re.match(r'^(\d+)\.', line)
if match:
# 如果有之前的问题,保存它
@ -52,12 +56,13 @@ def read_questions_from_file(file_path):
# 开始新的问题
current_number = int(match.group(1))
# 提取问题内容,去掉编号和点
current_question = line.split('.', 1)[1].strip() + "\n"
else:
# 继续添加到当前问题
current_question += line + "\n"
# 添加最后一个问题
# 添加最后一个问题(如果存在)
if current_question:
questions.append(current_question.strip())

View File

@ -58,7 +58,7 @@ def extract_common_header(pdf_path):
return '\n'.join(common_headers)
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header):
"""
从原始PDF截取指定范围的页面并保存到新的PDF文件中
如果 output_suffix 'notice'则额外保存 start_page 之前的页面
@ -89,11 +89,25 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
print(f"无效的页面范围: {start_page}{end_page}")
return ""
# 如果 output_suffix 是 'notice',保存 start_page 之前的页面
# 如果 output_suffix 是 'notice',保存 start_page 之前的页面(若遇到'目录',则截取到'目录')
if output_suffix == 'notice' and start_page > 0:
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
before_doc = PdfWriter()
for page_num in range(0, start_page):
toc_page = -1
# 查找目录页
for page_num in range(min(start_page, total_pages)):
page_text = pdf_document.pages[page_num].extract_text()
cleaned_text = clean_page_content(page_text, common_header)
if re.search(r'\s*录', cleaned_text, re.MULTILINE):
toc_page = page_num
break
# 确定截取的页数
pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
# 提取页面
for page_num in range(pages_to_extract):
before_doc.add_page(pdf_document.pages[page_num])
with open(before_pdf_path, 'wb') as f_before:
before_doc.write(f_before)
@ -159,7 +173,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
else:
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
elif output_suffix == "invalid":
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
@ -167,7 +181,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
total = int(total_pages * 2 / 3)
start_page = last_begin_index
end_page = min(90, total)
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
@ -208,7 +222,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
return ""
else:
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
@ -337,8 +351,7 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
desired_suffixes = [
f'{base_file_name}_before.pdf',
f'{base_file_name}_notice.pdf',
f'{base_file_name}_tobidders_notice_table.pdf',
f'{base_file_name}_tobidders_notice.pdf'
f'{base_file_name}_tobidders_notice_table.pdf'
]
all_pdfs_to_merge = []
@ -416,7 +429,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder,selections):
if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest16.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output"
files=truncate_pdf_multiple(input_path,output_folder)
# selections = [5, 1] # 仅处理 selection 5、1 和 3

View File

@ -289,7 +289,6 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
if __name__ == "__main__":
output_folder = "flask_app/static/output/zytest1"
# truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf")
# truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf")
# clause_path = convert_clause_to_json(truncate1, output_folder)
@ -299,7 +298,6 @@ if __name__ == "__main__":
file_type = 1 #1:docx 2:pdf 3:其他
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
# file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
print("zy")
preprocess_files(output_folder, input_file, file_type, "zytest1")
end_time = time.time()

View File

@ -3,7 +3,7 @@ import os
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json
from flask_app.main.形式响应评审 import process_reviews
from flask_app.main.资格评审 import process_qualification
from flask_app.main.资格评审old import process_qualification
from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor

View File

@ -340,12 +340,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
return path1, path2
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
try:
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
pdf_document = PdfReader(pdf_path)
patterns = None
begin_page = 0
start_page = None
end_page = None
if output_suffix == "procurement":
patterns = [get_patterns_for_procurement()]
begin_page = 5
@ -353,18 +357,20 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
patterns = [get_patterns_for_evaluation_method()]
begin_page = 5
elif output_suffix == "qualification1":
patterns = [get_patterns_for_qualification()] # This now returns a tuple of pattern pairs
patterns = [get_patterns_for_qualification()]
begin_page = 5
elif output_suffix == "notice":
patterns = [get_patterns_for_notice()]
begin_page = 0
# Try each set of patterns until a valid range is found
if patterns:
for pattern_pair in patterns:
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header,
exclusion_pattern, output_suffix)
if start_page is not None and end_page is not None:
break
if start_page is None or end_page is None:
if output_suffix == "qualification1":
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
@ -373,14 +379,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
if len(temp) > 0:
return temp[0]
else:
return "" # 返回空字符串
return ""
else:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return "" # 返回空字符串
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
return ""
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix,
)
except Exception as e:
print(f"Error in extract_pages_twice: {e}")
return "" # 返回空字符串
return ""
# def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):