10.18小解析
This commit is contained in:
parent
389d0f7fad
commit
91b4f1efc1
@ -62,9 +62,14 @@ def download_file(url, local_filename):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# 测试下载的URL
|
# 测试下载的URL
|
||||||
test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1728751377&OSSAccessKeyId=TMP.3KeYhAGeJr2LGiNctSPvSmdSwxhrU8pbaDYRJcNCCgv8ijyWN613QahKb3nhXydfAvHaqpw4nTHXMzq7hmTHmNnPA77DgL&Signature=mwPHW8v7dVmHP1udTDL%2ByzllwCE%3D"
|
test_url ="http://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/444d68db95c3438ca481f029500f4f9b.pdf?Expires=1729325467&OSSAccessKeyId=LTAI5tJmL4Nvg95VjgUNvVL6&Signature=2%2Bax895ik7DoFs%2FHVYeN29%2BEiTo%3D&x-oss-process=doc%2Fedit"
|
||||||
local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload'
|
local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload'
|
||||||
file_path,file_type = download_file(test_url, local_file_name)
|
downloaded = download_file(test_url, local_file_name)
|
||||||
if file_path:
|
if not downloaded:
|
||||||
print(f"Downloaded file path: {file_path}")
|
print("下载文件失败或不支持的文件类型")
|
||||||
print(file_type)
|
downloaded_filepath, file_type = downloaded
|
||||||
|
print(downloaded_filepath)
|
||||||
|
print(file_type)
|
||||||
|
# 检查文件类型
|
||||||
|
if file_type == 4:
|
||||||
|
print("error")
|
||||||
|
@ -257,7 +257,7 @@ def process_and_stream(file_url, zb_type):
|
|||||||
"""
|
"""
|
||||||
logger = g.logger
|
logger = g.logger
|
||||||
unique_id = g.unique_id
|
unique_id = g.unique_id
|
||||||
output_folder = f"flask_app/static/output1/{unique_id}"
|
output_folder = f"flask_app/static/output/{unique_id}"
|
||||||
filename = "ztbfile"
|
filename = "ztbfile"
|
||||||
downloaded_filename = os.path.join(output_folder, filename)
|
downloaded_filename = os.path.join(output_folder, filename)
|
||||||
|
|
||||||
|
@ -1,36 +1,43 @@
|
|||||||
def second_query(baseinfo_list):
|
import re
|
||||||
result_list = []
|
|
||||||
for item in baseinfo_list:
|
|
||||||
for key, value in item.items():
|
|
||||||
if isinstance(value, dict):
|
|
||||||
# 检查所有最内层的值是否都是 "未知"
|
|
||||||
if all(v == "未知" for v in value.values()):
|
|
||||||
result_list.append(key)
|
|
||||||
return result_list
|
|
||||||
|
|
||||||
# 示例用法
|
|
||||||
baseinfo_list = [
|
|
||||||
{
|
|
||||||
"招标人联系方式": {
|
|
||||||
"名称": "未知",
|
|
||||||
"地址": "未知",
|
|
||||||
},
|
|
||||||
"项目名称": {
|
|
||||||
"名称": "项目A",
|
|
||||||
"地址": "未知",
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"供应商信息": {
|
|
||||||
"名称": "未知",
|
|
||||||
"联系方式": "未知",
|
|
||||||
},
|
|
||||||
"合同详情": {
|
|
||||||
"金额": "100万",
|
|
||||||
"期限": "未知",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
result = second_query(baseinfo_list)
|
def read_questions_from_file(file_path):
|
||||||
print(result) # 输出: ['招标人联系方式', '供应商信息']
|
questions = []
|
||||||
|
current_question = ""
|
||||||
|
current_number = 0
|
||||||
|
|
||||||
|
with open(file_path, 'r', encoding='utf-8') as file:
|
||||||
|
for line in file:
|
||||||
|
line = line.strip()
|
||||||
|
|
||||||
|
if not line: # 跳过空行
|
||||||
|
continue
|
||||||
|
|
||||||
|
if line.startswith('#'): # 跳过以#开头的行
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 检查是否是新的问题编号,例如 "1."
|
||||||
|
match = re.match(r'^(\d+)\.', line)
|
||||||
|
if match:
|
||||||
|
# 如果有之前的问题,保存它
|
||||||
|
if current_question:
|
||||||
|
questions.append(current_question.strip())
|
||||||
|
|
||||||
|
# 开始新的问题
|
||||||
|
current_number = int(match.group(1))
|
||||||
|
# 提取问题内容,去掉编号和点
|
||||||
|
current_question = line.split('.', 1)[1].strip() + "\n"
|
||||||
|
else:
|
||||||
|
# 继续添加到当前问题
|
||||||
|
current_question += line + "\n"
|
||||||
|
|
||||||
|
# 添加最后一个问题(如果存在)
|
||||||
|
if current_question:
|
||||||
|
questions.append(current_question.strip())
|
||||||
|
|
||||||
|
return questions
|
||||||
|
|
||||||
|
|
||||||
|
qualification_review_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\资格评审.txt'
|
||||||
|
ques=read_questions_from_file(qualification_review_file_path)
|
||||||
|
print(ques)
|
@ -1,7 +1,8 @@
|
|||||||
|
import copy
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
import concurrent.futures
|
||||||
from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key
|
from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key
|
||||||
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
|
from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
|
||||||
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge, merge_json_to_list
|
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge, merge_json_to_list
|
||||||
@ -113,20 +114,6 @@ def judge_consortium_bidding(baseinfo_list):
|
|||||||
baseinfo_list[:] = updated_list
|
baseinfo_list[:] = updated_list
|
||||||
return accept_bidding
|
return accept_bidding
|
||||||
|
|
||||||
def generate_query(baseinfo_list):
|
|
||||||
questions_list = []
|
|
||||||
for item in baseinfo_list:
|
|
||||||
# print(json.dumps(item, ensure_ascii=False, indent=4))
|
|
||||||
for key, value in item.items():
|
|
||||||
if value == "未知" or (isinstance(value, dict) and all(v == "未知" for v in value.values())):
|
|
||||||
question = (
|
|
||||||
f"根据该招标文件中的信息,{key}的内容是怎样的?"
|
|
||||||
f"请按json格式给我提供信息,键名是'{key}',"
|
|
||||||
f"若存在未知信息,在对应的键值中填'未知'。"
|
|
||||||
)
|
|
||||||
questions_list.append(question)
|
|
||||||
return questions_list
|
|
||||||
|
|
||||||
def update_baseinfo_lists(baseinfo_list1, baseinfo_list2):
|
def update_baseinfo_lists(baseinfo_list1, baseinfo_list2):
|
||||||
# 创建一个字典,用于存储 baseinfo_list1 中的所有键值对
|
# 创建一个字典,用于存储 baseinfo_list1 中的所有键值对
|
||||||
combined_dict = {}
|
combined_dict = {}
|
||||||
@ -167,7 +154,18 @@ def process_judge_questions(judge_file_path, chosen_numbers, truncate0, baseinfo
|
|||||||
for question, response in res2:
|
for question, response in res2:
|
||||||
baseinfo_list1.append(clean_json_string(response))
|
baseinfo_list1.append(clean_json_string(response))
|
||||||
|
|
||||||
def process_questions_list(questions_list, truncate2):
|
def process_baseinfo_list(baseinfo_list, truncate2):
|
||||||
|
questions_list = []
|
||||||
|
for item in baseinfo_list:
|
||||||
|
# print(json.dumps(item, ensure_ascii=False, indent=4))
|
||||||
|
for key, value in item.items():
|
||||||
|
if value == "未知" or (isinstance(value, dict) and all(v == "未知" for v in value.values())):
|
||||||
|
question = (
|
||||||
|
f"根据该招标文件中的信息,{key}的内容是怎样的?"
|
||||||
|
f"请按json格式给我提供信息,键名是'{key}',"
|
||||||
|
f"若存在未知信息,在对应的键值中填'未知'。"
|
||||||
|
)
|
||||||
|
questions_list.append(question)
|
||||||
if questions_list:
|
if questions_list:
|
||||||
file_id = upload_file(truncate2)
|
file_id = upload_file(truncate2)
|
||||||
baseinfo_results = multi_threading(questions_list, "", file_id, 2)
|
baseinfo_results = multi_threading(questions_list, "", file_id, 2)
|
||||||
@ -200,35 +198,27 @@ def combine_basic_info(merged_baseinfo_path,truncate0,truncate2, clause_path):
|
|||||||
baseinfo_list1 = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
|
baseinfo_list1 = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
|
||||||
|
|
||||||
chosen_numbers, merged = merge_json_to_list(baseinfo_list1.pop())
|
chosen_numbers, merged = merge_json_to_list(baseinfo_list1.pop())
|
||||||
|
baseinfo_list1_copy = copy.deepcopy(baseinfo_list1)
|
||||||
baseinfo_list1.append(merged)
|
baseinfo_list1.append(merged)
|
||||||
|
|
||||||
questions_list=generate_query(baseinfo_list1)
|
|
||||||
|
|
||||||
# judge_file_path = 'flask_app/static/提示词/是否相关问题qianwen-long.txt'
|
# judge_file_path = 'flask_app/static/提示词/是否相关问题qianwen-long.txt'
|
||||||
judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题qianwen-long.txt'
|
judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题qianwen-long.txt'
|
||||||
# 创建两个线程
|
|
||||||
thread1 = threading.Thread(target=process_judge_questions,
|
|
||||||
args=(judge_file_path, chosen_numbers, truncate0, baseinfo_list1))
|
|
||||||
thread2 = threading.Thread(target=process_questions_list, args=(questions_list, truncate2))
|
|
||||||
|
|
||||||
# 启动线程
|
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||||
thread1.start()
|
# 提交两个任务
|
||||||
thread2.start()
|
future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, truncate0, baseinfo_list1)
|
||||||
|
future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, truncate2)
|
||||||
|
|
||||||
# 等待两个线程完成
|
# 等待两个任务完成并获取结果
|
||||||
thread1.join()
|
future1.result() # process_judge_questions 直接修改 baseinfo_list1,不需要返回值
|
||||||
thread2.join()
|
baseinfo_list2 = future2.result()
|
||||||
|
|
||||||
# 处理结果
|
# 如果需要,合并或处理 baseinfo_list1 和 baseinfo_list2
|
||||||
# baseinfo_list1 已经在 process_judge_questions 中被更新
|
updated_list = update_baseinfo_lists(baseinfo_list1, baseinfo_list2)
|
||||||
baseinfo_list2 = process_questions_list(questions_list, truncate2)
|
|
||||||
|
|
||||||
updated_list=update_baseinfo_lists(baseinfo_list1,baseinfo_list2)
|
rebidding_situation = extract_from_notice(clause_path, 3)
|
||||||
|
|
||||||
rebidding_situation = extract_from_notice(clause_path, 3) # "重新招标, 不再招标和终止招标"需从投标人须知正文提取
|
|
||||||
update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
|
update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
|
||||||
updated_list.append(update_json)
|
updated_list.append(update_json)
|
||||||
aggregated_baseinfo = aggregate_basic_info_engineering(updated_list) # 现在是一个字典
|
aggregated_baseinfo = aggregate_basic_info_engineering(updated_list)
|
||||||
return {"基础信息": aggregated_baseinfo}
|
return {"基础信息": aggregated_baseinfo}
|
||||||
|
|
||||||
#TODO:先不带投标人须知正文,如果是未知,再直接问正文,
|
#TODO:先不带投标人须知正文,如果是未知,再直接问正文,
|
||||||
|
@ -40,10 +40,14 @@ def read_questions_from_file(file_path):
|
|||||||
with open(file_path, 'r', encoding='utf-8') as file:
|
with open(file_path, 'r', encoding='utf-8') as file:
|
||||||
for line in file:
|
for line in file:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|
||||||
if not line: # 跳过空行
|
if not line: # 跳过空行
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 检查是否是新的问题编号
|
if line.startswith('#'): # 跳过以#开头的行
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 检查是否是新的问题编号,例如 "1."
|
||||||
match = re.match(r'^(\d+)\.', line)
|
match = re.match(r'^(\d+)\.', line)
|
||||||
if match:
|
if match:
|
||||||
# 如果有之前的问题,保存它
|
# 如果有之前的问题,保存它
|
||||||
@ -52,12 +56,13 @@ def read_questions_from_file(file_path):
|
|||||||
|
|
||||||
# 开始新的问题
|
# 开始新的问题
|
||||||
current_number = int(match.group(1))
|
current_number = int(match.group(1))
|
||||||
|
# 提取问题内容,去掉编号和点
|
||||||
current_question = line.split('.', 1)[1].strip() + "\n"
|
current_question = line.split('.', 1)[1].strip() + "\n"
|
||||||
else:
|
else:
|
||||||
# 继续添加到当前问题
|
# 继续添加到当前问题
|
||||||
current_question += line + "\n"
|
current_question += line + "\n"
|
||||||
|
|
||||||
# 添加最后一个问题
|
# 添加最后一个问题(如果存在)
|
||||||
if current_question:
|
if current_question:
|
||||||
questions.append(current_question.strip())
|
questions.append(current_question.strip())
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ def extract_common_header(pdf_path):
|
|||||||
return '\n'.join(common_headers)
|
return '\n'.join(common_headers)
|
||||||
|
|
||||||
|
|
||||||
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
|
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header):
|
||||||
"""
|
"""
|
||||||
从原始PDF截取指定范围的页面并保存到新的PDF文件中。
|
从原始PDF截取指定范围的页面并保存到新的PDF文件中。
|
||||||
如果 output_suffix 是 'notice',则额外保存 start_page 之前的页面。
|
如果 output_suffix 是 'notice',则额外保存 start_page 之前的页面。
|
||||||
@ -89,11 +89,25 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
|
|||||||
print(f"无效的页面范围: {start_page} 到 {end_page}")
|
print(f"无效的页面范围: {start_page} 到 {end_page}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# 如果 output_suffix 是 'notice',保存 start_page 之前的页面
|
# 如果 output_suffix 是 'notice',保存 start_page 之前的页面(若遇到'目录',则截取到'目录')
|
||||||
if output_suffix == 'notice' and start_page > 0:
|
if output_suffix == 'notice' and start_page > 0:
|
||||||
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
|
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
|
||||||
before_doc = PdfWriter()
|
before_doc = PdfWriter()
|
||||||
for page_num in range(0, start_page):
|
toc_page = -1
|
||||||
|
|
||||||
|
# 查找目录页
|
||||||
|
for page_num in range(min(start_page, total_pages)):
|
||||||
|
page_text = pdf_document.pages[page_num].extract_text()
|
||||||
|
cleaned_text = clean_page_content(page_text, common_header)
|
||||||
|
if re.search(r'目\s*录', cleaned_text, re.MULTILINE):
|
||||||
|
toc_page = page_num
|
||||||
|
break
|
||||||
|
|
||||||
|
# 确定截取的页数
|
||||||
|
pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
|
||||||
|
|
||||||
|
# 提取页面
|
||||||
|
for page_num in range(pages_to_extract):
|
||||||
before_doc.add_page(pdf_document.pages[page_num])
|
before_doc.add_page(pdf_document.pages[page_num])
|
||||||
with open(before_pdf_path, 'wb') as f_before:
|
with open(before_pdf_path, 'wb') as f_before:
|
||||||
before_doc.write(f_before)
|
before_doc.write(f_before)
|
||||||
@ -159,7 +173,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
|||||||
print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
|
||||||
elif output_suffix == "invalid":
|
elif output_suffix == "invalid":
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
total_pages = len(pdf_document.pages)
|
total_pages = len(pdf_document.pages)
|
||||||
@ -167,7 +181,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
|
|||||||
total = int(total_pages * 2 / 3)
|
total = int(total_pages * 2 / 3)
|
||||||
start_page = last_begin_index
|
start_page = last_begin_index
|
||||||
end_page = min(90, total)
|
end_page = min(90, total)
|
||||||
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
|
||||||
|
|
||||||
|
|
||||||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||||
@ -208,7 +222,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
|
return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
|
||||||
|
|
||||||
|
|
||||||
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||||
@ -337,8 +351,7 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
|
|||||||
desired_suffixes = [
|
desired_suffixes = [
|
||||||
f'{base_file_name}_before.pdf',
|
f'{base_file_name}_before.pdf',
|
||||||
f'{base_file_name}_notice.pdf',
|
f'{base_file_name}_notice.pdf',
|
||||||
f'{base_file_name}_tobidders_notice_table.pdf',
|
f'{base_file_name}_tobidders_notice_table.pdf'
|
||||||
f'{base_file_name}_tobidders_notice.pdf'
|
|
||||||
]
|
]
|
||||||
|
|
||||||
all_pdfs_to_merge = []
|
all_pdfs_to_merge = []
|
||||||
@ -416,7 +429,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder,selections):
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
|
||||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
|
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
|
||||||
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
|
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest16.pdf"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output"
|
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output"
|
||||||
files=truncate_pdf_multiple(input_path,output_folder)
|
files=truncate_pdf_multiple(input_path,output_folder)
|
||||||
# selections = [5, 1] # 仅处理 selection 5、1 和 3
|
# selections = [5, 1] # 仅处理 selection 5、1 和 3
|
||||||
|
@ -289,7 +289,6 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
output_folder = "flask_app/static/output/zytest1"
|
output_folder = "flask_app/static/output/zytest1"
|
||||||
|
|
||||||
# truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf")
|
# truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf")
|
||||||
# truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf")
|
# truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf")
|
||||||
# clause_path = convert_clause_to_json(truncate1, output_folder)
|
# clause_path = convert_clause_to_json(truncate1, output_folder)
|
||||||
@ -299,7 +298,6 @@ if __name__ == "__main__":
|
|||||||
file_type = 1 #1:docx 2:pdf 3:其他
|
file_type = 1 #1:docx 2:pdf 3:其他
|
||||||
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
|
input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
|
||||||
# file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
|
# file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
|
||||||
print("zy")
|
|
||||||
|
|
||||||
preprocess_files(output_folder, input_file, file_type, "zytest1")
|
preprocess_files(output_folder, input_file, file_type, "zytest1")
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
|
@ -3,7 +3,7 @@ import os
|
|||||||
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||||||
from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json
|
from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json
|
||||||
from flask_app.main.形式响应评审 import process_reviews
|
from flask_app.main.形式响应评审 import process_reviews
|
||||||
from flask_app.main.资格评审 import process_qualification
|
from flask_app.main.资格评审old import process_qualification
|
||||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
@ -340,12 +340,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
|
|||||||
|
|
||||||
return path1, path2
|
return path1, path2
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
||||||
try:
|
try:
|
||||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
patterns = None
|
patterns = None
|
||||||
begin_page = 0
|
begin_page = 0
|
||||||
|
start_page = None
|
||||||
|
end_page = None
|
||||||
|
|
||||||
if output_suffix == "procurement":
|
if output_suffix == "procurement":
|
||||||
patterns = [get_patterns_for_procurement()]
|
patterns = [get_patterns_for_procurement()]
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
@ -353,18 +357,20 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
|||||||
patterns = [get_patterns_for_evaluation_method()]
|
patterns = [get_patterns_for_evaluation_method()]
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
elif output_suffix == "qualification1":
|
elif output_suffix == "qualification1":
|
||||||
patterns = [get_patterns_for_qualification()] # This now returns a tuple of pattern pairs
|
patterns = [get_patterns_for_qualification()]
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
elif output_suffix == "notice":
|
elif output_suffix == "notice":
|
||||||
patterns = [get_patterns_for_notice()]
|
patterns = [get_patterns_for_notice()]
|
||||||
begin_page = 0
|
begin_page = 0
|
||||||
# Try each set of patterns until a valid range is found
|
|
||||||
for pattern_pair in patterns:
|
if patterns:
|
||||||
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
|
for pattern_pair in patterns:
|
||||||
common_header,
|
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
|
||||||
exclusion_pattern, output_suffix)
|
common_header,
|
||||||
if start_page is not None and end_page is not None:
|
exclusion_pattern, output_suffix)
|
||||||
break
|
if start_page is not None and end_page is not None:
|
||||||
|
break
|
||||||
|
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
if output_suffix == "qualification1":
|
if output_suffix == "qualification1":
|
||||||
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
@ -373,14 +379,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
|||||||
if len(temp) > 0:
|
if len(temp) > 0:
|
||||||
return temp[0]
|
return temp[0]
|
||||||
else:
|
else:
|
||||||
return "" # 返回空字符串
|
return ""
|
||||||
else:
|
else:
|
||||||
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
return "" # 返回空字符串
|
return ""
|
||||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
|
||||||
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix,
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in extract_pages_twice: {e}")
|
print(f"Error in extract_pages_twice: {e}")
|
||||||
return "" # 返回空字符串
|
return ""
|
||||||
|
|
||||||
|
|
||||||
# def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
|
# def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user