12.17 无效投标、废标

This commit is contained in:
zy123 2024-12-17 18:44:58 +08:00
parent 8149769bab
commit cb153889a4
13 changed files with 357 additions and 617 deletions

View File

@ -91,21 +91,20 @@ def find_and_merge(target_path, output_suffix):
paths=[target_path,full_path]
merge_pdfs(paths,target_path)
def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_path, base_file_name):
def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name, mode='engineering'):
"""
合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件
以及 truncate_files 中以指定后缀结尾的文件按照指定顺序合并
通用的 PDF 合并函数根据不同的模式合并指定的文件
参数
- output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径
- truncate_files (list): 包含 PDF 文件路径的列表
- output_path (str): 合并后的 PDF 文件保存路径
- base_file_name (str): 用于匹配文件名的基础名称
- mode (str): 合并模式支持 'engineering' 'goods'
返回:
- str: 合并后的 PDF 文件路径如果未找到所有需要合并的文件则返回 ""
- str: 如果合并成功返回 output_path否则返回空字符串 ""
"""
# 1. 获取 output_folder 中所有文件
try:
all_output_files = os.listdir(output_folder)
except FileNotFoundError:
@ -115,50 +114,82 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa
print(f"没有权限访问输出文件夹 '{output_folder}'")
return ""
# 2. 定义要选择的文件后缀及合并顺序,包括 before 文件
desired_suffixes = [
f'{base_file_name}_before.pdf',
f'{base_file_name}_notice.pdf',
f'{base_file_name}_tobidders_notice_table.pdf'
]
if mode == 'engineering':
required_suffixes = [
f'{base_file_name}_before.pdf',
f'{base_file_name}_notice.pdf',
f'{base_file_name}_tobidders_notice_table.pdf'
]
optional_suffixes = []
elif mode == 'goods':
required_suffixes = [
f'{base_file_name}_before.pdf',
f'{base_file_name}_notice.pdf',
f'{base_file_name}_tobidders_notice_part1.pdf'
]
optional_suffixes = [
f'{base_file_name}_tobidders_notice_part2.pdf'
]
else:
print(f"未知的合并模式: {mode}")
return ""
all_pdfs_to_merge = []
missing_files = [] # 用于记录缺失的文件
missing_files = []
for suffix in desired_suffixes:
# 处理必需的文件
for suffix in required_suffixes:
if suffix == f'{base_file_name}_before.pdf':
# 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
matching_files = [
os.path.join(output_folder, f)
for f in all_output_files
if f.endswith(suffix)
]
else:
# 从 truncate_files 中选择以指定后缀结尾的文件
matching_files = [f for f in truncate_files if f.endswith(suffix)]
if matching_files:
# 如果找到多个匹配的文件,按名称排序并添加
matching_files_sorted = sorted(matching_files)
all_pdfs_to_merge.extend(matching_files_sorted)
for f in matching_files_sorted:
print(f"选中文件: {f}")
else:
print(f"没有找到以 '{suffix}' 结尾的文件。")
missing_files.append(suffix) # 记录缺失的文件
missing_files.append(suffix)
# 检查是否所有需要的文件都已找到
# 处理可选的文件
for suffix in optional_suffixes:
if suffix == f'{base_file_name}_before.pdf':
matching_files = [
os.path.join(output_folder, f)
for f in all_output_files
if f.endswith(suffix)
]
else:
matching_files = [f for f in truncate_files if f.endswith(suffix)]
if matching_files:
matching_files_sorted = sorted(matching_files)
all_pdfs_to_merge.extend(matching_files_sorted)
for f in matching_files_sorted:
print(f"选中文件: {f}")
else:
print(f"可选文件 '{suffix}' 未找到,继续合并。")
# 检查是否所有必需的文件都已找到
if missing_files:
print("缺少以下必要的 PDF 文件,无法进行合并:")
for missing in missing_files:
print(f" - {missing}")
return ""
print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
# 过滤掉不存在或为空的文件路径
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)]
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0]
if not all_pdfs_to_merge:
print("没有找到要合并的 PDF 文件。")
print("没有找到要合并的有效 PDF 文件。")
return ""
# 调用 merge_pdfs 函数进行合并
@ -171,134 +202,6 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa
# 检查合并后的文件是否存在且不为空
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
# 合并成功,删除 {base_file_name}_before.pdf 文件
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
if os.path.exists(before_pdf_path):
try:
os.remove(before_pdf_path)
# print(f"已删除文件: {before_pdf_path}")
except Exception as e:
print(f"删除文件 {before_pdf_path} 时出错: {e}")
else:
print(f"未找到要删除的文件: {before_pdf_path}")
return output_path
else:
print(f"合并失败,没有生成 '{output_path}'")
return ""
#合并封面+招标公告+投标人须知前附表+须知正文
def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, base_file_name):
"""
合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件
以及 truncate_files 中以指定后缀结尾的文件按照指定顺序合并
参数
- output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径
- truncate_files (list): 包含 PDF 文件路径的列表
- output_path (str): 合并后的 PDF 文件保存路径
- base_file_name (str): 用于匹配文件名的基础名称
返回:
- str: 如果合并成功返回 output_path否则返回空字符串 ""
"""
# 1. 获取 output_folder 中所有文件
try:
all_output_files = os.listdir(output_folder)
except FileNotFoundError:
print(f"输出文件夹 '{output_folder}' 未找到。")
return ""
except PermissionError:
print(f"没有权限访问输出文件夹 '{output_folder}'")
return ""
# 2. 定义要选择的文件后缀及合并顺序
required_suffixes = [
f'{base_file_name}_before.pdf',
f'{base_file_name}_notice.pdf',
f'{base_file_name}_tobidders_notice_part1.pdf'
]
optional_suffixes = [
f'{base_file_name}_tobidders_notice_part2.pdf'
]
all_pdfs_to_merge = []
missing_files = [] # 用于记录缺失的文件
# 3. 处理必需的文件
for suffix in required_suffixes:
if suffix == f'{base_file_name}_before.pdf':
# 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
matching_files = [
os.path.join(output_folder, f)
for f in all_output_files
if f.endswith(suffix)
]
else:
# 从 truncate_files 中选择以指定后缀结尾的文件
matching_files = [f for f in truncate_files if f.endswith(suffix)]
if matching_files:
# 如果找到多个匹配的文件,按名称排序并添加
matching_files_sorted = sorted(matching_files)
all_pdfs_to_merge.extend(matching_files_sorted)
# for f in matching_files_sorted:
# print(f"选中文件: {f}")
else:
print(f"没有找到以 '{suffix}' 结尾的文件。")
missing_files.append(suffix) # 记录缺失的文件
# 4. 处理可选的文件
for suffix in optional_suffixes:
if suffix == f'{base_file_name}_before.pdf':
# 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
matching_files = [
os.path.join(output_folder, f)
for f in all_output_files
if f.endswith(suffix)
]
else:
# 从 truncate_files 中选择以指定后缀结尾的文件
matching_files = [f for f in truncate_files if f.endswith(suffix)]
if matching_files:
# 如果找到多个匹配的文件,按名称排序并添加
matching_files_sorted = sorted(matching_files)
all_pdfs_to_merge.extend(matching_files_sorted)
for f in matching_files_sorted:
print(f"选中文件: {f}")
else:
print(f"可选文件 '{suffix}' 未找到,继续合并。")
# 5. 检查是否所有必需的文件都已找到
if missing_files:
print("缺少以下必要的 PDF 文件,无法进行合并:")
for missing in missing_files:
print(f" - {missing}")
return ""
print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
# 6. 过滤掉不存在或为空的文件路径
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0]
if not all_pdfs_to_merge:
print("没有找到要合并的有效 PDF 文件。")
return ""
# 7. 调用 merge_pdfs 函数进行合并
try:
merge_pdfs(all_pdfs_to_merge, output_path)
print(f"已成功合并 PDF 文件到 '{output_path}'")
except Exception as e:
print(f"合并 PDF 文件时出错: {e}")
return ""
# 8. 检查合并后的文件是否存在且不为空
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
# 合并成功,删除 {base_file_name}_before.pdf 文件
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
if os.path.exists(before_pdf_path):
try:
@ -308,7 +211,6 @@ def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, ba
print(f"删除文件 {before_pdf_path} 时出错: {e}")
else:
print(f"未找到要删除的文件: {before_pdf_path}")
return output_path
else:
print(f"合并失败,没有生成 '{output_path}'")

View File

@ -0,0 +1,127 @@
import concurrent.futures
import os
import time
from flask_app.general.merge_pdfs import merge_selected_pdfs
from flask_app.general.截取pdf通用函数 import check_pdf_pages
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
def truncate_pdf_multiple(pdf_path, output_folder, logger, mode='goods', selections=None):
    """
    Truncate a tender PDF into sections and merge the base-info parts.

    Every requested selection is truncated in its own worker thread by the
    mode-specific truncate function. Results are collected in the order given
    by ``selections`` and then handed to ``merge_selected_pdfs``.

    Args:
        pdf_path (str): Path of the input PDF.
        output_folder (str): Folder passed on to the truncate and merge helpers.
        logger (Logger): Logger used for progress and error messages.
        mode (str, optional): 'goods' or 'engineering'. Defaults to 'goods'.
        selections (Iterable[int], optional): Selections to truncate.
            Defaults to [1, 2, 3, 4, 5].

    Returns:
        list: The truncated file paths in selection order, followed by the
        merged file path ("" when merging failed). When ``check_pdf_pages``
        returns a list instead of None, that list is returned unchanged.

    Raises:
        ValueError: If ``mode`` is neither 'goods' nor 'engineering'.
    """
    # Validate first so that a mis-ordered positional argument (e.g. a
    # selections list passed where ``mode`` belongs) fails loudly and early.
    if mode not in ('goods', 'engineering'):
        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")

    # Materialise once: accepts range/tuple/generator inputs as well as lists.
    selections = [1, 2, 3, 4, 5] if selections is None else list(selections)

    if mode == 'goods':
        truncate_function = truncate_pdf_main_goods
    else:
        truncate_function = truncate_pdf_main_engineering

    def placeholders_for(selection):
        # Selection 4 stands for two output files, every other selection for one.
        return ["", ""] if selection == 4 else [""]

    # check_pdf_pages receives 1 when at least five selections are requested,
    # otherwise 2; the flag picks how many placeholders it returns.
    mode_flag = 2 if len(selections) < 5 else 1
    res = check_pdf_pages(pdf_path, mode_flag, logger)
    if res is not None:
        return res  # list of empty strings produced by check_pdf_pages

    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []

    # ThreadPoolExecutor rejects max_workers=0, so keep at least one worker
    # even when no selection was requested.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, len(selections))) as executor:
        # A list of pairs (rather than a dict keyed by selection) keeps the
        # submission order and does not collapse duplicate selections.
        submitted = [
            (
                selection,
                executor.submit(truncate_function, pdf_path, output_folder, selection, logger),
            )
            for selection in selections
        ]
        for selection, future in submitted:
            try:
                files = future.result()
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                truncate_files.extend(placeholders_for(selection))
            else:
                if files:
                    truncate_files.extend(files)

    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs(
        output_folder,
        truncate_files,
        merged_output_path,
        base_file_name,
        mode=mode,
    )
    if merged_path:
        truncate_files.append(merged_path)
        logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'")
    else:
        # Keep the result shape stable: the last element is always the merge slot.
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")

    logger.info("已截取文件路径: " + str(truncate_files))
    return truncate_files
if __name__ == "__main__":
    # Manual smoke test: truncate one local sample PDF and print the elapsed time.
    # Selection meanings: 1 - tender notice, 2 - evaluation method,
    # 3 - qualification review conditions, 4 - instructions to bidders
    # (data sheet + body), 5 - invalid bids.
    logger = get_global_logger("123")
    start_time = time.time()

    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest5.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
    selections = [5]

    files = truncate_pdf_multiple(
        pdf_path,
        output_folder,
        logger,
        'engineering',
        selections,
    )
    print(files)

    end_time = time.time()
    elapsed = end_time - start_time
    print("耗时:" + str(elapsed))

View File

@ -0,0 +1,57 @@
#flask_app/general/截取pdf通用函数.py
import concurrent.futures
import os
import regex
from PyPDF2 import PdfReader
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
def get_start_and_common_header(input_path, end_page):
    """
    Find the common page header and the page where the tender notice starts.

    Args:
        input_path: Path of the PDF to scan.
        end_page: Highest 0-based page index to inspect; scanning stops as
            soon as a page index greater than this is reached.

    Returns:
        tuple: ``(common_header, page_index)``. ``page_index`` is the 0-based
        index of the first inspected page whose cleaned text matches the
        notice/invitation pattern and has no "目录" (table of contents) line.
        It is 0 when no such page is found up to ``end_page``.
    """
    common_header = extract_common_header(input_path)
    notice_pattern = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # Pages carrying a table-of-contents line are skipped.
    toc_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    reader = PdfReader(input_path)
    for page_index, page in enumerate(reader.pages):
        if page_index > end_page:
            break  # past the scan limit: fall through to the "not found" result
        page_text = page.extract_text()
        if not page_text:
            continue
        cleaned = clean_page_content(page_text, common_header)
        if toc_pattern.search(cleaned):
            continue
        if notice_pattern.search(cleaned):
            return common_header, page_index  # first match wins
    return common_header, 0
def check_pdf_pages(pdf_path, mode, logger, page_threshold=50):
    """
    Decide whether a PDF is long enough to go through the splitting logic.

    Args:
        pdf_path: Path of the PDF whose pages are counted.
        mode: 1 makes the placeholder result seven empty strings; any other
            value makes it four.
        logger: Logger used for the page-count and error messages.
        page_threshold (int, optional): PDFs with at most this many pages are
            not split. Defaults to 50.

    Returns:
        None when the PDF has more than ``page_threshold`` pages, meaning the
        caller should continue. Otherwise a list of empty strings, which is
        also returned when the page count cannot be read.
    """
    placeholders = [''] * 7 if mode == 1 else [''] * 4

    try:
        num_pages = len(PdfReader(pdf_path).pages)
    except Exception as e:
        # Best effort: an unreadable PDF is reported and treated like a short one.
        logger.error(f"无法读取 PDF 页数: {e}")
        return placeholders

    logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
    if num_pages <= page_threshold:
        logger.info(f"PDF页数小于或等于{page_threshold}页跳过切分逻辑。")
        return placeholders
    return None

View File

@ -1,12 +1,9 @@
# -*- encoding:utf-8 -*-
import ast
import json
import logging
import re
from collections import OrderedDict
from flask_app.general.json_utils import clean_json_string
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.工程标.判断是否分包等 import read_questions_from_judge
def process_judge_questions(judge_file_path, chosen_numbers, file_id, baseinfo_list1):
@ -72,3 +69,9 @@ def process_string_list(string_list):
# 如果没有匹配到内容,返回空列表
return []
def get_global_logger(unique_id):
    """
    Return the logger registered under ``unique_id``.

    Args:
        unique_id: Logger name. ``None`` selects the default logger returned
            by ``logging.getLogger()``.

    Returns:
        logging.Logger: The matching logger instance.
    """
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)

View File

@ -6,7 +6,7 @@ import time
from flask_app.general.多线程提问 import multi_threading
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.general.json_utils import extract_content_from_json
from flask_app.工程标.截取pdf import truncate_pdf_main
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
prompt = """
# 角色

View File

@ -3,7 +3,7 @@ import json
import logging
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.工程标.截取pdf import truncate_pdf_multiple
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple
from flask_app.general.table_content_extraction import extract_tables_main
from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge
from flask_app.工程标.提取json工程标版 import convert_clause_to_json

View File

@ -3,7 +3,7 @@ import json
import logging
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.工程标.截取pdf import truncate_pdf_multiple
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple
from flask_app.general.table_content_extraction import extract_tables_main
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values

View File

@ -1,27 +1,16 @@
import json
import logging
import re
import string
import time
from flask_app.general.doubao import doubao_model
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
from flask_app.general.insert_del_pagemark import insert_mark
from flask_app.general.json_utils import clean_json_string
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.general.截取pdf_main import truncate_pdf_multiple
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
from flask_app.货物标.技术参数要求提取后处理函数 import extract_matching_keys
from flask_app.货物标.资格审查main import combine_qualification_review
import concurrent.futures
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # 获取默认的日志器
logger = logging.getLogger(unique_id)
return logger
logger = None
def get_nested(dic, keys, default=None):
for key in keys:
@ -367,7 +356,7 @@ def get_tech_and_business_deviation(file_path,file_type,unique_id,output_folder)
logger.error("Unsupported file type provided. Preprocessing halted.")
return None
selections=[1,3,5]
files=truncate_pdf_specific_goods(pdf_path,output_folder,selections,unique_id)
files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections)
notice_path=files[0]
qualification_file=files[1]
procurement_file=files[2]

View File

@ -8,21 +8,14 @@ from flask_app.general.format_change import docx2pdf
from flask_app.general.json_utils import clean_json_string
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
from flask_app.工程标.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
from flask_app.general.截取pdf_main import truncate_pdf_multiple
from flask_app.general.post_processing import inner_post_processing
from flask_app.工程标.基础信息整合工程标 import aggregate_basic_info_engineering
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # 获取默认的日志器
logger = logging.getLogger(unique_id)
return logger
#货物标
def little_parse_goods(output_folder, pdf_path):
def little_parse_goods(output_folder, pdf_path,logger):
"""
解析货物相关的基础信息
@ -35,7 +28,7 @@ def little_parse_goods(output_folder, pdf_path):
"""
# 截取特定的货物 PDF 文件
selections = [1,4] # 仅处理 selection 1和4 #公告+投标人须知
files = truncate_pdf_specific_goods(pdf_path, output_folder,selections)
files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods',selections)
if not files:
raise ValueError("未找到截取后的文件。")
# 假设最后一个文件是需要处理的基础信息文件
@ -58,7 +51,7 @@ def little_parse_goods(output_folder, pdf_path):
return {"基础信息": aggregated_baseinfo}
def little_parse_engineering(output_folder, pdf_path):
def little_parse_engineering(output_folder, pdf_path,logger):
"""
解析工程相关的基础信息
@ -71,13 +64,14 @@ def little_parse_engineering(output_folder, pdf_path):
"""
# 截取特定的工程 PDF 文件
selections = [ 1,4] #公告+投标人须知前附表
files = truncate_pdf_specific_engineering(pdf_path, output_folder,selections)
files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering',selections)
if not files:
raise ValueError("未找到截取后的文件。")
# 假设最后一个文件是需要处理的基础信息文件
baseinfo_file_path = files[-1]
if not baseinfo_file_path:
baseinfo_file_path=truncate_pdf_main(pdf_path,output_folder,5)[0] #invalid_path
selections=[5]
baseinfo_file_path=truncate_pdf_multiple(pdf_path,output_folder,logger,selections)[0] #invalid_path
# 上传文件并获取文件 ID
file_id = upload_file(baseinfo_file_path)
# 注意:以下路径被硬编码,确保该路径存在并且正确
@ -123,11 +117,11 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id):
return None
# 根据招标类型调用相应的解析函数
if zb_type == 2: # 货物标
combined_data = little_parse_goods(output_folder, pdf_path)
combined_data = little_parse_goods(output_folder, pdf_path,logger)
logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
res = inner_post_processing(combined_data["基础信息"])
elif zb_type==1: #工程标
combined_data = little_parse_engineering(output_folder, pdf_path)
combined_data = little_parse_engineering(output_folder, pdf_path,logger)
logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
res = inner_post_processing(combined_data["基础信息"])
# 定义保存JSON文件的路径

View File

@ -8,8 +8,9 @@ from concurrent.futures import ThreadPoolExecutor
from docx import Document
from flask_app.general.insert_del_pagemark import insert_mark,delete_mark
from flask_app.工程标.截取pdf import truncate_pdf_multiple
from flask_app.general.截取pdf_main import truncate_pdf_multiple
from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values
from flask_app.general.无效标和废标公共代码 import combine_find_invalid
@ -20,16 +21,9 @@ from flask_app.工程标.资格审查模块 import combine_review_standards
from flask_app.general.商务技术评分提取 import combine_evaluation_standards
from flask_app.general.format_change import pdf2docx, docx2pdf
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # 获取默认的日志器
logger = logging.getLogger(unique_id)
return logger
# 创建全局线程池
executor = ThreadPoolExecutor()
def preprocess_files(output_folder, file_path, file_type,unique_id,logger):
def preprocess_files(output_folder, file_path, file_type,logger):
logger.info("starting 文件预处理...")
logger.info("output_folder..." + output_folder)
start_time=time.time()
@ -47,20 +41,20 @@ def preprocess_files(output_folder, file_path, file_type,unique_id,logger):
logger.error("Unsupported file type provided. Preprocessing halted.")
return None
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id)
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
print("切割出的文件:"+str(truncate_files))
# 处理各个部分
tobidders_notice_table=truncate_files[0] #投标人须知前附表
notice_path=truncate_files[0] #招标公告
tobidders_notice = truncate_files[1] #投标人须知正文
evaluation_method = truncate_files[1] #评标方法
evaluation_method = truncate_files[2] #评标方法
qualification = truncate_files[2] #资格审查
qualification = truncate_files[3] #资格审查
notice_path=truncate_files[4] #公告
tobidders_notice_table = truncate_files[3] #投标人须知前附表
tobidders_notice=truncate_files[4] #投标人须知正文
invalid_path=truncate_files[5]
invalid_path=truncate_files[5] #无效标
# invalid_docpath = copy_docx(docx_path) # docx截取无效标部分
# invalid_docpath=pdf2docx(invalid_path)
invalid_added_pdf = insert_mark(invalid_path)
@ -215,7 +209,7 @@ def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path_more, clause_pa
def engineering_bid_main(output_folder, file_path, file_type, unique_id):
logger = get_global_logger(unique_id)
# 预处理文件,获取处理后的数据
processed_data = preprocess_files(output_folder, file_path, file_type,unique_id,logger)
processed_data = preprocess_files(output_folder, file_path, file_type,logger)
if not processed_data:
yield json.dumps({}) # 如果处理数据失败,返回空的 JSON

View File

@ -7,23 +7,16 @@ from docx import Document
from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx
from flask_app.general.insert_del_pagemark import insert_mark, delete_mark
from flask_app.general.json_utils import transform_json_values
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.货物标.基础信息解析main import combine_basic_info
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple
from flask_app.general.截取pdf_main import truncate_pdf_multiple
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from flask_app.货物标.提取json货物标版 import convert_clause_to_json
from flask_app.general.无效标和废标公共代码 import combine_find_invalid
from flask_app.货物标.资格审查main import combine_qualification_review
from flask_app.general.商务技术评分提取 import combine_evaluation_standards
import logging
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # 获取默认的日志器
logger = logging.getLogger(unique_id)
return logger
# 创建全局线程池
executor = ThreadPoolExecutor()
@ -47,7 +40,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
return None
# 调用截取PDF多次
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger) # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
# 处理各个部分
invalid_path=pdf_path

View File

@ -1,21 +1,14 @@
#flask_app/工程标/截取pdf工程标版.py
import regex
import os
import time
from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
from flask_app.general.merge_pdfs import merge_selected_pdfs_for_engineering
import concurrent.futures
import logging
from flask_app.general.clean_pdf import clean_page_content
from flask_app.general.截取pdf通用函数 import get_start_and_common_header
from flask_app.general.通用功能函数 import get_global_logger
def get_global_logger(unique_id):
if unique_id is None:
return logging.getLogger() # 获取默认的日志器
logger = logging.getLogger(unique_id)
return logger
logger = None
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header):
try:
# 获取文件基本名称
@ -301,67 +294,39 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
return []
def get_start_and_common_header(input_path):
common_header = extract_common_header(input_path)
last_begin_index = 0
begin_pattern = regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
regex.MULTILINE
)
# 新增目录匹配模式
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
pdf_document = PdfReader(input_path)
for i, page in enumerate(pdf_document.pages):
if i > 10:
return common_header, 0 # 如果页码大于10直接返回last_begin_index=0
text = page.extract_text()
if text:
cleaned_text = clean_page_content(text, common_header)
# 检查是否存在"目录"
if catalog_pattern.search(cleaned_text):
continue # 如果存在目录,跳过当前页面
if begin_pattern.search(cleaned_text):
last_begin_index = i # 更新第一个匹配的索引页码从0开始
return common_header, last_begin_index
return common_header, last_begin_index
def truncate_pdf_main(input_path, output_folder, selection):
def truncate_pdf_main_engineering(input_path, output_folder, selection,logger):
if os.path.isdir(input_path):
generated_files = []
for file_name in os.listdir(input_path):
if file_name.endswith('.pdf'):
file_path = os.path.join(input_path, file_name)
files = truncate_pdf_main(file_path, output_folder, selection)
files = truncate_pdf_main_engineering(file_path, output_folder, selection)
if files:
generated_files.extend(files)
return generated_files
elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
# base_file_name = os.path.splitext(os.path.basename(input_path))[0]
common_header, last_begin_index = get_start_and_common_header(input_path)
common_header, last_begin_index = get_start_and_common_header(input_path,20)
# print(last_begin_index)
if selection == 1:
# Selection 1: 投标人须知前附表
# Selection 1: 招标公告
pattern_pairs = [
(
regex.compile(
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE),
regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
regex.MULTILINE)
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
),
(
regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
regex.MULTILINE
),
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
regex.MULTILINE),
regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
regex.MULTILINE)
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
regex.MULTILINE
)
)
]
output_suffix = "tobidders_notice"
output_suffix = "notice"
elif selection == 2:
# Selection 2: 评标办法
pattern_pairs = [
@ -408,23 +373,28 @@ def truncate_pdf_main(input_path, output_folder, selection):
]
output_suffix = "qualification"
elif selection == 4:
# Selection 4: 招标公告
# Selection 4: 投标人须知
pattern_pairs = [
(
regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
(
regex.compile(
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',
regex.MULTILINE),
regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
regex.MULTILINE)
),
(
regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
regex.MULTILINE
),
(
regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$', regex.MULTILINE),
regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
regex.MULTILINE
)
)
]
output_suffix = "notice"
regex.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
regex.MULTILINE)
)
]
output_suffix = "tobidders_notice"
elif selection == 5:
# Selection 5: 无效标
pattern_pairs = [
@ -447,13 +417,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
is_secondary_match = (idx == 2)
begin_page = last_begin_index if last_begin_index != 0 else {
1: 3, # 前附表
1: 0, # 公告
2: 10, # 评标
3: 5, # 资格
4: 0, # 公告
4: 3, # 前附表
5: 0 # 无效标
}.get(selection, 0)
if selection == 1: # 投标人须知
if selection == 4: # 投标人须知
output_paths = list(
extract_pages_tobidders_notice(input_path, output_folder, begin_pattern, begin_page, common_header,
is_secondary_match))
@ -485,148 +455,29 @@ def truncate_pdf_main(input_path, output_folder, selection):
if fallback_result and any(os.path.isfile(f) for f in fallback_result):
return fallback_result
else:
if selection == 1:
return ["", ""]
else:
return ['']
# 根据 selection 返回相应数量的空字符串
empty_returns = ["", ""] if selection == 4 else [""]
logger.error(f"Selection {selection}: 回退提取失败,返回空字符串。")
return empty_returns
else:
print("Invalid input path. It is neither a PDF file nor a directory containing PDF files.")
return ['']
def truncate_pdf_multiple(input_path, output_folder, unique_id="123"):
logger = get_global_logger(unique_id)
# 新增逻辑:检查 PDF 总页数
try:
reader = PdfReader(input_path)
num_pages = len(reader.pages)
logger.info(f"PDF '{input_path}' 的总页数为: {num_pages}")
if num_pages <= 50:
logger.info(f"PDF页数小于或等于50页跳过切分逻辑。")
return ['', '', '', '', '', '', '']
except Exception as e:
logger.error(f"无法读取 PDF 页数: {e}")
return ['', '', '', '', '', '', '']
base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名
truncate_files = []
selections = range(1, 6) # 选择 1 到 5
# 使用 ThreadPoolExecutor 进行多线程处理
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
# 提交所有任务并保持 selection 顺序
future_to_selection = {selection: executor.submit(truncate_pdf_main, input_path, output_folder, selection) for
selection in selections}
# 按 selection 顺序收集结果
for selection in selections:
future = future_to_selection.get(selection)
try:
files = future.result()
if files and any(f for f in files):
# 过滤空字符串
valid_files = [f for f in files if f]
truncate_files.extend(valid_files)
else:
logger.error(f"Selection {selection}: 截取失败,已添加空字符串。")
# Append two empty strings for selection=1; otherwise, one empty string
truncate_files.extend(["", ""] if selection == 1 else [""])
except Exception as e:
logger.error(f"Selection {selection} 生成了一个异常: {e}")
# Append two empty strings for selection=1; otherwise, one empty string
truncate_files.extend(["", ""] if selection == 1 else [""])
# Proceed with merging logic as before...
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path,
base_file_name)
if merged_result:
truncate_files.append(merged_result)
# logger.info(f"merged_baseinfo: 已生成合并文件: {merged_output_path}")
else:
truncate_files.append("") # 如果 merged_result 未生成,添加空字符串
logger.warning("merged_baseinfo: 未生成合并文件,因为没有找到需要合并的 PDF 文件。")
else:
truncate_files.append("") # 如果没有文件需要合并,也添加空字符串
logger.warning(f"merged_baseinfo: 没有文件需要合并 for {input_path}")
return truncate_files
def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, unique_id="123"):
    """Truncate only the requested sections of an engineering tender PDF and merge them.

    Args:
        pdf_path (str): Path of the PDF to split.
        output_folder (str): Folder handed to ``truncate_pdf_main`` and used
            for the merged ``*_merged_specific.pdf`` file.
        selections (list): Selection numbers to process; results are collected
            in this order.
        unique_id (str): Identifier passed to ``get_global_logger``.

    Returns:
        list: One entry per produced file (empty strings stand in for failed
        selections: two for selection 1, one otherwise), followed by the
        merged file path or "" when no merged file was produced.
        ``['', '', '', '']`` is returned when the PDF has 50 pages or fewer or
        its page count cannot be read; ``[""] * len(selections)`` is returned
        on any other unexpected error.
    """
    # Bind the logger before the try block so the outer handler can always use it.
    logger = get_global_logger(unique_id)
    try:
        try:
            # Fix: the original read the undefined name `input_path` here, so the
            # resulting NameError was swallowed below and the function always
            # returned four empty strings.
            reader = PdfReader(pdf_path)
            num_pages = len(reader.pages)
            logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
            if num_pages <= 50:
                logger.info("PDF页数小于或等于50页跳过切分逻辑。")
                return ['', '', '', '']
        except Exception as e:
            logger.error(f"无法读取 PDF 页数: {e}")
            return ['', '', '', '']

        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        truncate_files = []

        # Run every selection in its own worker thread.
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
            future_to_selection = {
                selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection)
                for selection in selections
            }
            # Collect results in the caller's selection order, not completion order.
            for selection in selections:
                future = future_to_selection.get(selection)
                # Placeholders keep list positions stable when a selection fails:
                # two empty strings for selection 1, one otherwise.
                placeholders = ["", ""] if selection == 1 else [""]
                try:
                    files = future.result()
                    # Drop empty strings from the worker's result.
                    valid_files = [f for f in (files or []) if f]
                    if valid_files:
                        truncate_files.extend(valid_files)
                    else:
                        logger.error(f"Selection {selection}: 截取失败,已添加空字符串。")
                        truncate_files.extend(placeholders)
                except Exception as e:
                    logger.error(f"Selection {selection} 生成了一个异常: {e}")
                    truncate_files.extend(placeholders)

        # Merge only when at least one real file path was produced.
        if any(truncate_files):
            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
            merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path,
                                                                base_file_name)
            if merged_result:
                truncate_files.append(merged_result)
                logger.info(f"merged_specific: 已生成合并文件: {merged_output_path}")
            else:
                # Keep the trailing "merged" slot even when merging produced nothing.
                truncate_files.append("")
                logger.warning("merged_specific: 未生成合并文件,因为没有找到需要合并的 PDF 文件。")
        else:
            # Nothing to merge; still keep the trailing "merged" slot.
            truncate_files.append("")
            logger.warning(f"merged_specific: 没有文件需要合并 for {pdf_path}")
        return truncate_files
    except Exception as e:
        logger.error(f"Error in truncate_pdf_specific_engineering: {e}")
        return [""] * len(selections)  # one empty string per requested selection
if __name__ == "__main__":
logger=get_global_logger("123")
start_time = time.time()
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
input_path=r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\西藏监管局办.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
files=truncate_pdf_multiple(input_path,output_folder)
# selections = [4, 1] # 仅处理 selection 4、1
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
print(files)
# selection = 2 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
# selections = [1, 4] # 仅处理 selection 4、1
selections=[5]
# files=truncate_pdf_multiple(pdf_path,output_folder,logger,'engineering',selections)
# print(files)
# selection = 1 # 例如1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
# generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
# print(generated_files)
# print("生成的文件:", generated_files)
end_time = time.time()

View File

@ -1,22 +1,14 @@
import glob
import logging
from PyPDF2 import PdfReader, PdfWriter
import regex # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
from flask_app.general.format_change import docx2pdf
from flask_app.general.merge_pdfs import merge_and_cleanup, merge_selected_pdfs_for_goods
import concurrent.futures
from flask_app.general.merge_pdfs import merge_and_cleanup
from flask_app.general.截取pdf通用函数 import get_start_and_common_header
from flask_app.general.通用功能函数 import get_global_logger
def get_global_logger(unique_id):
    """Return the logger named *unique_id*, or the default logger when it is None."""
    # `logging.getLogger()` with no argument yields the default (root) logger.
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)
def is_pdf_or_doc(filename):
    """Return True when *filename* ends with .pdf, .doc or .docx (case-insensitive)."""
    supported_extensions = ('.pdf', '.doc', '.docx')
    lowered_name = filename.lower()
    return lowered_name.endswith(supported_extensions)
@ -28,21 +20,6 @@ def convert_to_pdf(file_path):
return docx2pdf(file_path)
return file_path
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Convert the input to PDF, extract the requested pages and normalise the result to a list.

    Returns:
        list: ``[""]`` when ``extract_pages`` yields a falsy result;
        ``[path1, path2]`` (None replaced by "") for the "tobidders_notice"
        suffix; otherwise a single-element list holding the extraction result.
    """
    pdf_path = convert_to_pdf(file_path)
    extracted = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)

    # Nothing extracted: report one empty placeholder.
    if not extracted:
        return [""]

    if output_suffix == "tobidders_notice":
        # The result is unpacked as a pair here; None entries become "".
        first_part, second_part = extracted
        return [first_part or "", second_part or ""]

    if output_suffix == "qualification1":
        # Hand the extracted file to merge_and_cleanup together with the
        # "qualification3" suffix before returning it.
        merge_and_cleanup(extracted, "qualification3")

    return [extracted]
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
output_suffix="normal"):
start_page = None
@ -124,7 +101,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page)
elif output_suffix == "qualification1":
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
except Exception as e:
print(f"Error processing {pdf_path}: {e}")
@ -171,26 +148,6 @@ def get_patterns_for_notice():
)
return begin_pattern, end_pattern
# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
# exclusion_pattern):
# start_page = None
# mid_page = None
# end_page = None
# for i, page in enumerate(pdf_document.pages):
# text = page.extract_text() or ""
# cleaned_text = clean_page_content(text, common_header)
# if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
# continue
# if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page:
# start_page = i
# if start_page is not None and mid_page is None and regex.search(
# r'^\s*[(]?\s*[一1]\s*[)]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text):
# mid_page = i
# if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page:
# end_page = i
# break
# return start_page, mid_page, end_page
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
"""
从PDF文档中提取起始页中间页和结束页
@ -510,7 +467,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
new_file.write(original_file.read())
return new_file_path
else:
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
temp = truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification2")
if len(temp) > 0:
return temp[0]
else:
@ -558,33 +515,8 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
print(f"Error in save_extracted_pages: {e}")
return "" # 返回空字符串
def get_start_and_common_header(input_path):
    """Find the common page header and the first announcement page of a PDF.

    Only pages with index 0..10 are inspected. Pages without extractable text
    and pages matching the table-of-contents pattern are skipped.

    Returns:
        tuple: ``(common_header, page_index)`` where ``page_index`` is the
        0-based index of the first inspected page whose cleaned text matches
        the announcement/invitation heading pattern, or 0 when none matches.
    """
    common_header = extract_common_header(input_path)
    heading_pattern = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # Table-of-contents pages list the heading too and must not count as a match.
    toc_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    reader = PdfReader(input_path)
    for page_index, page in enumerate(reader.pages):
        if page_index > 10:
            # Give up after the first eleven pages.
            break
        raw_text = page.extract_text()
        if not raw_text:
            continue
        page_text = clean_page_content(raw_text, common_header)
        if toc_pattern.search(page_text):
            continue
        if heading_pattern.search(page_text):
            return common_header, page_index
    return common_header, 0
def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
def truncate_pdf_main_goods(input_path, output_folder, selection, output_suffix="default"):
try:
# 检查是否为文件夹
if os.path.isdir(input_path):
@ -619,7 +551,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
os.makedirs(output_folder)
# 获取起始和通用页眉
common_header, last_begin_index = get_start_and_common_header(input_path)
common_header, last_begin_index = get_start_and_common_header(input_path,10)
begin_page = last_begin_index if last_begin_index != 0 else {
4: 1,
2: 5,
@ -676,145 +608,43 @@ def process_input(input_path, output_folder, selection, output_suffix):
if output_suffix == "default":
output_suffix = local_output_suffix
# 调用实际处理文件内容的函数
return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
# 将原先的 process_files 逻辑合并到此处
pdf_path = convert_to_pdf(input_path)
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
# 根据提取结果以及不同的 output_suffix 进行处理
if result:
if output_suffix == "tobidders_notice":
# 确保返回的是元组,并将其中的 None 转换为 ""
path1, path2 = result
return [path1 or "", path2 or ""]
elif output_suffix == "qualification1":
merge_and_cleanup(result, "qualification3")
return [result or ""]
return [result or ""]
return [""]
except Exception as e:
print(f"Error in process_input: {e}")
return ['']
def truncate_pdf_multiple(pdf_path, output_folder, logger):
    """Truncate selections 1-5 of a PDF in parallel and merge the base-info parts.

    Args:
        pdf_path (str): Path of the PDF to split.
        output_folder (str): Folder passed to ``truncate_pdf_main`` and used
            for the ``*_merged_baseinfo.pdf`` output path.
        logger: Logger used for progress and error messages.

    Returns:
        list: Paths returned by each selection (in selection order) followed
        by the merged file path, or "" when merging failed. Seven empty
        strings are returned when the PDF has 50 pages or fewer or its page
        count cannot be read.
    """
    # Short documents (or unreadable ones) are not split at all.
    try:
        page_total = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {page_total}")
        if page_total <= 50:
            logger.info(f"PDF页数小于或等于50页跳过切分逻辑。")
            return [''] * 7
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        return [''] * 7

    file_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    collected = []
    wanted = range(1, 6)

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        # Submit everything up front, then read results back in selection order.
        tasks = {}
        for sel in wanted:
            tasks[sel] = pool.submit(truncate_pdf_main, pdf_path, output_folder, sel, output_suffix="default")
        for sel in wanted:
            try:
                produced = tasks[sel].result()
            except Exception as e:
                logger.error(f"Selection {sel} 生成了一个异常: {e}")
                continue
            if produced:
                collected.extend(produced)

    merged_target = os.path.join(output_folder, f"{file_stem}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs_for_goods(output_folder, collected, merged_target, file_stem)
    if not merged_path:
        # Keep the trailing "merged" slot even when merging failed.
        collected.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
    else:
        collected.append(merged_path)
    logger.info("已截取文件路径" + str(collected))
    return collected
# 小解析,只需要前三章内容
def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
    """Truncate the given selections of a PDF in parallel and merge the results.

    Args:
        pdf_path (str): Path of the PDF to process.
        output_folder (str): Folder where truncated files are saved.
        selections (iterable): Selection numbers to process.
        unique_id (str): Identifier passed to ``get_global_logger``.

    Returns:
        list: Truncated file paths in selection order followed by the merged
        file path, or "" when merging failed. Four empty strings are returned
        when the PDF has 50 pages or fewer or its page count cannot be read.
    """
    logger = get_global_logger(unique_id)

    # Short documents (or unreadable ones) are not split at all.
    try:
        page_total = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {page_total}")
        if page_total <= 50:
            logger.info(f"PDF页数小于或等于50页跳过切分逻辑。")
            return [''] * 4
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        return [''] * 4

    file_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    collected = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as pool:
        # Submit everything up front, then read results back in selection order.
        tasks = {}
        for sel in selections:
            tasks[sel] = pool.submit(truncate_pdf_main, pdf_path, output_folder, sel, output_suffix="default")
        for sel in selections:
            try:
                produced = tasks[sel].result()
            except Exception as e:
                logger.error(f"Selection {sel} 生成了一个异常: {e}")
                continue
            if not produced:
                continue
            # A worker may hand back either a list of paths or a single path.
            if isinstance(produced, list):
                collected.extend(produced)
            elif isinstance(produced, str):
                collected.append(produced)

    merged_target = os.path.join(output_folder, f"{file_stem}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs_for_goods(output_folder, collected, merged_target, file_stem)
    if not merged_path:
        # Keep the trailing "merged" slot even when merging failed.
        collected.append("")
        logger.warning(f"合并失败没有生成merged_baseinfo for {pdf_path}")
    else:
        collected.append(merged_path)
    return collected
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章
# ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
input_path=r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\ztbfile.pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
# files = truncate_pdf_multiple(input_path, output_folder,logger)
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# selections = [1, 4]
# files = truncate_pdf_multiple(pdf_path, output_folder,logger,selections)
# print(files)
selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files)
# selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
# generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection)
# print(generated_files)