12.17 无效投标、废标
This commit is contained in:
parent
8149769bab
commit
cb153889a4
@ -91,21 +91,20 @@ def find_and_merge(target_path, output_suffix):
|
||||
paths=[target_path,full_path]
|
||||
merge_pdfs(paths,target_path)
|
||||
|
||||
def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_path, base_file_name):
|
||||
def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name, mode='engineering'):
|
||||
"""
|
||||
合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件,
|
||||
以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。
|
||||
通用的 PDF 合并函数,根据不同的模式合并指定的文件。
|
||||
|
||||
参数:
|
||||
- output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。
|
||||
- truncate_files (list): 包含 PDF 文件路径的列表。
|
||||
- output_path (str): 合并后的 PDF 文件保存路径。
|
||||
- base_file_name (str): 用于匹配文件名的基础名称。
|
||||
- mode (str): 合并模式,支持 'engineering' 和 'goods'。
|
||||
|
||||
返回:
|
||||
- str: 合并后的 PDF 文件路径,如果未找到所有需要合并的文件则返回 ""。
|
||||
- str: 如果合并成功,返回 output_path;否则,返回空字符串 ""。
|
||||
"""
|
||||
# 1. 获取 output_folder 中所有文件
|
||||
try:
|
||||
all_output_files = os.listdir(output_folder)
|
||||
except FileNotFoundError:
|
||||
@ -115,50 +114,82 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa
|
||||
print(f"没有权限访问输出文件夹 '{output_folder}'。")
|
||||
return ""
|
||||
|
||||
# 2. 定义要选择的文件后缀及合并顺序,包括 before 文件
|
||||
desired_suffixes = [
|
||||
f'{base_file_name}_before.pdf',
|
||||
f'{base_file_name}_notice.pdf',
|
||||
f'{base_file_name}_tobidders_notice_table.pdf'
|
||||
]
|
||||
if mode == 'engineering':
|
||||
required_suffixes = [
|
||||
f'{base_file_name}_before.pdf',
|
||||
f'{base_file_name}_notice.pdf',
|
||||
f'{base_file_name}_tobidders_notice_table.pdf'
|
||||
]
|
||||
optional_suffixes = []
|
||||
elif mode == 'goods':
|
||||
required_suffixes = [
|
||||
f'{base_file_name}_before.pdf',
|
||||
f'{base_file_name}_notice.pdf',
|
||||
f'{base_file_name}_tobidders_notice_part1.pdf'
|
||||
]
|
||||
optional_suffixes = [
|
||||
f'{base_file_name}_tobidders_notice_part2.pdf'
|
||||
]
|
||||
else:
|
||||
print(f"未知的合并模式: {mode}")
|
||||
return ""
|
||||
|
||||
all_pdfs_to_merge = []
|
||||
missing_files = [] # 用于记录缺失的文件
|
||||
missing_files = []
|
||||
|
||||
for suffix in desired_suffixes:
|
||||
# 处理必需的文件
|
||||
for suffix in required_suffixes:
|
||||
if suffix == f'{base_file_name}_before.pdf':
|
||||
# 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
|
||||
matching_files = [
|
||||
os.path.join(output_folder, f)
|
||||
for f in all_output_files
|
||||
if f.endswith(suffix)
|
||||
]
|
||||
else:
|
||||
# 从 truncate_files 中选择以指定后缀结尾的文件
|
||||
matching_files = [f for f in truncate_files if f.endswith(suffix)]
|
||||
|
||||
if matching_files:
|
||||
# 如果找到多个匹配的文件,按名称排序并添加
|
||||
matching_files_sorted = sorted(matching_files)
|
||||
all_pdfs_to_merge.extend(matching_files_sorted)
|
||||
for f in matching_files_sorted:
|
||||
print(f"选中文件: {f}")
|
||||
else:
|
||||
print(f"没有找到以 '{suffix}' 结尾的文件。")
|
||||
missing_files.append(suffix) # 记录缺失的文件
|
||||
missing_files.append(suffix)
|
||||
|
||||
# 检查是否所有需要的文件都已找到
|
||||
# 处理可选的文件
|
||||
for suffix in optional_suffixes:
|
||||
if suffix == f'{base_file_name}_before.pdf':
|
||||
matching_files = [
|
||||
os.path.join(output_folder, f)
|
||||
for f in all_output_files
|
||||
if f.endswith(suffix)
|
||||
]
|
||||
else:
|
||||
matching_files = [f for f in truncate_files if f.endswith(suffix)]
|
||||
|
||||
if matching_files:
|
||||
matching_files_sorted = sorted(matching_files)
|
||||
all_pdfs_to_merge.extend(matching_files_sorted)
|
||||
for f in matching_files_sorted:
|
||||
print(f"选中文件: {f}")
|
||||
else:
|
||||
print(f"可选文件 '{suffix}' 未找到,继续合并。")
|
||||
|
||||
# 检查是否所有必需的文件都已找到
|
||||
if missing_files:
|
||||
print("缺少以下必要的 PDF 文件,无法进行合并:")
|
||||
for missing in missing_files:
|
||||
print(f" - {missing}")
|
||||
return ""
|
||||
|
||||
print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
|
||||
|
||||
# 过滤掉不存在或为空的文件路径
|
||||
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)]
|
||||
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0]
|
||||
|
||||
if not all_pdfs_to_merge:
|
||||
print("没有找到要合并的 PDF 文件。")
|
||||
print("没有找到要合并的有效 PDF 文件。")
|
||||
return ""
|
||||
|
||||
# 调用 merge_pdfs 函数进行合并
|
||||
@ -171,134 +202,6 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa
|
||||
|
||||
# 检查合并后的文件是否存在且不为空
|
||||
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
|
||||
# 合并成功,删除 {base_file_name}_before.pdf 文件
|
||||
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
|
||||
if os.path.exists(before_pdf_path):
|
||||
try:
|
||||
os.remove(before_pdf_path)
|
||||
# print(f"已删除文件: {before_pdf_path}")
|
||||
except Exception as e:
|
||||
print(f"删除文件 {before_pdf_path} 时出错: {e}")
|
||||
else:
|
||||
print(f"未找到要删除的文件: {before_pdf_path}")
|
||||
|
||||
return output_path
|
||||
else:
|
||||
print(f"合并失败,没有生成 '{output_path}'。")
|
||||
return ""
|
||||
|
||||
|
||||
#合并封面+招标公告+投标人须知前附表+须知正文
|
||||
def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, base_file_name):
|
||||
"""
|
||||
合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件,
|
||||
以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。
|
||||
|
||||
参数:
|
||||
- output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。
|
||||
- truncate_files (list): 包含 PDF 文件路径的列表。
|
||||
- output_path (str): 合并后的 PDF 文件保存路径。
|
||||
- base_file_name (str): 用于匹配文件名的基础名称。
|
||||
|
||||
返回:
|
||||
- str: 如果合并成功,返回 output_path;否则,返回空字符串 ""。
|
||||
"""
|
||||
# 1. 获取 output_folder 中所有文件
|
||||
try:
|
||||
all_output_files = os.listdir(output_folder)
|
||||
except FileNotFoundError:
|
||||
print(f"输出文件夹 '{output_folder}' 未找到。")
|
||||
return ""
|
||||
except PermissionError:
|
||||
print(f"没有权限访问输出文件夹 '{output_folder}'。")
|
||||
return ""
|
||||
|
||||
# 2. 定义要选择的文件后缀及合并顺序
|
||||
required_suffixes = [
|
||||
f'{base_file_name}_before.pdf',
|
||||
f'{base_file_name}_notice.pdf',
|
||||
f'{base_file_name}_tobidders_notice_part1.pdf'
|
||||
]
|
||||
|
||||
optional_suffixes = [
|
||||
f'{base_file_name}_tobidders_notice_part2.pdf'
|
||||
]
|
||||
|
||||
all_pdfs_to_merge = []
|
||||
missing_files = [] # 用于记录缺失的文件
|
||||
|
||||
# 3. 处理必需的文件
|
||||
for suffix in required_suffixes:
|
||||
if suffix == f'{base_file_name}_before.pdf':
|
||||
# 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
|
||||
matching_files = [
|
||||
os.path.join(output_folder, f)
|
||||
for f in all_output_files
|
||||
if f.endswith(suffix)
|
||||
]
|
||||
else:
|
||||
# 从 truncate_files 中选择以指定后缀结尾的文件
|
||||
matching_files = [f for f in truncate_files if f.endswith(suffix)]
|
||||
|
||||
if matching_files:
|
||||
# 如果找到多个匹配的文件,按名称排序并添加
|
||||
matching_files_sorted = sorted(matching_files)
|
||||
all_pdfs_to_merge.extend(matching_files_sorted)
|
||||
# for f in matching_files_sorted:
|
||||
# print(f"选中文件: {f}")
|
||||
else:
|
||||
print(f"没有找到以 '{suffix}' 结尾的文件。")
|
||||
missing_files.append(suffix) # 记录缺失的文件
|
||||
|
||||
# 4. 处理可选的文件
|
||||
for suffix in optional_suffixes:
|
||||
if suffix == f'{base_file_name}_before.pdf':
|
||||
# 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
|
||||
matching_files = [
|
||||
os.path.join(output_folder, f)
|
||||
for f in all_output_files
|
||||
if f.endswith(suffix)
|
||||
]
|
||||
else:
|
||||
# 从 truncate_files 中选择以指定后缀结尾的文件
|
||||
matching_files = [f for f in truncate_files if f.endswith(suffix)]
|
||||
|
||||
if matching_files:
|
||||
# 如果找到多个匹配的文件,按名称排序并添加
|
||||
matching_files_sorted = sorted(matching_files)
|
||||
all_pdfs_to_merge.extend(matching_files_sorted)
|
||||
for f in matching_files_sorted:
|
||||
print(f"选中文件: {f}")
|
||||
else:
|
||||
print(f"可选文件 '{suffix}' 未找到,继续合并。")
|
||||
|
||||
# 5. 检查是否所有必需的文件都已找到
|
||||
if missing_files:
|
||||
print("缺少以下必要的 PDF 文件,无法进行合并:")
|
||||
for missing in missing_files:
|
||||
print(f" - {missing}")
|
||||
return ""
|
||||
|
||||
print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
|
||||
|
||||
# 6. 过滤掉不存在或为空的文件路径
|
||||
all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0]
|
||||
|
||||
if not all_pdfs_to_merge:
|
||||
print("没有找到要合并的有效 PDF 文件。")
|
||||
return ""
|
||||
|
||||
# 7. 调用 merge_pdfs 函数进行合并
|
||||
try:
|
||||
merge_pdfs(all_pdfs_to_merge, output_path)
|
||||
print(f"已成功合并 PDF 文件到 '{output_path}'。")
|
||||
except Exception as e:
|
||||
print(f"合并 PDF 文件时出错: {e}")
|
||||
return ""
|
||||
|
||||
# 8. 检查合并后的文件是否存在且不为空
|
||||
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
|
||||
# 合并成功,删除 {base_file_name}_before.pdf 文件
|
||||
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
|
||||
if os.path.exists(before_pdf_path):
|
||||
try:
|
||||
@ -308,7 +211,6 @@ def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, ba
|
||||
print(f"删除文件 {before_pdf_path} 时出错: {e}")
|
||||
else:
|
||||
print(f"未找到要删除的文件: {before_pdf_path}")
|
||||
|
||||
return output_path
|
||||
else:
|
||||
print(f"合并失败,没有生成 '{output_path}'。")
|
||||
|
127
flask_app/general/截取pdf_main.py
Normal file
127
flask_app/general/截取pdf_main.py
Normal file
@ -0,0 +1,127 @@
|
||||
import concurrent.futures
|
||||
import os
|
||||
import time
|
||||
|
||||
from flask_app.general.merge_pdfs import merge_selected_pdfs
|
||||
from flask_app.general.截取pdf通用函数 import check_pdf_pages
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
|
||||
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
|
||||
|
||||
|
||||
def truncate_pdf_multiple(pdf_path, output_folder, logger, mode='goods', selections=None):
    """Split a tender PDF into the selected sections, then merge the base-info parts.

    Each selection is handed to the mode-specific truncation function on its own
    worker thread. The results are gathered in the order of ``selections`` and the
    merged "base info" PDF path (or ``""`` on failure) is appended as the last item.

    Args:
        pdf_path (str): Path of the input PDF file.
        output_folder (str): Folder that receives the truncated and merged files.
        logger (Logger): Logger used for progress and error messages.
        mode (str, optional): ``'goods'`` or ``'engineering'``. Defaults to ``'goods'``.
        selections (list[int], optional): Section numbers to extract.
            Defaults to ``[1, 2, 3, 4, 5]``.

    Returns:
        list: Paths of the truncated files followed by the merged file path.
        Failed steps are represented by empty strings. When ``check_pdf_pages``
        decides the PDF must not be split, its placeholder list is returned as is.

    Raises:
        ValueError: If ``mode`` is neither ``'goods'`` nor ``'engineering'``.
    """
    if selections is None:
        selections = [1, 2, 3, 4, 5]

    # Reject unknown modes before doing any work.
    if mode not in ('goods', 'engineering'):
        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")

    def handle_exception(selection):
        # Selection 4 gets two placeholders, every other selection one
        # (presumably because selection 4 produces two files - confirm against
        # the truncation functions).
        return ["", ""] if selection == 4 else [""]

    # Pick the truncation routine that matches the tender type.
    if mode == 'goods':
        truncate_function = truncate_pdf_main_goods
    else:  # mode == 'engineering'
        truncate_function = truncate_pdf_main_engineering

    # Flag 1 = five or more selections requested, flag 2 = fewer than five;
    # check_pdf_pages uses it to size its placeholder result.
    num_selections = len(selections)
    mode_flag = 1 if num_selections >= 5 else 2

    res = check_pdf_pages(pdf_path, mode_flag, logger)
    if res is not None:
        return res  # list of empty strings: the PDF is not split

    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # bare file name
    truncate_files = []

    # ThreadPoolExecutor rejects max_workers=0, so keep at least one worker even
    # when the caller passes an empty selection list.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, num_selections)) as executor:
        # A list of pairs (rather than a dict keyed by selection) keeps the
        # submission order and does not collapse duplicate selections.
        submitted = [
            (
                selection,
                executor.submit(truncate_function, pdf_path, output_folder, selection, logger),
            )
            for selection in selections
        ]

        # Collect the results in the same order the selections were given.
        for selection, future in submitted:
            try:
                files = future.result()
                if files:
                    truncate_files.extend(files)
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                # Keep the positional layout of the result list intact.
                truncate_files.extend(handle_exception(selection))

    # Target path of the merged base-info PDF.
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")

    merged_path = merge_selected_pdfs(
        output_folder,
        truncate_files,
        merged_output_path,
        base_file_name,
        mode=mode
    )

    if merged_path:
        # Merge succeeded: the merged file becomes the last entry.
        truncate_files.append(merged_path)
        logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'。")
    else:
        # Merge failed: keep the slot with an empty placeholder.
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")

    logger.info("已截取文件路径: " + str(truncate_files))
    return truncate_files
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run: truncate one local engineering tender PDF and time it.
    # Selection legend: 1 - tender notice, 2 - evaluation method,
    # 3 - qualification review, 4 - instructions to bidders (table + body),
    # 5 - invalid bids.
    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest5.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
    selections = [5]  # e.g. [1, 4] would process only selections 1 and 4

    logger = get_global_logger("123")
    start_time = time.time()

    files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering', selections)
    print(files)

    elapsed = time.time() - start_time
    print("耗时:" + str(elapsed))
|
57
flask_app/general/截取pdf通用函数.py
Normal file
57
flask_app/general/截取pdf通用函数.py
Normal file
@ -0,0 +1,57 @@
|
||||
#flask_app/general/截取pdf通用函数.py
|
||||
import concurrent.futures
|
||||
import os
|
||||
|
||||
import regex
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||||
|
||||
def get_start_and_common_header(input_path, end_page):
    """Return the common page header and the index of the first notice page.

    Pages are scanned from the start of the document. Pages without extractable
    text and pages matching the table-of-contents pattern are skipped. The first
    remaining page whose cleaned text matches the notice/invitation heading
    pattern ends the scan.

    Args:
        input_path: Path of the PDF to inspect.
        end_page: Highest 0-based page index that is still examined.

    Returns:
        tuple: ``(common_header, page_index)``. ``page_index`` is 0 when no
        matching page is found within the first ``end_page + 1`` pages.
    """
    header = extract_common_header(input_path)

    # Heading of the tender notice / invitation section, not preceded by
    # reference words or quotation marks.
    notice_heading = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
        regex.MULTILINE
    )
    # Table-of-contents marker.
    toc_heading = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    reader = PdfReader(input_path)
    for page_no, page in enumerate(reader.pages):
        if page_no > end_page:
            break  # scan window exhausted without a match
        raw_text = page.extract_text()
        if not raw_text:
            continue
        body = clean_page_content(raw_text, header)
        if toc_heading.search(body):
            continue  # table-of-contents pages never count as the start
        if notice_heading.search(body):
            return header, page_no  # 0-based index of the first match

    return header, 0
|
||||
|
||||
def check_pdf_pages(pdf_path, mode, logger):
    """Decide whether a PDF has enough pages to be split.

    Args:
        pdf_path: Path of the PDF to inspect.
        mode: ``1`` selects a seven-element placeholder result, any other
            value a four-element one.
        logger: Logger used for the page count and for read errors.

    Returns:
        ``None`` when the PDF has more than 50 pages (processing continues).
        Otherwise, or when the PDF cannot be read, a list of empty strings
        whose length depends on ``mode``.
    """
    placeholder_count = 7 if mode == 1 else 4
    try:
        page_total = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {page_total}")
        if page_total > 50:
            # Long enough: signal the caller to go on with the split.
            return None
        logger.info("PDF页数小于或等于50页,跳过切分逻辑。")
    except Exception as e:
        # Best effort: an unreadable PDF is reported and treated like a short one.
        logger.error(f"无法读取 PDF 页数: {e}")
    return [''] * placeholder_count
|
||||
|
||||
|
||||
|
@ -1,12 +1,9 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import ast
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.工程标.判断是否分包等 import read_questions_from_judge
|
||||
|
||||
def process_judge_questions(judge_file_path, chosen_numbers, file_id, baseinfo_list1):
|
||||
@ -72,3 +69,9 @@ def process_string_list(string_list):
|
||||
# 如果没有匹配到内容,返回空列表
|
||||
return []
|
||||
|
||||
def get_global_logger(unique_id):
    """Return the logger named ``unique_id``, or the root logger when it is None."""
    if unique_id is not None:
        return logging.getLogger(unique_id)
    return logging.getLogger()  # default (root) logger
|
||||
|
||||
|
@ -6,7 +6,7 @@ import time
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_main
|
||||
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
prompt = """
|
||||
# 角色
|
||||
|
@ -3,7 +3,7 @@ import json
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
|
@ -3,7 +3,7 @@ import json
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
|
@ -1,27 +1,16 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import string
|
||||
import time
|
||||
|
||||
from flask_app.general.doubao import doubao_model
|
||||
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
|
||||
from flask_app.general.insert_del_pagemark import insert_mark
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.general.截取pdf_main import truncate_pdf_multiple
|
||||
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
|
||||
from flask_app.货物标.技术参数要求提取后处理函数 import extract_matching_keys
|
||||
from flask_app.货物标.资格审查main import combine_qualification_review
|
||||
import concurrent.futures
|
||||
|
||||
def get_global_logger(unique_id):
|
||||
if unique_id is None:
|
||||
return logging.getLogger() # 获取默认的日志器
|
||||
logger = logging.getLogger(unique_id)
|
||||
return logger
|
||||
|
||||
|
||||
logger = None
|
||||
def get_nested(dic, keys, default=None):
|
||||
for key in keys:
|
||||
@ -367,7 +356,7 @@ def get_tech_and_business_deviation(file_path,file_type,unique_id,output_folder)
|
||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||
return None
|
||||
selections=[1,3,5]
|
||||
files=truncate_pdf_specific_goods(pdf_path,output_folder,selections,unique_id)
|
||||
files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections)
|
||||
notice_path=files[0]
|
||||
qualification_file=files[1]
|
||||
procurement_file=files[2]
|
||||
|
@ -8,21 +8,14 @@ from flask_app.general.format_change import docx2pdf
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
|
||||
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
|
||||
from flask_app.general.截取pdf_main import truncate_pdf_multiple
|
||||
from flask_app.general.post_processing import inner_post_processing
|
||||
from flask_app.工程标.基础信息整合工程标 import aggregate_basic_info_engineering
|
||||
|
||||
|
||||
def get_global_logger(unique_id):
|
||||
if unique_id is None:
|
||||
return logging.getLogger() # 获取默认的日志器
|
||||
logger = logging.getLogger(unique_id)
|
||||
return logger
|
||||
|
||||
#货物标
|
||||
def little_parse_goods(output_folder, pdf_path):
|
||||
def little_parse_goods(output_folder, pdf_path,logger):
|
||||
"""
|
||||
解析货物相关的基础信息。
|
||||
|
||||
@ -35,7 +28,7 @@ def little_parse_goods(output_folder, pdf_path):
|
||||
"""
|
||||
# 截取特定的货物 PDF 文件
|
||||
selections = [1,4] # 仅处理 selection 1和4 #公告+投标人须知
|
||||
files = truncate_pdf_specific_goods(pdf_path, output_folder,selections)
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods',selections)
|
||||
if not files:
|
||||
raise ValueError("未找到截取后的文件。")
|
||||
# 假设最后一个文件是需要处理的基础信息文件
|
||||
@ -58,7 +51,7 @@ def little_parse_goods(output_folder, pdf_path):
|
||||
return {"基础信息": aggregated_baseinfo}
|
||||
|
||||
|
||||
def little_parse_engineering(output_folder, pdf_path):
|
||||
def little_parse_engineering(output_folder, pdf_path,logger):
|
||||
"""
|
||||
解析工程相关的基础信息。
|
||||
|
||||
@ -71,13 +64,14 @@ def little_parse_engineering(output_folder, pdf_path):
|
||||
"""
|
||||
# 截取特定的工程 PDF 文件
|
||||
selections = [ 1,4] #公告+投标人须知前附表
|
||||
files = truncate_pdf_specific_engineering(pdf_path, output_folder,selections)
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering',selections)
|
||||
if not files:
|
||||
raise ValueError("未找到截取后的文件。")
|
||||
# 假设最后一个文件是需要处理的基础信息文件
|
||||
baseinfo_file_path = files[-1]
|
||||
if not baseinfo_file_path:
|
||||
baseinfo_file_path=truncate_pdf_main(pdf_path,output_folder,5)[0] #invalid_path
|
||||
selections=[5]
|
||||
baseinfo_file_path=truncate_pdf_multiple(pdf_path,output_folder,logger,selections)[0] #invalid_path
|
||||
# 上传文件并获取文件 ID
|
||||
file_id = upload_file(baseinfo_file_path)
|
||||
# 注意:以下路径被硬编码,确保该路径存在并且正确
|
||||
@ -123,11 +117,11 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id):
|
||||
return None
|
||||
# 根据招标类型调用相应的解析函数
|
||||
if zb_type == 2: # 货物标
|
||||
combined_data = little_parse_goods(output_folder, pdf_path)
|
||||
combined_data = little_parse_goods(output_folder, pdf_path,logger)
|
||||
logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
|
||||
res = inner_post_processing(combined_data["基础信息"])
|
||||
elif zb_type==1: #工程标
|
||||
combined_data = little_parse_engineering(output_folder, pdf_path)
|
||||
combined_data = little_parse_engineering(output_folder, pdf_path,logger)
|
||||
logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
|
||||
res = inner_post_processing(combined_data["基础信息"])
|
||||
# 定义保存JSON文件的路径
|
||||
|
@ -8,8 +8,9 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
from docx import Document
|
||||
|
||||
from flask_app.general.insert_del_pagemark import insert_mark,delete_mark
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.general.截取pdf_main import truncate_pdf_multiple
|
||||
from flask_app.general.merge_pdfs import merge_pdfs
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.general.无效标和废标公共代码 import combine_find_invalid
|
||||
@ -20,16 +21,9 @@ from flask_app.工程标.资格审查模块 import combine_review_standards
|
||||
from flask_app.general.商务技术评分提取 import combine_evaluation_standards
|
||||
from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||
|
||||
|
||||
def get_global_logger(unique_id):
|
||||
if unique_id is None:
|
||||
return logging.getLogger() # 获取默认的日志器
|
||||
logger = logging.getLogger(unique_id)
|
||||
return logger
|
||||
|
||||
# 创建全局线程池
|
||||
executor = ThreadPoolExecutor()
|
||||
def preprocess_files(output_folder, file_path, file_type,unique_id,logger):
|
||||
def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
logger.info("starting 文件预处理...")
|
||||
logger.info("output_folder..." + output_folder)
|
||||
start_time=time.time()
|
||||
@ -47,20 +41,20 @@ def preprocess_files(output_folder, file_path, file_type,unique_id,logger):
|
||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||
return None
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id)
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
|
||||
print("切割出的文件:"+str(truncate_files))
|
||||
|
||||
# 处理各个部分
|
||||
tobidders_notice_table=truncate_files[0] #投标人须知前附表
|
||||
notice_path=truncate_files[0] #招标公告
|
||||
|
||||
tobidders_notice = truncate_files[1] #投标人须知正文
|
||||
evaluation_method = truncate_files[1] #评标方法
|
||||
|
||||
evaluation_method = truncate_files[2] #评标方法
|
||||
qualification = truncate_files[2] #资格审查
|
||||
|
||||
qualification = truncate_files[3] #资格审查
|
||||
notice_path=truncate_files[4] #公告
|
||||
tobidders_notice_table = truncate_files[3] #投标人须知前附表
|
||||
tobidders_notice=truncate_files[4] #投标人须知正文
|
||||
|
||||
invalid_path=truncate_files[5]
|
||||
invalid_path=truncate_files[5] #无效标
|
||||
# invalid_docpath = copy_docx(docx_path) # docx截取无效标部分
|
||||
# invalid_docpath=pdf2docx(invalid_path)
|
||||
invalid_added_pdf = insert_mark(invalid_path)
|
||||
@ -215,7 +209,7 @@ def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path_more, clause_pa
|
||||
def engineering_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
logger = get_global_logger(unique_id)
|
||||
# 预处理文件,获取处理后的数据
|
||||
processed_data = preprocess_files(output_folder, file_path, file_type,unique_id,logger)
|
||||
processed_data = preprocess_files(output_folder, file_path, file_type,logger)
|
||||
if not processed_data:
|
||||
yield json.dumps({}) # 如果处理数据失败,返回空的 JSON
|
||||
|
||||
|
@ -7,23 +7,16 @@ from docx import Document
|
||||
from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx
|
||||
from flask_app.general.insert_del_pagemark import insert_mark, delete_mark
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.货物标.基础信息解析main import combine_basic_info
|
||||
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
|
||||
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple
|
||||
from flask_app.general.截取pdf_main import truncate_pdf_multiple
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import concurrent.futures
|
||||
from flask_app.货物标.提取json货物标版 import convert_clause_to_json
|
||||
from flask_app.general.无效标和废标公共代码 import combine_find_invalid
|
||||
from flask_app.货物标.资格审查main import combine_qualification_review
|
||||
from flask_app.general.商务技术评分提取 import combine_evaluation_standards
|
||||
import logging
|
||||
|
||||
|
||||
def get_global_logger(unique_id):
|
||||
if unique_id is None:
|
||||
return logging.getLogger() # 获取默认的日志器
|
||||
logger = logging.getLogger(unique_id)
|
||||
return logger
|
||||
|
||||
# 创建全局线程池
|
||||
executor = ThreadPoolExecutor()
|
||||
@ -47,7 +40,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
return None
|
||||
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger) # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
|
||||
|
||||
# 处理各个部分
|
||||
invalid_path=pdf_path
|
||||
|
@ -1,21 +1,14 @@
|
||||
#flask_app/工程标/截取pdf工程标版.py
|
||||
import regex
|
||||
import os
|
||||
import time
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
|
||||
from flask_app.general.merge_pdfs import merge_selected_pdfs_for_engineering
|
||||
import concurrent.futures
|
||||
import logging
|
||||
from flask_app.general.clean_pdf import clean_page_content
|
||||
|
||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
|
||||
|
||||
def get_global_logger(unique_id):
|
||||
if unique_id is None:
|
||||
return logging.getLogger() # 获取默认的日志器
|
||||
logger = logging.getLogger(unique_id)
|
||||
return logger
|
||||
|
||||
|
||||
logger = None
|
||||
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header):
|
||||
try:
|
||||
# 获取文件基本名称
|
||||
@ -301,67 +294,39 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
||||
return []
|
||||
|
||||
|
||||
def get_start_and_common_header(input_path):
|
||||
common_header = extract_common_header(input_path)
|
||||
last_begin_index = 0
|
||||
begin_pattern = regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
# 新增目录匹配模式
|
||||
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
||||
|
||||
pdf_document = PdfReader(input_path)
|
||||
for i, page in enumerate(pdf_document.pages):
|
||||
if i > 10:
|
||||
return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
# 检查是否存在"目录"
|
||||
if catalog_pattern.search(cleaned_text):
|
||||
continue # 如果存在目录,跳过当前页面
|
||||
if begin_pattern.search(cleaned_text):
|
||||
last_begin_index = i # 更新第一个匹配的索引,页码从0开始
|
||||
return common_header, last_begin_index
|
||||
return common_header, last_begin_index
|
||||
|
||||
def truncate_pdf_main(input_path, output_folder, selection):
|
||||
def truncate_pdf_main_engineering(input_path, output_folder, selection,logger):
|
||||
if os.path.isdir(input_path):
|
||||
generated_files = []
|
||||
for file_name in os.listdir(input_path):
|
||||
if file_name.endswith('.pdf'):
|
||||
file_path = os.path.join(input_path, file_name)
|
||||
files = truncate_pdf_main(file_path, output_folder, selection)
|
||||
files = truncate_pdf_main_engineering(file_path, output_folder, selection)
|
||||
if files:
|
||||
generated_files.extend(files)
|
||||
return generated_files
|
||||
elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
|
||||
# base_file_name = os.path.splitext(os.path.basename(input_path))[0]
|
||||
common_header, last_begin_index = get_start_and_common_header(input_path)
|
||||
common_header, last_begin_index = get_start_and_common_header(input_path,20)
|
||||
# print(last_begin_index)
|
||||
if selection == 1:
|
||||
# Selection 1: 投标人须知前附表
|
||||
# Selection 1: 招标公告
|
||||
pattern_pairs = [
|
||||
(
|
||||
regex.compile(
|
||||
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE),
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
regex.MULTILINE)
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
|
||||
regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
|
||||
),
|
||||
(
|
||||
regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
|
||||
regex.MULTILINE
|
||||
),
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
regex.MULTILINE),
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
regex.MULTILINE)
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
)
|
||||
]
|
||||
output_suffix = "tobidders_notice"
|
||||
output_suffix = "notice"
|
||||
elif selection == 2:
|
||||
# Selection 2: 评标办法
|
||||
pattern_pairs = [
|
||||
@ -408,23 +373,28 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
]
|
||||
output_suffix = "qualification"
|
||||
elif selection == 4:
|
||||
# Selection 4: 招标公告
|
||||
# Selection 4: 投标人须知
|
||||
pattern_pairs = [
|
||||
(
|
||||
regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
|
||||
regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
|
||||
(
|
||||
regex.compile(
|
||||
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',
|
||||
regex.MULTILINE),
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
regex.MULTILINE)
|
||||
),
|
||||
(
|
||||
regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
|
||||
regex.MULTILINE
|
||||
),
|
||||
(
|
||||
regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', regex.MULTILINE),
|
||||
regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
)
|
||||
]
|
||||
output_suffix = "notice"
|
||||
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
regex.MULTILINE)
|
||||
)
|
||||
]
|
||||
output_suffix = "tobidders_notice"
|
||||
elif selection == 5:
|
||||
# Selection 5: 无效标
|
||||
pattern_pairs = [
|
||||
@ -447,13 +417,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
|
||||
is_secondary_match = (idx == 2)
|
||||
begin_page = last_begin_index if last_begin_index != 0 else {
|
||||
1: 3, # 前附表
|
||||
1: 0, # 公告
|
||||
2: 10, # 评标
|
||||
3: 5, # 资格
|
||||
4: 0, # 公告
|
||||
4: 3, # 前附表
|
||||
5: 0 # 无效标
|
||||
}.get(selection, 0)
|
||||
if selection == 1: # 投标人须知
|
||||
if selection == 4: # 投标人须知
|
||||
output_paths = list(
|
||||
extract_pages_tobidders_notice(input_path, output_folder, begin_pattern, begin_page, common_header,
|
||||
is_secondary_match))
|
||||
@ -485,148 +455,29 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
if fallback_result and any(os.path.isfile(f) for f in fallback_result):
|
||||
return fallback_result
|
||||
else:
|
||||
if selection == 1:
|
||||
return ["", ""]
|
||||
else:
|
||||
return ['']
|
||||
# 根据 selection 返回相应数量的空字符串
|
||||
empty_returns = ["", ""] if selection == 4 else [""]
|
||||
logger.error(f"Selection {selection}: 回退提取失败,返回空字符串。")
|
||||
return empty_returns
|
||||
else:
|
||||
print("Invalid input path. It is neither a PDF file nor a directory containing PDF files.")
|
||||
return ['']
|
||||
|
||||
|
||||
def truncate_pdf_multiple(input_path, output_folder, unique_id="123"):
|
||||
logger = get_global_logger(unique_id)
|
||||
# 新增逻辑:检查 PDF 总页数
|
||||
try:
|
||||
reader = PdfReader(input_path)
|
||||
num_pages = len(reader.pages)
|
||||
logger.info(f"PDF '{input_path}' 的总页数为: {num_pages}")
|
||||
if num_pages <= 50:
|
||||
logger.info(f"PDF页数小于或等于50页,跳过切分逻辑。")
|
||||
return ['', '', '', '', '', '', '']
|
||||
except Exception as e:
|
||||
logger.error(f"无法读取 PDF 页数: {e}")
|
||||
return ['', '', '', '', '', '', '']
|
||||
base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名
|
||||
truncate_files = []
|
||||
selections = range(1, 6) # 选择 1 到 5
|
||||
|
||||
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||
# 提交所有任务并保持 selection 顺序
|
||||
future_to_selection = {selection: executor.submit(truncate_pdf_main, input_path, output_folder, selection) for
|
||||
selection in selections}
|
||||
|
||||
# 按 selection 顺序收集结果
|
||||
for selection in selections:
|
||||
future = future_to_selection.get(selection)
|
||||
try:
|
||||
files = future.result()
|
||||
if files and any(f for f in files):
|
||||
# 过滤空字符串
|
||||
valid_files = [f for f in files if f]
|
||||
truncate_files.extend(valid_files)
|
||||
else:
|
||||
logger.error(f"Selection {selection}: 截取失败,已添加空字符串。")
|
||||
# Append two empty strings for selection=1; otherwise, one empty string
|
||||
truncate_files.extend(["", ""] if selection == 1 else [""])
|
||||
except Exception as e:
|
||||
logger.error(f"Selection {selection} 生成了一个异常: {e}")
|
||||
# Append two empty strings for selection=1; otherwise, one empty string
|
||||
truncate_files.extend(["", ""] if selection == 1 else [""])
|
||||
|
||||
# Proceed with merging logic as before...
|
||||
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||
merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path,
|
||||
base_file_name)
|
||||
if merged_result:
|
||||
truncate_files.append(merged_result)
|
||||
# logger.info(f"merged_baseinfo: 已生成合并文件: {merged_output_path}")
|
||||
else:
|
||||
truncate_files.append("") # 如果 merged_result 未生成,添加空字符串
|
||||
logger.warning("merged_baseinfo: 未生成合并文件,因为没有找到需要合并的 PDF 文件。")
|
||||
else:
|
||||
truncate_files.append("") # 如果没有文件需要合并,也添加空字符串
|
||||
logger.warning(f"merged_baseinfo: 没有文件需要合并 for {input_path}")
|
||||
|
||||
return truncate_files
|
||||
|
||||
|
||||
def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, unique_id="123"):
|
||||
try:
|
||||
logger = get_global_logger(unique_id)
|
||||
try:
|
||||
reader = PdfReader(input_path)
|
||||
num_pages = len(reader.pages)
|
||||
logger.info(f"PDF '{input_path}' 的总页数为: {num_pages}")
|
||||
if num_pages <= 50:
|
||||
logger.info(f"PDF页数小于或等于50页,跳过切分逻辑。")
|
||||
return ['', '', '', '']
|
||||
except Exception as e:
|
||||
logger.error(f"无法读取 PDF 页数: {e}")
|
||||
return ['', '', '', '']
|
||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
truncate_files = []
|
||||
|
||||
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||
# 提交所有任务并保持 selection 顺序
|
||||
future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection) for
|
||||
selection in selections}
|
||||
|
||||
# 按 selection 顺序收集结果
|
||||
for selection in selections:
|
||||
future = future_to_selection.get(selection)
|
||||
try:
|
||||
files = future.result()
|
||||
if files and any(f for f in files):
|
||||
# 过滤空字符串
|
||||
valid_files = [f for f in files if f]
|
||||
truncate_files.extend(valid_files)
|
||||
else:
|
||||
logger.error(f"Selection {selection}: 截取失败,已添加空字符串。")
|
||||
# Append two empty strings for selection=1; otherwise, one empty string
|
||||
truncate_files.extend(["", ""] if selection == 1 else [""])
|
||||
except Exception as e:
|
||||
logger.error(f"Selection {selection} 生成了一个异常: {e}")
|
||||
# Append two empty strings for selection=1; otherwise, one empty string
|
||||
truncate_files.extend(["", ""] if selection == 1 else [""])
|
||||
|
||||
# Proceed with merging logic as before...
|
||||
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
|
||||
merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path,
|
||||
base_file_name)
|
||||
if merged_result:
|
||||
truncate_files.append(merged_result)
|
||||
logger.info(f"merged_specific: 已生成合并文件: {merged_output_path}")
|
||||
else:
|
||||
truncate_files.append("") # 如果 merged_result 未生成,添加空字符串
|
||||
logger.warning("merged_specific: 未生成合并文件,因为没有找到需要合并的 PDF 文件。")
|
||||
else:
|
||||
truncate_files.append("") # 如果没有文件需要合并,也添加空字符串
|
||||
logger.warning(f"merged_specific: 没有文件需要合并 for {pdf_path}")
|
||||
|
||||
return truncate_files
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in truncate_pdf_specific_engineering: {e}")
|
||||
return [""] * len(selections) # 返回与 selections 数量相同的空字符串列表
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger=get_global_logger("123")
|
||||
start_time = time.time()
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
|
||||
input_path=r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\西藏监管局办.pdf"
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
|
||||
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
|
||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
|
||||
files=truncate_pdf_multiple(input_path,output_folder)
|
||||
# selections = [4, 1] # 仅处理 selection 4、1
|
||||
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||
print(files)
|
||||
# selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
# selections = [1, 4] # 仅处理 selection 4、1
|
||||
selections=[5]
|
||||
# files=truncate_pdf_multiple(pdf_path,output_folder,logger,'engineering',selections)
|
||||
# print(files)
|
||||
# selection = 1 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
|
||||
# generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
|
||||
# print(generated_files)
|
||||
# print("生成的文件:", generated_files)
|
||||
end_time = time.time()
|
@ -1,22 +1,14 @@
|
||||
import glob
|
||||
import logging
|
||||
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
import regex # 导入正则表达式库
|
||||
import os # 用于文件和文件夹操作
|
||||
|
||||
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
|
||||
from flask_app.general.format_change import docx2pdf
|
||||
from flask_app.general.merge_pdfs import merge_and_cleanup, merge_selected_pdfs_for_goods
|
||||
import concurrent.futures
|
||||
from flask_app.general.merge_pdfs import merge_and_cleanup
|
||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
|
||||
|
||||
def get_global_logger(unique_id):
|
||||
if unique_id is None:
|
||||
return logging.getLogger() # 获取默认的日志器
|
||||
logger = logging.getLogger(unique_id)
|
||||
return logger
|
||||
|
||||
def is_pdf_or_doc(filename):
|
||||
# 判断文件是否为PDF或Word文档
|
||||
return filename.lower().endswith(('.pdf', '.doc', '.docx'))
|
||||
@ -28,21 +20,6 @@ def convert_to_pdf(file_path):
|
||||
return docx2pdf(file_path)
|
||||
return file_path
|
||||
|
||||
|
||||
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||
pdf_path = convert_to_pdf(file_path)
|
||||
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
if result:
|
||||
if output_suffix == "tobidders_notice":
|
||||
# 确保返回的是元组,并将其中的 None 转换为 ""
|
||||
path1, path2 = result
|
||||
return [path1 or "", path2 or ""]
|
||||
elif output_suffix == "qualification1":
|
||||
merge_and_cleanup(result, "qualification3")
|
||||
return [result or ""]
|
||||
return [result or ""]
|
||||
return [""] # 返回空字符串
|
||||
|
||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
|
||||
output_suffix="normal"):
|
||||
start_page = None
|
||||
@ -124,7 +101,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page)
|
||||
elif output_suffix == "qualification1":
|
||||
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
|
||||
truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
|
||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||
except Exception as e:
|
||||
print(f"Error processing {pdf_path}: {e}")
|
||||
@ -171,26 +148,6 @@ def get_patterns_for_notice():
|
||||
)
|
||||
return begin_pattern, end_pattern
|
||||
|
||||
# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
|
||||
# exclusion_pattern):
|
||||
# start_page = None
|
||||
# mid_page = None
|
||||
# end_page = None
|
||||
# for i, page in enumerate(pdf_document.pages):
|
||||
# text = page.extract_text() or ""
|
||||
# cleaned_text = clean_page_content(text, common_header)
|
||||
# if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||
# continue
|
||||
# if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||
# start_page = i
|
||||
# if start_page is not None and mid_page is None and regex.search(
|
||||
# r'^\s*[((]?\s*[一1]\s*[))]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text):
|
||||
# mid_page = i
|
||||
# if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page:
|
||||
# end_page = i
|
||||
# break
|
||||
# return start_page, mid_page, end_page
|
||||
|
||||
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
|
||||
"""
|
||||
从PDF文档中提取起始页、中间页和结束页。
|
||||
@ -510,7 +467,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
|
||||
new_file.write(original_file.read())
|
||||
return new_file_path
|
||||
else:
|
||||
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
|
||||
temp = truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification2")
|
||||
if len(temp) > 0:
|
||||
return temp[0]
|
||||
else:
|
||||
@ -558,33 +515,8 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
|
||||
print(f"Error in save_extracted_pages: {e}")
|
||||
return "" # 返回空字符串
|
||||
|
||||
def get_start_and_common_header(input_path):
|
||||
common_header = extract_common_header(input_path)
|
||||
last_begin_index = 0
|
||||
begin_pattern = regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
# 新增目录匹配模式
|
||||
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
||||
|
||||
pdf_document = PdfReader(input_path)
|
||||
for i, page in enumerate(pdf_document.pages):
|
||||
if i > 10:
|
||||
return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
# 检查是否存在"目录"
|
||||
if catalog_pattern.search(cleaned_text):
|
||||
continue # 如果存在目录,跳过当前页面
|
||||
if begin_pattern.search(cleaned_text):
|
||||
last_begin_index = i # 更新第一个匹配的索引,页码从0开始
|
||||
return common_header, last_begin_index
|
||||
return common_header, last_begin_index
|
||||
|
||||
|
||||
def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
|
||||
def truncate_pdf_main_goods(input_path, output_folder, selection, output_suffix="default"):
|
||||
try:
|
||||
# 检查是否为文件夹
|
||||
if os.path.isdir(input_path):
|
||||
@ -619,7 +551,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
||||
os.makedirs(output_folder)
|
||||
|
||||
# 获取起始和通用页眉
|
||||
common_header, last_begin_index = get_start_and_common_header(input_path)
|
||||
common_header, last_begin_index = get_start_and_common_header(input_path,10)
|
||||
begin_page = last_begin_index if last_begin_index != 0 else {
|
||||
4: 1,
|
||||
2: 5,
|
||||
@ -676,145 +608,43 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
||||
if output_suffix == "default":
|
||||
output_suffix = local_output_suffix
|
||||
|
||||
# 调用实际处理文件内容的函数
|
||||
return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
# 将原先的 process_files 逻辑合并到此处
|
||||
pdf_path = convert_to_pdf(input_path)
|
||||
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
|
||||
# 根据提取结果以及不同的 output_suffix 进行处理
|
||||
if result:
|
||||
if output_suffix == "tobidders_notice":
|
||||
# 确保返回的是元组,并将其中的 None 转换为 ""
|
||||
path1, path2 = result
|
||||
return [path1 or "", path2 or ""]
|
||||
elif output_suffix == "qualification1":
|
||||
merge_and_cleanup(result, "qualification3")
|
||||
return [result or ""]
|
||||
return [result or ""]
|
||||
|
||||
return [""]
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in process_input: {e}")
|
||||
return ['']
|
||||
|
||||
|
||||
def truncate_pdf_multiple(pdf_path, output_folder, logger):
|
||||
# 新增逻辑:检查 PDF 总页数
|
||||
try:
|
||||
reader = PdfReader(pdf_path)
|
||||
num_pages = len(reader.pages)
|
||||
logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
|
||||
if num_pages <= 50:
|
||||
logger.info(f"PDF页数小于或等于50页,跳过切分逻辑。")
|
||||
return ['', '', '', '', '', '', '']
|
||||
except Exception as e:
|
||||
logger.error(f"无法读取 PDF 页数: {e}")
|
||||
return ['', '', '', '', '', '', '']
|
||||
|
||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
truncate_files = []
|
||||
|
||||
# 定义要处理的选择范围
|
||||
selections = range(1, 6)
|
||||
|
||||
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
||||
# 提交所有任务并保持 selection 顺序
|
||||
future_to_selection = {
|
||||
selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
|
||||
for selection in selections}
|
||||
|
||||
# 按 selection 顺序收集结果
|
||||
for selection in selections:
|
||||
future = future_to_selection.get(selection)
|
||||
try:
|
||||
files = future.result()
|
||||
if files:
|
||||
truncate_files.extend(files)
|
||||
except Exception as e:
|
||||
logger.error(f"Selection {selection} 生成了一个异常: {e}")
|
||||
|
||||
# 定义合并后的输出路径
|
||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||
# 调用 merge_selected_pdfs 并获取返回值
|
||||
merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
|
||||
if merged_path:
|
||||
# 合并成功,添加合并后的文件路径
|
||||
truncate_files.append(merged_path)
|
||||
# logger.info(f"已生成合并文件: {merged_output_path}")
|
||||
else:
|
||||
# 合并失败,添加空字符串
|
||||
truncate_files.append("")
|
||||
logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
|
||||
logger.info("已截取文件路径" + str(truncate_files))
|
||||
return truncate_files
|
||||
|
||||
|
||||
# 小解析,只需要前三章内容
|
||||
def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
|
||||
"""
|
||||
处理 PDF 文件,选择指定的 selections,并合并结果。
|
||||
|
||||
Args:
|
||||
pdf_path (str): 要处理的 PDF 文件路径。
|
||||
output_folder (str): 截取后的文件保存文件夹路径。
|
||||
selections (iterable): 要处理的 selection 列表。
|
||||
|
||||
Returns:
|
||||
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
||||
"""
|
||||
logger = get_global_logger(unique_id)
|
||||
# 新增逻辑:检查 PDF 总页数
|
||||
try:
|
||||
reader = PdfReader(pdf_path)
|
||||
num_pages = len(reader.pages)
|
||||
logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
|
||||
if num_pages <= 50:
|
||||
logger.info(f"PDF页数小于或等于50页,跳过切分逻辑。")
|
||||
return ['', '', '', '']
|
||||
except Exception as e:
|
||||
logger.error(f"无法读取 PDF 页数: {e}")
|
||||
return ['', '', '', '']
|
||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
truncate_files = []
|
||||
|
||||
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||
# 提交所有任务并保持 selection 顺序
|
||||
future_to_selection = {
|
||||
selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
|
||||
for selection in selections}
|
||||
|
||||
# 按 selection 顺序收集结果
|
||||
for selection in selections:
|
||||
future = future_to_selection.get(selection)
|
||||
try:
|
||||
files = future.result()
|
||||
if files:
|
||||
if isinstance(files, list):
|
||||
truncate_files.extend(files)
|
||||
elif isinstance(files, str):
|
||||
truncate_files.append(files)
|
||||
except Exception as e:
|
||||
logger.error(f"Selection {selection} 生成了一个异常: {e}")
|
||||
|
||||
# 定义合并后的输出路径
|
||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||
# 调用 merge_selected_pdfs 并获取返回值
|
||||
merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
|
||||
if merged_path:
|
||||
# 合并成功,添加合并后的文件路径
|
||||
truncate_files.append(merged_path)
|
||||
# logger.info(f"已生成合并文件: {merged_output_path}")
|
||||
else:
|
||||
# 合并失败,添加空字符串
|
||||
truncate_files.append("")
|
||||
logger.warning(f"合并失败,没有生成merged_baseinfo for {pdf_path}")
|
||||
return truncate_files
|
||||
|
||||
|
||||
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章
|
||||
|
||||
# ztbfile.pdf少资格评审 包头少符合性评审
|
||||
if __name__ == "__main__":
|
||||
logger = get_global_logger("123")
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||
# input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||
pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||
input_path=r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\ztbfile.pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp"
|
||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||
# files = truncate_pdf_multiple(input_path, output_folder,logger)
|
||||
# selections = [1,4]
|
||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||
# selections = [1, 4]
|
||||
# files = truncate_pdf_multiple(pdf_path, output_folder,logger,selections)
|
||||
# print(files)
|
||||
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
print(generated_files)
|
||||
# selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
# generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection)
|
||||
# print(generated_files)
|
||||
|
Loading…
x
Reference in New Issue
Block a user