Merge branch 'develop-test' into develop

# Conflicts:
#	flask_app/货物标/截取pdf货物标版.py
#	flask_app/货物标/技术参数要求提取.py

commit 94e4e36cc5
@@ -1,17 +1,38 @@
 import re
 
+import fitz
 from PyPDF2 import PdfReader
 
 
 def extract_common_header(pdf_path):
-    def get_headers(pdf_document, start_page, pages_to_read):
+    """
+    提取 PDF 文件的公共页眉。
+
+    参数:
+    - pdf_path: PDF 文件路径。
+
+    返回:
+    - common_header: 公共页眉内容,字符串。如果未找到,则返回空字符串。
+    """
+
+    def get_headers(pdf_document, start_page, pages_to_read, is_pypdf2):
         headers = []
-        for i in range(start_page, min(start_page + pages_to_read, len(pdf_document.pages))):
-            page = pdf_document.pages[i]
-            text = page.extract_text() or ""
-            if text:
-                # 只取每页的前三行,去除前后的空白字符
-                first_lines = [line.strip() for line in text.strip().split('\n')[:3]]
-                headers.append(first_lines)
+        for i in range(start_page, min(start_page + pages_to_read,
+                                       len(pdf_document.pages) if is_pypdf2 else pdf_document.page_count)):
+            try:
+                if is_pypdf2:
+                    page = pdf_document.pages[i]
+                    text = page.extract_text() or ""
+                else:
+                    page = pdf_document.load_page(i)
+                    text = page.get_text() or ""
+                if text:
+                    # 只取每页的前三行,去除前后的空白字符
+                    first_lines = [line.strip() for line in text.strip().split('\n')[:3]]
+                    headers.append(first_lines)
+            except Exception as e:
+                print(f"提取第 {i} 页文本时出错: {e}")
+                continue
         return headers
 
     def find_common_headers(headers):

@@ -49,45 +70,66 @@ def extract_common_header(pdf_path):
 
         return common_headers
 
-    pdf_document = PdfReader(pdf_path)
-    total_pages = len(pdf_document.pages)
+    try:
+        # 尝试使用 PyPDF2 读取 PDF
+        try:
+            pdf_document = PdfReader(pdf_path)
+            total_pages = len(pdf_document.pages)
+            is_pypdf2 = True
+            # print("使用 PyPDF2 成功读取 PDF 文件。")
+        except Exception as e_pypdf2:
+            print(f"extract_common_header:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+            try:
+                # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
+                pdf_document = fitz.open(pdf_path)
+                total_pages = pdf_document.page_count
+                is_pypdf2 = False
+                # print("使用 PyMuPDF 成功读取 PDF 文件。")
+            except Exception as e_fitz:
+                print(f"extract_common_header:使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+                return ""  # 或者根据需求抛出异常
 
         # 定义两个提取策略
         strategies = []
         if total_pages >= 3:
            # 策略1:中间的3页
            middle_page = total_pages // 2
            start_page = max(0, middle_page - 1)
            strategies.append((start_page, 3))
         elif total_pages == 2:
            # 策略1:2页
            strategies.append((0, 2))
         else:
            # 策略1:1页
            strategies.append((0, 1))
 
         # 策略2:前三页
         if total_pages >= 3:
            strategies.append((0, 3))
         elif total_pages == 2:
            strategies.append((0, 2))
         elif total_pages == 1:
            strategies.append((0, 1))
 
         common_headers = []
 
         for idx, (start, count) in enumerate(strategies):
-            headers = get_headers(pdf_document, start, count)
+            headers = get_headers(pdf_document, start, count, is_pypdf2)
            if len(headers) < 2:
                continue  # 需要至少2页来比较
 
            current_common = find_common_headers(headers)
            if current_common:
                common_headers = current_common
-                break  # 找到共同部分后退出
-            # 如果没有找到,继续下一个策略
+                # print(f"使用策略{idx + 1}找到共同的页眉: {common_headers}")
+                break  # 找到共同部分后退出
+            # 如果没有找到,继续下一个策略
 
         return '\n'.join(common_headers)
 
+    except Exception as e:
+        print(f"Error in extract_common_header: {e}")
+        return ""  # 根据需求调整返回值
 
 
 def clean_page_content(text, common_header):
     # 首先删除抬头公共部分

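A minimal usage sketch of the refactored extract_common_header/clean_page_content pair; the sample path is hypothetical, and the import path follows the flask_app.general.clean_pdf module referenced later in this diff:

    from PyPDF2 import PdfReader
    from flask_app.general.clean_pdf import extract_common_header, clean_page_content

    pdf_path = r"C:\temp\sample.pdf"  # 假设的示例路径
    common_header = extract_common_header(pdf_path)  # PyPDF2 读取失败时内部回退到 fitz
    reader = PdfReader(pdf_path)
    first_page_text = reader.pages[0].extract_text() or ""
    print(clean_page_content(first_page_text, common_header))
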
@@ -1,53 +1,106 @@
 import os
 
+import fitz
 from PyPDF2 import PdfReader, PdfWriter
 
+from flask_app.general.截取pdf通用函数 import create_get_text_function
 
 
 #合并PDF
+# 主合并函数,尝试使用 PyPDF2,若失败则使用 fitz
 def merge_pdfs(paths, output_path):
-    # 检查是否所有路径都是空字符串或仅包含空白字符
+    # 检查所有路径是否为空或仅包含空白字符
     if not any(path.strip() for path in paths):
         return ""
 
+    # 优先尝试 PyPDF2 合并
+    try:
+        return merge_with_pypdf2(paths, output_path)
+    except Exception as e:
+        print(f"使用 PyPDF2 合并失败,尝试使用 fitz。错误: {e}")
+        # 若失败则尝试 fitz
+        try:
+            return merge_with_fitz(paths, output_path)
+        except Exception as ex:
+            print(f"使用 fitz 合并也失败了。错误: {ex}")
+            return ""
+
+
+def merge_with_pypdf2(paths, output_path):
     pdf_writer = PdfWriter()
-    last_page_text = None  # 用于存储上一个PDF的最后一页的文本
+    last_page_text = None  # 存储上一个 PDF 的最后一页文本
 
     for path in paths:
-        # 跳过空字符串或仅包含空白字符的路径
         if not path.strip():
             continue
         try:
             pdf_reader = PdfReader(path)
             pages = pdf_reader.pages
-            start_index = 0  # 从第一页开始添加
+            get_text = create_get_text_function('pypdf2', pdf_reader)
 
-            # 如果这不是第一个文件,并且有上一个文件的最后一页文本
+            start_index = 0
             if last_page_text is not None and len(pages) > 0:
-                current_first_page_text = pages[0].extract_text() or ""
-                # 比较当前文件的第一页和上一个文件的最后一页的文本
+                current_first_page_text = get_text(0)
                 if current_first_page_text == last_page_text:
-                    start_index = 1  # 如果相同,跳过当前文件的第一页
+                    start_index = 1
 
-            # 添加当前PDF的页面到写入器
             for page_num in range(start_index, len(pages)):
                 pdf_writer.add_page(pages[page_num])
 
-            # 更新 last_page_text 为当前PDF的最后一页的文本
             if len(pages) > 0:
-                last_page_text = pages[-1].extract_text() or ""
+                last_page_text = get_text(len(pages) - 1)
         except Exception as e:
-            print(f"文件 '{path}' 无法处理,错误: {e}")
+            print(f"文件 '{path}' 无法使用 PyPDF2 处理,错误: {e}")
             continue
 
-    # 如果没有添加任何页面,返回空字符串
     if len(pdf_writer.pages) == 0:
         return ""
 
-    # 写入合并后的PDF到文件
+    with open(output_path, 'wb') as out:
+        pdf_writer.write(out)
+    return output_path
+
+
+def merge_with_fitz(paths, output_path):
+    # 使用 fitz 创建新文档
+    merged_pdf = fitz.open()
+    last_page_text = None
+
+    for path in paths:
+        if not path.strip():
+            continue
+        try:
+            pdf_doc = fitz.open(path)
+            get_text = create_get_text_function('fitz', pdf_doc)
+            page_count = pdf_doc.page_count
+
+            start_index = 0
+            if last_page_text is not None and page_count > 0:
+                current_first_page_text = get_text(0)
+                if current_first_page_text == last_page_text:
+                    start_index = 1
+
+            # 插入页面到新文档
+            for page_num in range(start_index, page_count):
+                merged_pdf.insert_pdf(pdf_doc, from_page=page_num, to_page=page_num)
+
+            if page_count > 0:
+                last_page_text = get_text(page_count - 1)
+
+            pdf_doc.close()
+        except Exception as e:
+            print(f"文件 '{path}' 无法使用 fitz 处理,错误: {e}")
+            continue
+
+    if merged_pdf.page_count == 0:
+        merged_pdf.close()
+        return ""
+
     try:
-        with open(output_path, 'wb') as out:
-            pdf_writer.write(out)
+        merged_pdf.save(output_path)
+        merged_pdf.close()
         return output_path
     except Exception as e:
         print(f"无法写入输出文件,错误: {e}")
+        merged_pdf.close()
         return ""
 
 
 def judge_file_exist(original_path, new_suffix):

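A minimal sketch of how the new merge_pdfs entry point behaves (the paths are hypothetical): merge_with_pypdf2 is tried first, and merge_with_fitz only on failure:

    paths = [
        r"C:\temp\part1.pdf",  # 假设的示例路径
        r"C:\temp\part2.pdf",
    ]
    merged = merge_pdfs(paths, r"C:\temp\merged.pdf")
    if merged:
        print(f"合并完成: {merged}")
    else:
        print("合并失败或没有可合并的页面")
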
@@ -119,7 +119,7 @@ if __name__ == "__main__":
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
     # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
     # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\file1736998876340 (1).doc"
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目.pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
     # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
     output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp"

@@ -1,6 +1,8 @@
 # flask_app/general/截取pdf通用函数.py
 import concurrent.futures
 import os
 
+import fitz
 import regex
 from PyPDF2 import PdfReader, PdfWriter
 from flask_app.general.clean_pdf import extract_common_header, clean_page_content

@@ -8,36 +10,94 @@ from flask_app.general.format_change import docx2pdf
 import concurrent.futures
 
 
+# 定义一个函数用于提取文本
+def get_text_pypdf2(pdf, page_num):
+    return pdf.pages[page_num].extract_text() or ""
+
+
+def get_text_fitz(pdf, page_num):
+    return pdf.load_page(page_num).get_text() or ""
+
+
+# 定义一个内部函数来绑定 pdf_document 对象
+def create_get_text_function(pdf_lib, pdf_doc):
+    if pdf_lib == 'pypdf2':
+        return lambda page_num: get_text_pypdf2(pdf_doc, page_num)
+    elif pdf_lib == 'fitz':
+        return lambda page_num: get_text_fitz(pdf_doc, page_num)
+    else:
+        raise ValueError("不支持的 PDF 库。")
+
+
 def get_start_and_common_header(input_path, end_page):
-    common_header = extract_common_header(input_path)
-    last_begin_index = 0
-    begin_pattern = regex.compile(
-        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
-        regex.MULTILINE
-    )
-    # 新增目录匹配模式
-    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
-    pdf_document = PdfReader(input_path)
-    for i, page in enumerate(pdf_document.pages):
-        if i > end_page:
-            return common_header, 0  # 如果页码大于end_page,直接返回last_begin_index=0
-        text = page.extract_text()
-        if text:
-            cleaned_text = clean_page_content(text, common_header)
-            # 检查是否存在"目录"
-            if catalog_pattern.search(cleaned_text):
-                continue  # 如果存在目录,跳过当前页面
-            if begin_pattern.search(cleaned_text):
-                last_begin_index = i  # 更新第一个匹配的索引,页码从0开始
-                return common_header, last_begin_index
-    return common_header, last_begin_index
+    """
+    提取 PDF 文件的公共页眉和起始页索引。
+
+    参数:
+    - input_path: PDF 文件路径。
+    - end_page: 遍历的结束页码。
+
+    返回:
+    - common_header: 公共页眉内容。
+    - last_begin_index: 起始页的页码索引。
+    """
+    try:
+        # 提取公共页眉
+        common_header = extract_common_header(input_path)  # 假设该函数已定义
+
+        last_begin_index = 0
+        begin_pattern = regex.compile(
+            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
+            regex.MULTILINE
+        )
+        # 新增目录匹配模式
+        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
+
+        try:
+            # 尝试使用 PyPDF2 读取 PDF
+            pdf_document = PdfReader(input_path)
+            total_pages = len(pdf_document.pages)
+            get_text = create_get_text_function('pypdf2', pdf_document)
+            # print("使用 PyPDF2 成功读取 PDF 文件。")
+        except Exception as e_pypdf2:
+            print(f"get_start_and_common_header:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+            try:
+                # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
+                pdf_document = fitz.open(input_path)
+                total_pages = pdf_document.page_count
+                get_text = create_get_text_function('fitz', pdf_document)
+                # print("使用 PyMuPDF 成功读取 PDF 文件。")
+            except Exception as e_fitz:
+                print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+                return common_header, 0  # 或者根据需求抛出异常
+
+        # 遍历页面
+        for i in range(total_pages):
+            if i > end_page:
+                return common_header, 0  # 如果页码大于 end_page,直接返回 last_begin_index=0
+            text = get_text(i)
+            if text:
+                cleaned_text = clean_page_content(text, common_header)
+                # 检查是否存在"目录"
+                if catalog_pattern.search(cleaned_text):
+                    # print(f"页面 {i} 包含目录,跳过。")
+                    continue  # 如果存在目录,跳过当前页面
+                if begin_pattern.search(cleaned_text):
+                    last_begin_index = i  # 更新第一个匹配的索引,页码从0开始
+                    print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}")
+                    return common_header, last_begin_index
+        return common_header, last_begin_index
+    except Exception as e:
+        print(f"Error in get_start_and_common_header: {e}")
+        return "", 0  # 根据需求调整返回值
 
 
 def check_pdf_pages(pdf_path, mode, logger):
     try:
-        reader = PdfReader(pdf_path)
-        num_pages = len(reader.pages)
-        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
+        try:
+            pdf_document = PdfReader(pdf_path)
+            num_pages = len(pdf_document.pages)
+            # logger.info(f"使用 PyPDF2 成功读取 PDF '{pdf_path}' 的总页数为: {num_pages}")
+        except Exception as e_pypdf2:
+            print(f"check_pdf_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+            pdf_document = fitz.open(pdf_path)
+            num_pages = pdf_document.page_count
         if num_pages <= 30:
             logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。")
             if mode == 'goods':

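A minimal sketch of the create_get_text_function helper introduced above: it hides the PyPDF2/fitz API difference behind a single get_text(page_num) callable (the sample path is hypothetical):

    import fitz
    from PyPDF2 import PdfReader

    pdf_path = r"C:\temp\sample.pdf"  # 假设的示例路径
    try:
        doc = PdfReader(pdf_path)
        get_text = create_get_text_function('pypdf2', doc)
        total_pages = len(doc.pages)
    except Exception:
        doc = fitz.open(pdf_path)
        get_text = create_get_text_function('fitz', doc)
        total_pages = doc.page_count

    for i in range(min(3, total_pages)):
        print(get_text(i)[:60])  # 打印每页前 60 个字符
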
@@ -57,50 +117,120 @@ def check_pdf_pages(pdf_path, mode, logger):
 
 def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
     try:
-        # 检查 start_page 和 end_page 是否为 None
         if start_page is None or end_page is None:
-            print("Error: start_page 或 end_page 为 None")
+            print("错误: start_page 或 end_page 为 None")
             return ""
-        pdf_document = PdfReader(pdf_path)
-        total_pages = len(pdf_document.pages)
-        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
-        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
+
+        # 尝试使用 PyPDF2 读取 PDF
+        try:
+            pdf_document = PdfReader(pdf_path)
+            total_pages = len(pdf_document.pages)
+            get_text = lambda p: pdf_document.pages[p].extract_text()
+            using_pypdf2 = True
+            # print("使用 PyPDF2 成功读取 PDF 文件。")
+        except Exception as e_pypdf2:
+            print(f"save_extracted_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+            # 尝试使用 PyMuPDF 读取 PDF
+            try:
+                pdf_document = fitz.open(pdf_path)
+                total_pages = pdf_document.page_count
+                get_text = lambda p: pdf_document.load_page(p).get_text()
+                using_pypdf2 = False
+                # print("使用 PyMuPDF 成功读取 PDF 文件。")
+            except Exception as e_fitz:
+                print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+                return ""
+
+        # 检查页面范围的有效性
         if start_page < 0 or end_page >= total_pages or start_page > end_page:
             print(f"无效的页面范围: {start_page} 到 {end_page}")
             return ""
+
+        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
+        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
+
         if output_suffix == 'notice' and start_page > 0:
             before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
-            before_doc = PdfWriter()
             toc_page = -1
-            # 查找目录页
             for page_num in range(min(start_page, total_pages)):
-                page_text = pdf_document.pages[page_num].extract_text()
-                cleaned_text = clean_page_content(page_text, common_header)
+                try:
+                    text = get_text(page_num)
+                except Exception as e_extract:
+                    print(f"提取第 {page_num} 页文本时出错: {e_extract}")
+                    continue
+                cleaned_text = clean_page_content(text, common_header)
                 if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
                     toc_page = page_num
                     break
 
-            # 确定截取的页数
             pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
-            # 提取页面
-            for page_num in range(pages_to_extract):
-                before_doc.add_page(pdf_document.pages[page_num])
-            print(before_pdf_path)
-            with open(before_pdf_path, 'wb') as f_before:
-                before_doc.write(f_before)
 
-        output_doc = PdfWriter()
-        for page_num in range(start_page, end_page + 1):
-            output_doc.add_page(pdf_document.pages[page_num])
-        with open(output_pdf_path, 'wb') as f:
-            output_doc.write(f)
-        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
-        return output_pdf_path
+            if using_pypdf2:
+                before_doc = PdfWriter()
+                for page_num in range(pages_to_extract):
+                    try:
+                        before_doc.add_page(pdf_document.pages[page_num])
+                    except Exception as e_page:
+                        print(f"添加第 {page_num} 页到 before_doc 时出错: {e_page}")
+                        continue
+                try:
+                    with open(before_pdf_path, 'wb') as f_before:
+                        before_doc.write(f_before)
+                    print(f"目录前的页面已保存到 {before_pdf_path}")
+                except Exception as e_write_before:
+                    print(f"保存目录前的页面时出错: {e_write_before}")
+            else:
+                before_doc = fitz.open()
+                for page_num in range(pages_to_extract):
+                    try:
+                        before_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
+                    except Exception as e_page:
+                        print(f"添加第 {page_num} 页到 before_doc 时出错: {e_page}")
+                        continue
+                try:
+                    before_doc.save(before_pdf_path)
+                    before_doc.close()
+                    print(f"目录前的页面已保存到 {before_pdf_path}")
+                except Exception as e_write_before:
+                    print(f"保存目录前的页面时出错: {e_write_before}")
+
+        # 提取并保存目标页面
+        if using_pypdf2:
+            output_doc = PdfWriter()
+            for page_num in range(start_page, end_page + 1):
+                try:
+                    output_doc.add_page(pdf_document.pages[page_num])
+                except Exception as e_page:
+                    print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
+                    continue
+            try:
+                with open(output_pdf_path, 'wb') as f:
+                    output_doc.write(f)
+                print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
+                return output_pdf_path
+            except Exception as e_write:
+                print(f"保存截取的页面时出错: {e_write}")
+                return ""
+        else:
+            output_doc = fitz.open()
+            for page_num in range(start_page, end_page + 1):
+                try:
+                    output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
+                except Exception as e_page:
+                    print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
+                    continue
+            try:
+                output_doc.save(output_pdf_path)
+                output_doc.close()
+                print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
+                return output_pdf_path
+            except Exception as e_write:
+                print(f"保存截取的页面时出错: {e_write}")
+                return ""
     except Exception as e:
-        print(f"Error in save_extracted_pages: {e}")
-        return ""  # 返回空字符串
+        print(f"发生未知错误: {e}")
+        return ""
 
 
 def is_pdf_or_doc(filename):

@@ -148,7 +278,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
             regex.MULTILINE
         ),
         regex.compile(
-            r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
+            r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$",
             regex.MULTILINE
         ),
     ]

@@ -159,17 +289,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
 
     try:
         # 打开PDF文件
-        pdf_document = PdfReader(file_path)
-        total_pages = len(pdf_document.pages)
-        # 新增逻辑:如果总页数 <= 50,直接返回 file_path
+        try:
+            pdf_document = PdfReader(file_path)
+            pdf_lib = 'pypdf2'
+            # 获取总页数(对于 pypdf2 的 PDFDocument)
+            total_pages = len(pdf_document.pages)
+        except Exception as e:
+            print(f"get_invalid_file:使用 pypdf2 读取失败,错误信息: {e},切换至 fitz。")
+            pdf_document = fitz.open(file_path)
+            pdf_lib = 'fitz'
+            # 获取总页数(对于 fitz 的 PDFDocument)
+            total_pages = pdf_document.page_count
         if total_pages <= 50:
             print(f"PDF页数({total_pages}) <= 50,直接返回文件路径:{file_path}")
             return file_path, 0
+        get_text = create_get_text_function(pdf_lib, pdf_document)
         # 提取并清理每页的文本内容
         page_texts = []
         for i in range(total_pages):
-            page = pdf_document.pages[i]
-            text = page.extract_text()
+            text = get_text(i)
             cleaned_text = clean_page_content(text, common_header) if text else ""
             page_texts.append(cleaned_text)
 

@@ -194,7 +332,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
         for i in range(30, total_pages):
             text = page_texts[i]
             exclusion_pattern = regex.compile(
-                r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
+                r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', regex.MULTILINE)
             if regex.search(exclusion_pattern,text):
                 continue
             if regex.search(pattern, text):

@@ -210,8 +348,11 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
                 return i
 
         # 如果没有匹配到,设置end_page逻辑
-        if total_pages > 100:
-            print("未找到结束模式,总页数大于100,设置结束页为前100页。" + file_path)
+        if total_pages > 200:
+            print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {end_page}。" + file_path)
+            return max(100, int(total_pages * 1 / 2))
+        elif 100 < total_pages <= 200:
+            print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。" + file_path)
             return max(100, int(total_pages * 2 / 3))
         else:
             print("未找到结束模式,设置结束页为最后一页。" + file_path)

@@ -254,44 +395,61 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
     return "", 0
 
 
-def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
+def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                           output_suffix="normal", invalid_endpage=-1):
     start_page = None
     end_page = None
     flag = True
-    # 确定遍历范围
-    total_pages = len(pdf_document.pages)
+    try:
+        # 尝试使用 PyPDF2 读取 PDF
+        pdf_document = PdfReader(pdf_path)
+        total_pages = len(pdf_document.pages)
+        get_text = create_get_text_function('pypdf2', pdf_document)
+        # print("使用 PyPDF2 成功读取 PDF 文件。")
+    except Exception as e_pypdf2:
+        print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+        try:
+            # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
+            pdf_document = fitz.open(pdf_path)
+            total_pages = pdf_document.page_count
+            get_text = create_get_text_function('fitz', pdf_document)
+            # print("使用 PyMuPDF 成功读取 PDF 文件。")
+        except Exception as e_fitz:
+            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+            return None, None  # 或者根据需求抛出异常
     end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
-    for i, page in enumerate(pdf_document.pages[begin_page:end_limit], start=begin_page):
-        text = page.extract_text() or ""
-        cleaned_text = clean_page_content(text, common_header)
-        if output_suffix == "tobidders_notice2":  #extract_pages_twice_tobidders_notice会调用该代码
-            catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
-            if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern,cleaned_text):
-                flag = False
-                continue
-            if begin_page == 0 and catalog_pattern.search(cleaned_text):
-                continue  # 如果存在目录,跳过当前页面
-        else:
-            # 一般投标文件的编制、组成在'投标人须知前附表/正文'中,需要防止begin_pattern匹配到这部分内容,所以在start_page is None的时候使用该exclusion_pattern
-            if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text):
-                flag = False
-                continue
-        if start_page is None and regex.search(begin_pattern, cleaned_text):
-            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
-                start_page = i
-            continue
-        if start_page is not None:
-            if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]:  # 使用列表判断多个匹配项
-                # 判断 end_pattern 是否匹配且当前页大于起始页
-                if regex.search(end_pattern, cleaned_text) and i > start_page:
-                    end_page = i
-                    break
-            else:
-                # 如果 end_pattern 匹配且 begin_pattern 不匹配
-                if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
-                    end_page = i
-                    break
+    for i in range(begin_page, end_limit):
+        text = get_text(i)
+        if text:
+            cleaned_text = clean_page_content(text, common_header)
+            if output_suffix == "tobidders_notice2":  #extract_pages_twice_tobidders_notice会调用该代码
+                catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
+                if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern,cleaned_text):
+                    flag = False
+                    continue
+                if begin_page == 0 and catalog_pattern.search(cleaned_text):
+                    continue  # 如果存在目录,跳过当前页面
+            else:
+                # 一般投标文件的编制、组成在'投标人须知前附表/正文'中,需要防止begin_pattern匹配到这部分内容,所以在start_page is None的时候使用该exclusion_pattern
+                if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text):
+                    flag = False
+                    continue
+            if start_page is None and regex.search(begin_pattern, cleaned_text):
+                if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
+                    start_page = i
+                continue
+            if start_page is not None:
+                if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]:  # 使用列表判断多个匹配项
+                    # 判断 end_pattern 是否匹配且当前页大于起始页
+                    if regex.search(end_pattern, cleaned_text) and i > start_page:
+                        end_page = i
+                        break
+                else:
+                    # 如果 end_pattern 匹配且 begin_pattern 不匹配
+                    if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
+                        end_page = i
+                        break
 
     return start_page, end_page
 

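A minimal sketch of calling the reworked extract_pages_generic, which now receives pdf_path instead of an already-opened PdfReader; the path and both regex patterns are hypothetical:

    import regex

    pdf_path = r"C:\temp\sample.pdf"  # 假设的示例路径
    begin_pattern = regex.compile(r'^第[一二三四五六七八九十]+章\s*投标人须知', regex.MULTILINE)
    end_pattern = regex.compile(r'^第[一二三四五六七八九十]+章\s*评标办法', regex.MULTILINE)
    common_header = extract_common_header(pdf_path)
    start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern,
                                                 0, common_header)
    print(start_page, end_page)
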
@@ -309,7 +467,7 @@ def generate_mid_pattern(chapter_type=None):
     # 定义基础的 mid_pattern
     base_mid_pattern = (
         r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|'
-        r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)|'
+        r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释|定\s*义)|'
         r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
     )
 

@@ -379,9 +537,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
     """
     exclusion_pattern = regex.compile(
         r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
-    pdf_document = PdfReader(pdf_path)
-
-    def run_extraction(begin_pattern, extraction_stage='first'):
+    def run_extraction(pdf_document,begin_pattern, extraction_stage='first'):
         """
         使用提供的 begin_pattern 运行提取过程。
         根据 extraction_stage 判断是“第一次提取”还是“第三次提取”,

@@ -401,12 +557,18 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
         end_page = None
         combined_mid_pattern = None  # 中间页的组合模式
         catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
-        total_pages = len(pdf_document.pages)
+        total_pages = len(pdf_document.pages) if isinstance(pdf_document, PdfReader) else pdf_document.page_count
         end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
         # ===== 遍历每一页,执行提取逻辑 =====
-        for i, page in enumerate(pdf_document.pages[:end_limit]):
-            text = page.extract_text() or ""
-            cleaned_text = clean_page_content(text, common_header)
+        for i in range(end_limit):
+            try:
+                text = pdf_document.pages[i].extract_text() if isinstance(pdf_document,
+                                                                          PdfReader) else pdf_document.load_page(i).get_text()
+                text = text or ""
+                cleaned_text = clean_page_content(text, common_header)
+            except Exception as e:
+                print(f"提取第 {i} 页文本时出错: {e}")
+                continue
 
             # 如果已经找到中间页,且当前页匹配排除模式,则跳过
             if exclusion_pattern and mid_page is not None and regex.search(exclusion_pattern, cleaned_text):

@@ -462,19 +624,33 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
             path2 = path1
         return path1, path2
 
+    try:
+        pdf_document = PdfReader(pdf_path)
+        total_pages = len(pdf_document.pages)
+    except Exception as e_pypdf2:
+        print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+        try:
+            pdf_document = fitz.open(pdf_path)
+            total_pages = pdf_document.page_count
+            # print("使用 PyMuPDF 成功读取 PDF 文件。")
+        except Exception as e_fitz:
+            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+            return "", ""  # 或者根据需求抛出异常
+
     # 第一次提取尝试,使用初始的 begin_pattern
     begin_pattern = regex.compile(
         r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
-        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
+        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$|'
+        r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
         regex.MULTILINE
     )
-    start_page, mid_page, end_page = run_extraction(begin_pattern, extraction_stage='first')
+    start_page, mid_page, end_page = run_extraction(pdf_document,begin_pattern, extraction_stage='first')
     # 如果第一次提取成功,保存并返回路径
     if start_page is not None and mid_page is not None and end_page is not None:
         return perform_extraction_and_save(start_page, mid_page, end_page)
 
     print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}")
-    start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
+    start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page)
 
     if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
         if end_page1==start_page2:

@@ -500,7 +676,8 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
     return "", ""
 
 
-def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
+def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
+    pdf_document=PdfReader(pdf_path)
     # 投标须知前附表占一章、投标须知正文占一章
     output_suffix = "tobidders_notice"
     begin_pattern = regex.compile(

@@ -517,7 +694,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
         r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
 
     # 提取第一部分
-    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
+    start_page1, end_page1 = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                                                     exclusion_pattern, output_suffix)
     if start_page1 is None or end_page1 is None:
         return None, None, None,None

@@ -540,7 +717,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
     # 如果不包含排除关键词,继续提取第二部分
     output_suffix='tobidders_notice2'
     begin_page=end_page1-1  #传入start_page2-1是因为非notice截取->start_page>begin_page
-    start_page2, end_page2 = extract_pages_generic(pdf_document, second_begin_pattern, end_pattern, begin_page, common_header,
+    start_page2, end_page2 = extract_pages_generic(pdf_path, second_begin_pattern, end_pattern, begin_page, common_header,
                                                     exclusion_pattern, output_suffix)
 
     if start_page2 is None or end_page2 is None:

@@ -557,14 +734,15 @@ def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpag
     end_pattern = regex.compile(
         r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # 第一部分
         r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # 否定前瞻部分
-        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',  # 第二部分
+        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'  # 第二部分
+        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
+        r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',  # 新增部分
         regex.MULTILINE
     )
     exclusion_pattern = regex.compile(
         r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
     output_suffix = 'notice'
-    pdf_document = PdfReader(pdf_path)
-    start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
+    start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                                                   exclusion_pattern, output_suffix, invalid_endpage)
     if start_page is None or end_page is None:
         print(f"{output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")

@@ -338,7 +338,32 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
         return parsed
     except Exception as e:
         return {"error": "调用大模型失败"}
+
+
+def check_consecutive_chinese_numerals(data_dict):
+    """
+    检查字典的键名中是否存在连续三个中文数字键名。此时通常视为clause1提取失败,直接调用大模型更可靠
+    返回:
+    bool: 如果存在连续三个中文数字键名,返回 False;否则返回 True。
+    """
+    # 定义中文数字集合
+    chinese_numerals = {"一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "十一", "十二", "十三"}
+
+    # 获取字典的键列表,保持插入顺序
+    keys = list(data_dict.keys())
+
+    # 初始化计数器
+    count = 0
+
+    # 遍历所有键名
+    for key in keys:
+        if key in chinese_numerals:
+            count += 1
+            if count >= 3:
+                print("有三个及以上连续中文数字!")
+                return False
+        else:
+            count = 0  # 重置计数器
+
+    return True  # 没有找到连续三个中文数字键名
+
 # 读取JSON数据,提取内容,转换结构,并打印结果
 def extract_from_notice(merged_baseinfo_path, clause_path, type):
     """

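A minimal sketch of check_consecutive_chinese_numerals on two hypothetical clause dictionaries; three consecutive bare Chinese-numeral keys signal that the clause1 extraction failed:

    ok_dict = {"一": "总则", "1.1": "定义", "二": "说明", "三": "要求"}
    bad_dict = {"一": "…", "二": "…", "三": "…", "四": "…"}
    print(check_consecutive_chinese_numerals(ok_dict))   # True,中文数字键没有连续出现三次
    print(check_consecutive_chinese_numerals(bad_dict))  # False,存在连续三个中文数字键
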
@@ -377,7 +402,7 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
     if clause_path and clause_path.strip():
         with open(clause_path, 'r', encoding='utf-8') as file:
             data = json.load(file)
-        if len(data) >= 60:  #默认clause中少于60条视为json提取失败!
+        if len(data) >= 60 and check_consecutive_chinese_numerals(data):  #默认clause中少于60条视为json提取失败!
             # 尝试使用大章节筛选
             extracted_data = extract_between_sections(data, target_values,flag)
             if extracted_data:

@@ -406,7 +431,7 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
 if __name__ == "__main__":
     # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
     merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf"
-    clause_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\clause1.json"
+    clause_path=r"D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp\clause1.json"
     try:
         res = extract_from_notice(merged_baseinfo_path,clause_path, 1)  # 可以改变此处的 type 参数测试不同的场景
         res2 = json.dumps(res, ensure_ascii=False, indent=4)

@@ -481,11 +481,11 @@ def convert_clause_to_json(file_path, output_folder, type=1):
 
 
 if __name__ == "__main__":
-    file_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf'
+    file_path = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp\ztbfile_tobidders_notice_part2.pdf'
     # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
     # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
-    output_folder = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444'
+    output_folder = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp'
     try:
         output_path = convert_clause_to_json(file_path, output_folder)
         print(f"Final JSON result saved to: {output_path}")

@@ -117,7 +117,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
 
 if __name__ == '__main__':
     # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    pdf_path=r"D:\flask_project\flask_app\static\output\output1\032c44dc-230c-4757-9160-c99f2eaac790\ztbfile_invalid.pdf"
+    pdf_path=r"D:\flask_project\flask_app\static\output\output1\139ecc93-bc85-4007-a4c5-1fb158a88719\ztbfile_invalid.pdf"
     # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
     # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"

@@ -1,4 +1,5 @@
 # flask_app/工程标/截取pdf工程标版.py
+import fitz
 import regex
 import os
 import time

@@ -11,10 +12,9 @@ from flask_app.general.通用功能函数 import get_global_logger
 
 def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,output_suffix,invalid_endpage=-1):
     try:
-        pdf_document = PdfReader(pdf_path)
         exclusion_pattern = regex.compile(
             r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
-        start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
+        start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page,
                                                      common_header, exclusion_pattern, output_suffix, invalid_endpage)
         if start_page is not None and end_page is not None:
             return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)

@@ -30,7 +30,21 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
     此函数将在所有模式对都失败后调用。
     """
     try:
-        pdf_document = PdfReader(pdf_path)
+        try:
+            pdf_document = PdfReader(pdf_path)
+            is_pypdf2 = True
+            total_pages = len(pdf_document.pages)
+            # print("使用 PyPDF2 成功读取 PDF 文件。")
+        except Exception as e_pypdf2:
+            print(f"extract_pages_twice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+            try:
+                pdf_document = fitz.open(pdf_path)
+                is_pypdf2 = False
+                total_pages = pdf_document.page_count
+                # print("使用 PyMuPDF 成功读取 PDF 文件。")
+            except Exception as e_fitz:
+                print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+                return ""
         if output_suffix == "qualification":
             print("最后一次尝试获取qualification!")
             # 动态设置 include_keys

@@ -49,9 +63,18 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
         )
         start_page = None
         end_page = None
+        def get_text(page_num):
+            try:
+                if is_pypdf2:
+                    return pdf_document.pages[page_num].extract_text() or ""
+                else:
+                    return pdf_document.load_page(page_num).get_text() or ""
+            except Exception as e:
+                print(f"提取第 {page_num} 页文本时出错: {e}")
+                return ""
         # 从指定的开始页开始遍历
-        for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
-            text = page.extract_text()
+        for i in range(last_begin_index, total_pages):
+            text = get_text(i)
             if text:
                 cleaned_text = clean_page_content(text, common_header)
                 # 确定起始页,需在last_begin_index之后

@@ -132,7 +155,7 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection, logger,
             r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()()]*?)'
             r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'  # 修改的部分
             r'[\u4e00-\u9fff、()()]*\s*$|'
-            r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)评标(方法|办法)前附表\s*$)',
+            r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)评标(方法|办法)(?:前附表)?\s*$)',
             # 第二种模式
             regex.MULTILINE
         ),

@@ -1,18 +1,18 @@
+import fitz
 from PyPDF2 import PdfReader, PdfWriter
 import regex  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
-from flask_app.general.clean_pdf import clean_page_content, extract_common_header
+from flask_app.general.clean_pdf import clean_page_content
 from flask_app.general.merge_pdfs import merge_and_cleanup
 from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
-    convert_to_pdf, get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice
+    get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice, create_get_text_function
 from flask_app.general.通用功能函数 import get_global_logger
 
 def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,output_suffix,logger,invalid_endpage=-1):
     try:
-        pdf_document = PdfReader(pdf_path)
         exclusion_pattern = regex.compile(
             r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
-        start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
+        start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page,
                                                      common_header, exclusion_pattern, output_suffix,invalid_endpage)
         if start_page is None or end_page is None:
             print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")

@@ -47,7 +47,7 @@ def get_patterns_for_evaluation_method():
         r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()()]*?)'
         r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'
         r'[\u4e00-\u9fff、()()]*\s*$|'
-        r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!“\s*)评标(方法|办法)前附表\s*$)',  # 第二种模式
+        r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!“\s*)评标(方法|办法)(?:前附表)?\s*$)',  # 第二种模式
         regex.MULTILINE
     )
     end_pattern = regex.compile(

@@ -64,7 +64,8 @@ def get_patterns_for_notice():
     )
     return begin_pattern, end_pattern
 
-def extract_pages_qualification(pdf_document, begin_page, common_header):
+def extract_pages_qualification(pdf_path, begin_page, common_header):
+    pdf_document = PdfReader(pdf_path)
     # 开始匹配模式,仅匹配“附录”、“附件”或“附表”
     begin_pattern = regex.compile(
         r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)(?!的)',

@@ -73,7 +74,7 @@ def extract_pages_qualification(pdf_path, begin_page, common_header):
 
     # 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
     priority_pattern = regex.compile(
-        r'^((资格|符合|资格性|符合性)(?:、(资格|符合|资格性|符合性))*)?(检查|审查)(?:表|比对表|对照表)?$',
+        r'^((资格|符合|资格性|符合性)(、(资格|符合|资格性|符合性))*)(检查|审查)(?:表|比对表|对照表)?\s*$',
         regex.MULTILINE
     )
 

@@ -94,12 +95,28 @@ def extract_pages_qualification(pdf_path, begin_page, common_header):
     include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"]
     exclude_keywords = ["声明函", "承诺函"]
 
+    try:
+        # 尝试使用 PyPDF2 读取 PDF
+        pdf_document = PdfReader(pdf_path)
+        total_pages = len(pdf_document.pages)
+        get_text = create_get_text_function('pypdf2', pdf_document)
+        # print("使用 PyPDF2 成功读取 PDF 文件。")
+    except Exception as e_pypdf2:
+        print(f"extract_pages_qualification:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+        try:
+            # 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
+            pdf_document = fitz.open(pdf_path)
+            total_pages = pdf_document.page_count
+            get_text = create_get_text_function('fitz', pdf_document)
+            # print("使用 PyMuPDF 成功读取 PDF 文件。")
+        except Exception as e_fitz:
+            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+            return None, None  # 或者根据需求抛出异常
     # 从指定的开始页开始遍历
-    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
-        text = page.extract_text()
+    for i in range(begin_page, total_pages):
+        text = get_text(i)
         if text:
             cleaned_text = clean_page_content(text, common_header)
 
             # 优先检查是否匹配优先模式
             priority_match = priority_pattern.search(cleaned_text)
             if priority_match and start_page is None:

@@ -142,7 +159,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
     try:
         exclusion_pattern = regex.compile(
             r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
-        pdf_document = PdfReader(pdf_path)
         patterns = None
         start_page = None
         end_page = None

@@ -154,10 +170,10 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
         elif output_suffix == "notice":
             patterns = [get_patterns_for_notice()]
         elif output_suffix == "qualification1":
-            start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
+            start_page, end_page = extract_pages_qualification(pdf_path, begin_page, common_header)
         if patterns:
             for pattern_pair in patterns:
-                start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
+                start_page, end_page = extract_pages_generic(pdf_path, pattern_pair[0], pattern_pair[1], begin_page,
                                                              common_header, exclusion_pattern, output_suffix)
                 if start_page is not None and end_page is not None:
                     break

@@ -315,12 +331,11 @@ if __name__ == "__main__":
     logger = get_global_logger("123")
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
     # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\需求\“天府粮仓”青白江区“一园三片”数字化提升和标准化体系建设项目(三次)招标文件(N510113202400010820250102001).pdf"
-
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目.pdf"
     # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\需求"
+    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp"
     # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 5  # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
+    selection = 3  # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
     generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
     print(generated_files)