From 772aaa17f02ea09c636c2b4ca9af547989ca53eb Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 24 Oct 2024 20:51:48 +0800 Subject: [PATCH] 10.24 --- flask_app/general/format_change.py | 234 +++++++++++----------- flask_app/general/读取文件/按页读取pdf.py | 2 +- flask_app/testdir/截取文件格式.py | 160 +++++++++------ 3 files changed, 215 insertions(+), 181 deletions(-) diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 14f99f0..01bacc4 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -49,130 +49,132 @@ def pdf2docx(local_path_in): print(f"format_change p2d:have downloaded file to: {downloaded_filepath}") return downloaded_filepath -def doc2docx(local_path_in): - remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d' - receive_download_url = upload_file(local_path_in, remote_url) - print(receive_download_url) - filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 - local_filename = os.path.join(folder, filename) # 输出文件名 - downloaded_filepath, file_type = download_file(receive_download_url, local_filename) - print(f"format_change d2d:have downloaded file to: {downloaded_filepath}") - return downloaded_filepath -def docx2pdf(local_path_in): - remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p' - receive_download_url = upload_file(local_path_in, remote_url) - filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 - local_filename = os.path.join(folder, filename) # 输出文件名 - downloaded_filepath,file_type = download_file(receive_download_url, local_filename) - print(f"format_change d2p:have downloaded file to: {downloaded_filepath}") - return downloaded_filepath -# def docx2pdf(file_path): -# """ -# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。 -# -# 参数: -# - file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。 -# """ -# # 检查文件是否存在 -# if not os.path.isfile(file_path): -# raise FileNotFoundError(f"文件未找到: {file_path}") -# -# # 获取文件名和扩展名 -# base_name = os.path.basename(file_path) -# name, ext = os.path.splitext(base_name) -# ext = ext.lower().lstrip('.') -# -# if ext not in ['docx', 'doc']: -# raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}") -# -# # 定义转换接口 -# endpoint = 'http://120.26.236.97:5008/convert_to_pdf' -# -# # 获取文件所在目录 -# output_dir = os.path.dirname(file_path) -# -# # 准备上传的文件 -# with open(file_path, 'rb') as f: -# files = {'file': (base_name, f)} -# try: -# print(f"正在将 {base_name} 转换为 .pdf 格式...") -# response = requests.post(endpoint, files=files) -# response.raise_for_status() # 检查请求是否成功 -# except requests.RequestException as e: -# print(f"转换过程中发生错误: {e}") -# return -# -# # 准备保存转换后文件的路径 -# output_file_name = f"{name}.pdf" -# output_path = os.path.join(output_dir, output_file_name) -# -# # 保存转换后的文件 -# with open(output_path, 'wb') as out_file: -# out_file.write(response.content) -# -# print(f"文件已成功转换并保存至: {output_path}") -# return output_path -# -# -# def doc2docx(file_path): -# """ -# 将本地的 .doc 文件转换为 .docx 文件。 -# -# 参数: -# - file_path: str, 本地文件的路径,支持 .doc 格式。 -# """ -# # 检查文件是否存在 -# if not os.path.isfile(file_path): -# raise FileNotFoundError(f"文件未找到: {file_path}") -# -# # 获取文件名和扩展名 -# base_name = os.path.basename(file_path) -# name, ext = os.path.splitext(base_name) -# ext = ext.lower().lstrip('.') -# -# if ext != 'doc': -# raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}") -# -# # 定义转换接口 -# endpoint = 'http://120.26.236.97:5008/convert_to_docx' -# -# # 获取文件所在目录 -# output_dir = os.path.dirname(file_path) -# -# # 准备上传的文件 -# with open(file_path, 'rb') as f: -# files = {'file': (base_name, f)} -# try: -# print(f"正在将 {base_name} 转换为 .docx 格式...") -# response = requests.post(endpoint, files=files) -# response.raise_for_status() # 检查请求是否成功 -# except requests.RequestException as e: -# print(f"转换过程中发生错误: {e}") -# return -# -# # 准备保存转换后文件的路径 -# output_file_name = f"{name}.docx" -# output_path = os.path.join(output_dir, output_file_name) -# -# # 保存转换后的文件 -# with open(output_path, 'wb') as out_file: -# out_file.write(response.content) -# -# print(f"文件已成功转换并保存至: {output_path}") -# return output_path +# def doc2docx(local_path_in): +# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d' +# receive_download_url = upload_file(local_path_in, remote_url) +# print(receive_download_url) +# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 +# local_filename = os.path.join(folder, filename) # 输出文件名 +# downloaded_filepath, file_type = download_file(receive_download_url, local_filename) +# print(f"format_change d2d:have downloaded file to: {downloaded_filepath}") +# return downloaded_filepath +# def docx2pdf(local_path_in): +# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p' +# receive_download_url = upload_file(local_path_in, remote_url) +# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 +# local_filename = os.path.join(folder, filename) # 输出文件名 +# downloaded_filepath,file_type = download_file(receive_download_url, local_filename) +# print(f"format_change d2p:have downloaded file to: {downloaded_filepath}") +# return downloaded_filepath + +def docx2pdf(file_path): + """ + 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。 + + 参数: + - file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。 + """ + # 检查文件是否存在 + if not os.path.isfile(file_path): + raise FileNotFoundError(f"文件未找到: {file_path}") + + # 获取文件名和扩展名 + base_name = os.path.basename(file_path) + name, ext = os.path.splitext(base_name) + ext = ext.lower().lstrip('.') + + if ext not in ['docx', 'doc']: + raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}") + + # 定义转换接口 + endpoint = 'http://120.26.236.97:5008/convert_to_pdf' + # endpoint = 'http://192.168.0.2:5008/convert_to_pdf' + + # 获取文件所在目录 + output_dir = os.path.dirname(file_path) + + # 准备上传的文件 + with open(file_path, 'rb') as f: + files = {'file': (base_name, f)} + try: + print(f"正在将 {base_name} 转换为 .pdf 格式...") + response = requests.post(endpoint, files=files) + response.raise_for_status() # 检查请求是否成功 + except requests.RequestException as e: + print(f"转换过程中发生错误: {e}") + return + + # 准备保存转换后文件的路径 + output_file_name = f"{name}.pdf" + output_path = os.path.join(output_dir, output_file_name) + + # 保存转换后的文件 + with open(output_path, 'wb') as out_file: + out_file.write(response.content) + + print(f"文件已成功转换并保存至: {output_path}") + return output_path + + +def doc2docx(file_path): + """ + 将本地的 .doc 文件转换为 .docx 文件。 + + 参数: + - file_path: str, 本地文件的路径,支持 .doc 格式。 + """ + # 检查文件是否存在 + if not os.path.isfile(file_path): + raise FileNotFoundError(f"文件未找到: {file_path}") + + # 获取文件名和扩展名 + base_name = os.path.basename(file_path) + name, ext = os.path.splitext(base_name) + ext = ext.lower().lstrip('.') + + if ext != 'doc': + raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}") + + # 定义转换接口 + endpoint = 'http://120.26.236.97:5008/convert_to_docx' + + # 获取文件所在目录 + output_dir = os.path.dirname(file_path) + + # 准备上传的文件 + with open(file_path, 'rb') as f: + files = {'file': (base_name, f)} + try: + print(f"正在将 {base_name} 转换为 .docx 格式...") + response = requests.post(endpoint, files=files) + response.raise_for_status() # 检查请求是否成功 + except requests.RequestException as e: + print(f"转换过程中发生错误: {e}") + return + + # 准备保存转换后文件的路径 + output_file_name = f"{name}.docx" + output_path = os.path.join(output_dir, output_file_name) + + # 保存转换后的文件 + with open(output_path, 'wb') as out_file: + out_file.write(response.content) + + print(f"文件已成功转换并保存至: {output_path}") + return output_path if __name__ == '__main__': # 替换为你的文件路径和API URL # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf" - local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx" + # local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx" # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf" + local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf" # downloaded_file=doc2docx(local_path_in) - # downloaded_file=pdf2docx(local_path_in) - for i in range(1): - downloaded_file=docx2pdf(local_path_in) - print(downloaded_file) + downloaded_file=pdf2docx(local_path_in) + # downloaded_file=docx2pdf(local_path_in) + print(downloaded_file) diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 20754a2..92462af 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -153,7 +153,7 @@ if __name__ == '__main__': # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' - file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf' + file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\\zbtest12.pdf" # ress = extract_common_header(file_path) # print(ress) res=extract_text_by_page(file_path) diff --git a/flask_app/testdir/截取文件格式.py b/flask_app/testdir/截取文件格式.py index 1671fb5..b069ed1 100644 --- a/flask_app/testdir/截取文件格式.py +++ b/flask_app/testdir/截取文件格式.py @@ -1,7 +1,6 @@ from PyPDF2 import PdfReader, PdfWriter import re # 导入正则表达式库 import os # 用于文件和文件夹操作 -# from flask_app.general.format_change import docx2pdf # 确保此模块可用 def clean_page_content(text, common_header): """ @@ -13,14 +12,14 @@ def clean_page_content(text, common_header): # 替换首次出现的完整行 text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1) - # 删除页码,合并多个正则表达式为一个 - text = re.sub( - r'(^\s*\d+\s*(?=\D))|(\s+\d+\s*$)|(\s*/\s*\d+\s*)|(\s*[—-]\s*\d+\s*[—-]\s*)', - '', - text, - flags=re.MULTILINE - ) + # 删除页码,支持多种格式 + text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 + text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 + text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 + text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 return text + + def extract_common_header(pdf_path): """ 从PDF的前几页提取公共抬头。 @@ -39,13 +38,19 @@ def extract_common_header(pdf_path): if text: # 只取每页的前三行 first_lines = text.strip().split('\n')[:3] - headers.append(set(line.strip() for line in first_lines if line.strip())) + headers.append(first_lines) if len(headers) < 2: return "" # 如果没有足够的页来比较,返回空字符串 # 寻找每一行中的公共部分,按顺序保留 - common_headers = set.intersection(*headers) + common_headers = [] + for lines in zip(*headers): + first_words = lines[0].split() + common_line = [word for word in first_words if all(word in line.split() for line in lines[1:])] + if common_line: + common_headers.append(' '.join(common_line)) + return '\n'.join(common_headers) @@ -67,13 +72,17 @@ def is_pdf_or_doc(filename): def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix): """ - 将提取的页面保存为新的PDF文件。 + 将提取的页面保存为新的PDF文件,统一命名为 "bid_format.pdf" 并存放在独立的子文件夹中。 """ try: - base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] - output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + output_pdf_path = os.path.join(output_folder, "bid_format.pdf") - if start_page is None or end_page is None or start_page > end_page: + # 检查 start_page 和 end_page 是否为 None + if start_page is None or end_page is None: + print(f"起始页或结束页为 None: start_page={start_page}, end_page={end_page} 文件: {pdf_path}") + return "" + + if start_page < 0 or end_page >= len(pdf_document.pages) or start_page > end_page: print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}") return "" @@ -89,29 +98,57 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo return "" # 返回空字符串 -def find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header): +def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header): """ - 查找起始页和结束页的索引。 + 通用函数,根据模式提取起始页和结束页。 """ start_page = None end_page = None - total_pages = len(pdf_document.pages) - for i, page in enumerate(pdf_document.pages): - if i <= begin_page: - continue text = page.extract_text() or "" cleaned_text = clean_page_content(text, common_header) - if start_page is None and re.search(begin_pattern, cleaned_text): + # print(cleaned_text) + if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: start_page = i - continue - if start_page is not None and re.search(end_pattern, cleaned_text): + if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page: end_page = i break + return start_page, end_page - return start_page, end_page if end_page else total_pages - 1 - - +def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): + pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) - 1 # 获取总页数 + begin_page = 10 + start_page = None + end_page = None + begin_patterns = [ + re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', re.MULTILINE), + re.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*") + ] + exclusion_pattern = re.compile(r'文件的构成|文件的组成|目录|评标办法|技术标准|其他材料|合同') + end_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) + for begin_pattern in begin_patterns: + for i, page in enumerate(pdf_document.pages): + text = page.extract_text() or "" + cleaned_text = clean_page_content(text, common_header) + if exclusion_pattern and re.search(exclusion_pattern, cleaned_text): + continue + if re.search(begin_pattern, cleaned_text) and i > begin_page: + start_page = i + if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page: + end_page = i + break + if start_page is not None: + break + if start_page is None: + print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}") + return "" + if end_page is None: + # 如果未匹配到结束页,默认截取到文件末尾 + end_page = total_pages + print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}") + return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): """ 根据开始和结束模式提取PDF页面。 @@ -119,31 +156,20 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter try: common_header = extract_common_header(pdf_path) pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) - 1 # 获取总页数 - start_page, end_page = find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header) + # 提取起始页和结束页 + start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header) - if output_suffix == "format" and start_page is None: - print(f"{output_suffix}: 未找到起始页,尝试二次提取! 文件: {pdf_path}") - # 二次提取逻辑,保留原有功能 - start_page, end_page = find_page_indices( - pdf_document, - re.compile(r'^(?:响应|投标).*?格式.*', re.MULTILINE), - end_pattern, - begin_page, - common_header - ) + if output_suffix == "format": if start_page is None: - print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}") - return "" - - if start_page is None: - print(f"未找到起始页,提取失败! 文件: {pdf_path}") - return "" - - if end_page is None: - end_page = len(pdf_document.pages) - 1 - print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}") - + print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path} 尝试二次提取!") + return extract_pages_twice(pdf_path,output_folder,output_suffix,common_header) + if end_page is None: + # 如果未匹配到结束页,默认截取到文件末尾 + end_page = total_pages + print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}") + return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) except Exception as e: print(f"Error processing {pdf_path}: {e}") @@ -156,7 +182,8 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte """ # pdf_path = convert_to_pdf(file_path) pdf_path=file_path - return extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or "" + result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) + return result or "" def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): @@ -172,47 +199,52 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt file_path = os.path.join(input_path, file_name) if is_pdf_or_doc(file_path): result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - generated_files.append(result) + if isinstance(result, tuple): + generated_files.extend([f if f else "" for f in result]) # 保留空字符串 + else: + generated_files.append(result) # 直接添加result,可能是空字符串 elif os.path.isfile(input_path) and is_pdf_or_doc(input_path): result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - generated_files.append(result) + if isinstance(result, tuple): + generated_files.extend([f if f else "" for f in result]) # 保留空字符串 + else: + generated_files.append(result) # 直接添加result,可能是空字符串 else: - print("提供的路径既不是文件夹也不是PDF/Word文件。") - return [f for f in generated_files if f] # 过滤空字符串 + print("提供的路径既不是文件夹也不是PDF文件。") + + return generated_files -def truncate_pdf_main(input_path, output_folder, selection): +def truncate_pdf_main(input_path, output_folder, selection,begin_page=10): """ 主函数,根据选择截取PDF内容。 """ try: if selection == 1: # 投标文件格式 - begin_page = 10 begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*' ) end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', - re.MULTILINE + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE ) local_output_suffix = "format" else: print("无效的选择:请选择1") - return [] + return None # 调用相应的处理函数 - return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix) + return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix) or "" except Exception as e: print(f"Error in truncate_pdf_main: {e}") - return [] + return "" # 返回空字符串 if __name__ == "__main__": # 定义输入和输出路径 - # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" - input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹" + # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(2).pdf" + input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest12.pdf" output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹" selection = 1 # 1 - 投标文件格式 # 执行截取 generated_files = truncate_pdf_main(input_path, output_folder, selection) - # print("生成的文件:", generated_files) + print("生成的文件:", generated_files)