diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/clean_pdf.py index d55dc6d..c727b85 100644 --- a/flask_app/general/clean_pdf.py +++ b/flask_app/general/clean_pdf.py @@ -1,9 +1,6 @@ import re from PyPDF2 import PdfReader -from flask_app.general.format_change import docx2pdf - - def extract_common_header(pdf_path): def get_headers(pdf_document, start_page, pages_to_read): @@ -84,10 +81,6 @@ def clean_page_content(text, common_header): text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 return text - -from PyPDF2 import PdfReader - - def is_scanned_pdf(file_path, max_pages=15): """ 检查 PDF 是否为扫描件(即前 15 页无文本)。 @@ -109,19 +102,6 @@ def is_scanned_pdf(file_path, max_pages=15): return False # 不是扫描型 return True # 前 max_pages 页都没有文本 -def get_pdf_page_count(file_path): - """ - 获取 PDF 文件的页码数量 - """ - try: - pdf_path=file_path - if file_path.lower().endswith(('.doc', '.docx')): - pdf_path = docx2pdf(file_path) - reader = PdfReader(pdf_path) - return len(reader.pages) - except Exception as e: - print(f"读取 PDF 页码时出错:{e}") - return 0 if __name__ == '__main__': file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf" res=is_scanned_pdf(file_path) diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 760bdf1..de994d7 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -2,6 +2,7 @@ import json import os import mimetypes import requests +from PyPDF2 import PdfReader from flask_app.general.clean_pdf import is_scanned_pdf def download_file(url, local_filename): @@ -173,6 +174,20 @@ def docx2pdf(local_path_in): print(f"format_change d2p:have downloaded file to: {downloaded_filepath}") return downloaded_filepath +def get_pdf_page_count(file_path): + """ + 获取 PDF 文件的页码数量 + """ + try: + pdf_path=file_path + if file_path.lower().endswith(('.doc', '.docx')): + pdf_path = docx2pdf(file_path) + reader = PdfReader(pdf_path) + return len(reader.pages) + except Exception as e: + print(f"读取 PDF 页码时出错:{e}") + return 0 + # def docx2pdf(file_path): # """ # 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。 diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index b205363..0cec0a2 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -1,7 +1,7 @@ import concurrent.futures import json import time -from flask_app.general.clean_pdf import get_pdf_page_count +from flask_app.general.format_change import get_pdf_page_count from flask_app.general.doubao import pdf2txt from flask_app.general.file2markdown import convert_file_to_markdown from flask_app.general.format_change import pdf2docx