Merge branch '12.18doubao' into develop

This commit is contained in:
zy123 2024-12-19 14:45:55 +08:00
commit 9015ad525b
3 changed files with 16 additions and 21 deletions

View File

@ -1,9 +1,6 @@
import re
from PyPDF2 import PdfReader
from flask_app.general.format_change import docx2pdf
def extract_common_header(pdf_path):
def get_headers(pdf_document, start_page, pages_to_read):
@ -84,10 +81,6 @@ def clean_page_content(text, common_header):
text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
return text
from PyPDF2 import PdfReader
def is_scanned_pdf(file_path, max_pages=15):
"""
检查 PDF 是否为扫描件即前 15 页无文本
@ -109,19 +102,6 @@ def is_scanned_pdf(file_path, max_pages=15):
return False # 不是扫描型
return True # 前 max_pages 页都没有文本
def get_pdf_page_count(file_path):
    """
    Return the number of pages in a PDF file.

    A path ending in ``.doc`` or ``.docx`` (case-insensitive) is first handed
    to ``docx2pdf`` and the path it returns is read instead. Any exception
    raised along the way is printed and reported as a page count of 0.
    """
    word_suffixes = ('.doc', '.docx')
    try:
        is_word_file = file_path.lower().endswith(word_suffixes)
        # Word documents go through docx2pdf first; other paths are read as-is.
        source = docx2pdf(file_path) if is_word_file else file_path
        return len(PdfReader(source).pages)
    except Exception as e:
        print(f"读取 PDF 页码时出错:{e}")
        return 0
if __name__ == '__main__':
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
res=is_scanned_pdf(file_path)

View File

@ -2,6 +2,7 @@ import json
import os
import mimetypes
import requests
from PyPDF2 import PdfReader
from flask_app.general.clean_pdf import is_scanned_pdf
def download_file(url, local_filename):
@ -172,6 +173,20 @@ def docx2pdf(local_path_in):
print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath
def get_pdf_page_count(file_path):
    """
    Return the number of pages in a PDF file.

    A path ending in ``.doc`` or ``.docx`` (case-insensitive) is first handed
    to ``docx2pdf`` and the path it returns is read instead. Any exception
    raised along the way is printed and reported as a page count of 0.
    """
    word_suffixes = ('.doc', '.docx')
    try:
        is_word_file = file_path.lower().endswith(word_suffixes)
        # Word documents go through docx2pdf first; other paths are read as-is.
        source = docx2pdf(file_path) if is_word_file else file_path
        return len(PdfReader(source).pages)
    except Exception as e:
        print(f"读取 PDF 页码时出错:{e}")
        return 0
# def docx2pdf(file_path):
# """
# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。

View File

@ -1,7 +1,7 @@
import concurrent.futures
import json
import time
from flask_app.general.clean_pdf import get_pdf_page_count
from flask_app.general.format_change import get_pdf_page_count
from flask_app.general.doubao import pdf2txt
from flask_app.general.file2markdown import convert_file_to_markdown
from flask_app.general.format_change import pdf2docx