Merge branch '12.18doubao' into develop
This commit is contained in:
commit
9015ad525b
@ -1,9 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
from flask_app.general.format_change import docx2pdf
|
|
||||||
|
|
||||||
|
|
||||||
def extract_common_header(pdf_path):
|
def extract_common_header(pdf_path):
|
||||||
|
|
||||||
def get_headers(pdf_document, start_page, pages_to_read):
|
def get_headers(pdf_document, start_page, pages_to_read):
|
||||||
@ -84,10 +81,6 @@ def clean_page_content(text, common_header):
|
|||||||
text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
from PyPDF2 import PdfReader
|
|
||||||
|
|
||||||
|
|
||||||
def is_scanned_pdf(file_path, max_pages=15):
|
def is_scanned_pdf(file_path, max_pages=15):
|
||||||
"""
|
"""
|
||||||
检查 PDF 是否为扫描件(即前 15 页无文本)。
|
检查 PDF 是否为扫描件(即前 15 页无文本)。
|
||||||
@ -109,19 +102,6 @@ def is_scanned_pdf(file_path, max_pages=15):
|
|||||||
return False # 不是扫描型
|
return False # 不是扫描型
|
||||||
return True # 前 max_pages 页都没有文本
|
return True # 前 max_pages 页都没有文本
|
||||||
|
|
||||||
def get_pdf_page_count(file_path):
|
|
||||||
"""
|
|
||||||
获取 PDF 文件的页码数量
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
pdf_path=file_path
|
|
||||||
if file_path.lower().endswith(('.doc', '.docx')):
|
|
||||||
pdf_path = docx2pdf(file_path)
|
|
||||||
reader = PdfReader(pdf_path)
|
|
||||||
return len(reader.pages)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"读取 PDF 页码时出错:{e}")
|
|
||||||
return 0
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
||||||
res=is_scanned_pdf(file_path)
|
res=is_scanned_pdf(file_path)
|
||||||
|
@ -2,6 +2,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import requests
|
import requests
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
from flask_app.general.clean_pdf import is_scanned_pdf
|
from flask_app.general.clean_pdf import is_scanned_pdf
|
||||||
def download_file(url, local_filename):
|
def download_file(url, local_filename):
|
||||||
@ -172,6 +173,20 @@ def docx2pdf(local_path_in):
|
|||||||
print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
|
print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
|
||||||
return downloaded_filepath
|
return downloaded_filepath
|
||||||
|
|
||||||
|
def get_pdf_page_count(file_path):
|
||||||
|
"""
|
||||||
|
获取 PDF 文件的页码数量
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
pdf_path=file_path
|
||||||
|
if file_path.lower().endswith(('.doc', '.docx')):
|
||||||
|
pdf_path = docx2pdf(file_path)
|
||||||
|
reader = PdfReader(pdf_path)
|
||||||
|
return len(reader.pages)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"读取 PDF 页码时出错:{e}")
|
||||||
|
return 0
|
||||||
|
|
||||||
# def docx2pdf(file_path):
|
# def docx2pdf(file_path):
|
||||||
# """
|
# """
|
||||||
# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
|
# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
from flask_app.general.clean_pdf import get_pdf_page_count
|
from flask_app.general.format_change import get_pdf_page_count
|
||||||
from flask_app.general.doubao import pdf2txt
|
from flask_app.general.doubao import pdf2txt
|
||||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||||
from flask_app.general.format_change import pdf2docx
|
from flask_app.general.format_change import pdf2docx
|
||||||
|
Loading…
x
Reference in New Issue
Block a user