2.19 使用脚本额外判断docx文档是否为乱码文件

This commit is contained in:
zy123 2025-02-19 09:41:38 +08:00
parent d0dc142547
commit 6a2b1c2ffb
3 changed files with 38 additions and 28 deletions

View File

@ -25,7 +25,7 @@ def extract_text_by_page_fitz(file_path):
return result
def extract_text_by_page(file_path):
def extract_text_by_page_pypdf2(file_path):
# common_header=""
common_header = extract_common_header(file_path)
# print(common_header)
@ -109,7 +109,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
"""
try:
# 提取文本内容
extracted_text = extract_text_by_page(pdf_path)
extracted_text = extract_text_by_page_pypdf2(pdf_path)
# 将提取的文本写入TXT文件
with open(txt_path, 'w', encoding='utf-8') as txt_file:
@ -203,7 +203,7 @@ if __name__ == '__main__':
ress = extract_common_header(pdf_path)
print(ress)
print("-----------------")
extract_text_by_page(pdf_path)
extract_text_by_page_pypdf2(pdf_path)
# res=extract_text_by_page_fitz(pdf_path)
# print(res)磋商文件_tobidders_notice_part2.pdf
# save_extracted_text_to_txt(file_path,"output.txt")

View File

@ -1,9 +1,9 @@
from flask_app.general.读取文件.按页读取pdf import extract_text_by_page
from flask_app.general.读取文件.按页读取pdf import extract_text_by_page_pypdf2
def check_strings_in_pdf(file_path):
judge_list=['施工机械设备', '企业信息登记']
# Read text from PDF
text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF
text = extract_text_by_page_pypdf2(file_path) # Assuming this returns all text from the PDF
full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text
# Initialize the questions list

View File

@ -1,7 +1,7 @@
import time
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
from docx import Document
import fitz
from flask_app.general.llm.通义千问long import upload_file, qianwen_long
def judge_zbfile_exec(file_path):
@ -12,28 +12,38 @@ def judge_zbfile_exec(file_path):
start_time = time.time()
# 检查文件是否为PDF格式
if file_path.lower().endswith('.pdf'):
# 使用 with 语句确保文件关闭
with open(file_path, 'rb') as f:
reader = PdfReader(f)
num_pages = len(reader.pages)
if num_pages <= 5:
return False
elif file_path.lower().endswith('.docx'):
doc = Document(file_path)
accumulated_text = ""
chunk_size = 10 # 每次读取10个段落
paragraphs = doc.paragraphs
try:
with open(file_path, 'rb') as f:
reader = PdfReader(f)
num_pages = len(reader.pages)
except Exception:
try:
doc = fitz.open(file_path)
num_pages = len(doc)
except Exception:
print("PDF 文件读取失败")
return False # 两种解析方式都失败,直接返回 False
for i in range(0, len(paragraphs), chunk_size):
chunk = paragraphs[i:i + chunk_size]
for para in chunk:
accumulated_text += para.text
# 判断累计字符数是否已经达到1000字,
if len(accumulated_text) >= 1000:
break
# 若累计内容不足1000字则直接返回False
if len(accumulated_text) < 1000:
return False
if num_pages <= 5:
return False # 小于等于 5 页的 PDF 直接判定为非招标文件
elif file_path.lower().endswith('.docx'):
try:
doc = Document(file_path)
accumulated_text = ""
chunk_size = 10 # 每次读取10个段落
paragraphs = doc.paragraphs
for i in range(0, len(paragraphs), chunk_size):
chunk = paragraphs[i:i + chunk_size]
for para in chunk:
accumulated_text += para.text
if len(accumulated_text) >= 1000:
break # 读取超过1000字后即可停止
if len(accumulated_text) < 1000:
return False # 若累计内容不足1000字则直接返回 False
except Exception:
print("DOCX 文件读取失败,可能为乱码文件")
return False # 解析失败直接返回 False
# 使用大模型进行判断
user_query = """该文件是否属于招标文件?如果是的话,请返回'',如果不是的话,返回''。请不要返回其他解释或内容。
以下是常见的招标文件类型
@ -58,7 +68,7 @@ def judge_zbfile_exec(file_path):
if __name__ == '__main__':
start_time = time.time()
pdf_path = r"C:\Users\Administrator\Downloads\file1739842556194.docx"
pdf_path = r"C:\Users\Administrator\Desktop\fsdownload\19f53a17-ad4c-43b5-a7ed-981958ec3e0fs\ztbfile.docx"
res = judge_zbfile_exec(pdf_path)
if res:
print("yes")