diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index e44a222..c764a11 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -25,7 +25,7 @@ def extract_text_by_page_fitz(file_path): return result -def extract_text_by_page(file_path): +def extract_text_by_page_pypdf2(file_path): # common_header="" common_header = extract_common_header(file_path) # print(common_header) @@ -109,7 +109,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path): """ try: # 提取文本内容 - extracted_text = extract_text_by_page(pdf_path) + extracted_text = extract_text_by_page_pypdf2(pdf_path) # 将提取的文本写入TXT文件 with open(txt_path, 'w', encoding='utf-8') as txt_file: @@ -203,7 +203,7 @@ if __name__ == '__main__': ress = extract_common_header(pdf_path) print(ress) print("-----------------") - extract_text_by_page(pdf_path) + extract_text_by_page_pypdf2(pdf_path) # res=extract_text_by_page_fitz(pdf_path) # print(res)磋商文件_tobidders_notice_part2.pdf # save_extracted_text_to_txt(file_path,"output.txt") diff --git a/flask_app/old_version/资格评审前判断_old.py b/flask_app/old_version/资格评审前判断_old.py index eb3dc87..384a767 100644 --- a/flask_app/old_version/资格评审前判断_old.py +++ b/flask_app/old_version/资格评审前判断_old.py @@ -1,9 +1,9 @@ -from flask_app.general.读取文件.按页读取pdf import extract_text_by_page +from flask_app.general.读取文件.按页读取pdf import extract_text_by_page_pypdf2 def check_strings_in_pdf(file_path): judge_list=['施工机械设备', '企业信息登记'] # Read text from PDF - text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF + text = extract_text_by_page_pypdf2(file_path) # Assuming this returns all text from the PDF full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text # Initialize the questions list diff --git a/flask_app/routes/判断是否是招标文件.py b/flask_app/routes/判断是否是招标文件.py index 94d7b83..2becb60 100644 --- a/flask_app/routes/判断是否是招标文件.py +++ b/flask_app/routes/判断是否是招标文件.py @@ -1,7 +1,7 @@ import time from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2 from docx import Document - +import fitz from flask_app.general.llm.通义千问long import upload_file, qianwen_long def judge_zbfile_exec(file_path): @@ -12,28 +12,38 @@ def judge_zbfile_exec(file_path): start_time = time.time() # 检查文件是否为PDF格式 if file_path.lower().endswith('.pdf'): - # 使用 with 语句确保文件关闭 - with open(file_path, 'rb') as f: - reader = PdfReader(f) - num_pages = len(reader.pages) - if num_pages <= 5: - return False - elif file_path.lower().endswith('.docx'): - doc = Document(file_path) - accumulated_text = "" - chunk_size = 10 # 每次读取10个段落 - paragraphs = doc.paragraphs + try: + with open(file_path, 'rb') as f: + reader = PdfReader(f) + num_pages = len(reader.pages) + except Exception: + try: + doc = fitz.open(file_path) + num_pages = len(doc) + except Exception: + print("PDF 文件读取失败") + return False # 两种解析方式都失败,直接返回 False - for i in range(0, len(paragraphs), chunk_size): - chunk = paragraphs[i:i + chunk_size] - for para in chunk: - accumulated_text += para.text - # 判断累计字符数是否已经达到1000字, - if len(accumulated_text) >= 1000: - break - # 若累计内容不足1000字,则直接返回False - if len(accumulated_text) < 1000: - return False + if num_pages <= 5: + return False # 小于等于 5 页的 PDF 直接判定为非招标文件 + elif file_path.lower().endswith('.docx'): + try: + doc = Document(file_path) + accumulated_text = "" + chunk_size = 10 # 每次读取10个段落 + paragraphs = doc.paragraphs + + for i in range(0, len(paragraphs), chunk_size): + chunk = paragraphs[i:i + chunk_size] + for para in chunk: + accumulated_text += para.text + if len(accumulated_text) >= 1000: + break # 读取超过1000字后即可停止 + if len(accumulated_text) < 1000: + return False # 若累计内容不足1000字,则直接返回 False + except Exception: + print("DOCX 文件读取失败,可能为乱码文件") + return False # 解析失败直接返回 False # 使用大模型进行判断 user_query = """该文件是否属于招标文件?如果是的话,请返回'是',如果不是的话,返回'否'。请不要返回其他解释或内容。 以下是常见的招标文件类型: @@ -58,7 +68,7 @@ def judge_zbfile_exec(file_path): if __name__ == '__main__': start_time = time.time() - pdf_path = r"C:\Users\Administrator\Downloads\file1739842556194.docx" + pdf_path = r"C:\Users\Administrator\Desktop\fsdownload\19f53a17-ad4c-43b5-a7ed-981958ec3e0fs\ztbfile.docx" res = judge_zbfile_exec(pdf_path) if res: print("yes")