From e7051ea84eb172e13c307b9a3d2144c514a1643e Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 21 Jan 2025 19:20:26 +0800 Subject: [PATCH] =?UTF-8?q?1.21=E8=A7=A3=E5=86=B3bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/多线程提问.py | 3 -- flask_app/general/截取pdf通用函数.py | 4 +-- flask_app/货物标/商务服务其他要求提取.py | 35 ++++++++++++++++-------- flask_app/货物标/基础信息解析货物标版.py | 6 ++-- flask_app/货物标/截取pdf货物标版.py | 6 ++-- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/flask_app/general/多线程提问.py b/flask_app/general/多线程提问.py index 71f3402..6eb302e 100644 --- a/flask_app/general/多线程提问.py +++ b/flask_app/general/多线程提问.py @@ -40,7 +40,6 @@ prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准 def read_questions_from_file(file_path): questions = [] current_question = "" - current_number = 0 with open(file_path, 'r', encoding='utf-8') as file: for line in file: @@ -59,8 +58,6 @@ def read_questions_from_file(file_path): if current_question: questions.append(current_question.strip()) - # 开始新的问题 - current_number = int(match.group(1)) # 提取问题内容,去掉编号和点 current_question = line.split('.', 1)[1].strip() + "\n" else: diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index d81ab00..fc77ea0 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -80,7 +80,7 @@ def get_start_and_common_header(input_path, end_page): continue # 如果存在目录,跳过当前页面 if begin_pattern.search(cleaned_text): last_begin_index = i # 更新第一个匹配的索引,页码从0开始 - print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}") + # print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}") return common_header, last_begin_index return common_header, last_begin_index except Exception as e: @@ -273,7 +273,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page): # 定义结束模式 end_patterns = [ regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|' + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|' r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告', regex.MULTILINE ), diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index 3e6dddb..888fa7c 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -1,13 +1,15 @@ # -*- encoding:utf-8 -*- import json import re +import fitz from PyPDF2 import PdfReader import textwrap from flask_app.general.doubao import read_txt_to_string from flask_app.general.json_utils import clean_json_string from flask_app.general.model_continue_query import continue_answer, process_continue_answers +from flask_app.general.截取pdf通用函数 import create_get_text_function from flask_app.general.通义千问long import upload_file, qianwen_long_stream, qianwen_plus -from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content +from flask_app.general.clean_pdf import extract_common_header, clean_page_content from flask_app.general.format_change import docx2pdf, pdf2docx import concurrent.futures from flask_app.general.doubao import doubao_model @@ -18,8 +20,19 @@ def find_exists(truncate_file, required_keys): # if not truncate_file: # return ["技术要求", "商务要求", "服务要求", "其他要求"] common_header = extract_common_header(truncate_file) # 假设该函数已定义 - pdf_document = PdfReader(truncate_file) - + try: + pdf_document = PdfReader(truncate_file) + pdf_lib = 'pypdf2' + except Exception as e: + print(f"使用 PyPDF2 读取失败,切换到 fitz。错误信息: {e}") + pdf_document = fitz.open(truncate_file) + pdf_lib = 'fitz' + get_text = create_get_text_function(pdf_lib, pdf_document) + # 获取总页数 + if pdf_lib == 'pypdf2': + total_pages = len(pdf_document.pages) + else: # fitz + total_pages = pdf_document.page_count # 定义正则模式 begin_pattern = re.compile( r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” @@ -36,13 +49,13 @@ def find_exists(truncate_file, required_keys): end_pattern = re.compile( r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE) - # 只处理第一页和最后一页 - first_page = pdf_document.pages[0].extract_text() or "" - last_page = pdf_document.pages[-1].extract_text() or "" + # 处理第一页和最后一页 + first_page_text = get_text(0) + last_page_text = get_text(total_pages - 1) # 清理页面内容 - first_page_clean = clean_page_content(first_page, common_header) - last_page_clean = clean_page_content(last_page, common_header) + first_page_clean = clean_page_content(first_page_text, common_header) + last_page_clean = clean_page_content(last_page_text, common_header) # 在第一页寻找起始位置 start_match = re.search(begin_pattern, first_page_clean) @@ -63,9 +76,9 @@ def find_exists(truncate_file, required_keys): # 获取中间页面的内容 middle_content = "" - if len(pdf_document.pages) > 2: - for page_num in range(1, len(pdf_document.pages) - 1): - page_text = pdf_document.pages[page_num].extract_text() or "" + if total_pages > 2: + for page_num in range(1, total_pages - 1): + page_text = get_text(page_num) cleaned_text = clean_page_content(page_text, common_header) middle_content += cleaned_text + "\n" diff --git a/flask_app/货物标/基础信息解析货物标版.py b/flask_app/货物标/基础信息解析货物标版.py index fbc2bc9..88aae9c 100644 --- a/flask_app/货物标/基础信息解析货物标版.py +++ b/flask_app/货物标/基础信息解析货物标版.py @@ -67,12 +67,12 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invali if __name__ == "__main__": start_time=time.time() # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf" - merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\ztbfile_merged_baseinfo.pdf" + merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx" # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf" clause_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\clause1.json' - invalid_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\invalid_del.docx' + invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx" # res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path) - res=combine_basic_info(merged_baseinfo_path,"",clause_path,invalid_path) + res=combine_basic_info(merged_baseinfo_path,"","",invalid_path) print("------------------------------------") print(json.dumps(res, ensure_ascii=False, indent=4)) end_time=time.time() diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 445fada..096c81b 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -319,11 +319,11 @@ if __name__ == "__main__": logger = get_global_logger("123") # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" - pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目.pdf" + pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" - output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp" + output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\tmp" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" - selection = 3 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path + selection = 1 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) print(generated_files)