1.21解决bug

2025-01-21 19:20:26 +08:00 · 2025-01-21 19:20:26 +08:00 · e7051ea84e
commit e7051ea84e
parent 366a5b1fdb
5 changed files with 32 additions and 22 deletions
--- a/flask_app/general/多线程提问.py
+++ b/flask_app/general/多线程提问.py
@@ -40,7 +40,6 @@ prom = '请记住以下材料，他们对回答问题有帮助，请你简洁准
 def read_questions_from_file(file_path):
    questions = []
    current_question = ""
    current_number = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
@@ -59,8 +58,6 @@ def read_questions_from_file(file_path):
                if current_question:
                    questions.append(current_question.strip())
                # 开始新的问题
                current_number = int(match.group(1))
                # 提取问题内容，去掉编号和点
                current_question = line.split('.', 1)[1].strip() + "\n"
            else:
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@@ -80,7 +80,7 @@ def get_start_and_common_header(input_path, end_page):
                    continue  # 如果存在目录，跳过当前页面
                if begin_pattern.search(cleaned_text):
                    last_begin_index = i  # 更新第一个匹配的索引，页码从0开始
-                    print(f"匹配到起始模式，设置 last_begin_index 为: {last_begin_index}")
+                    # print(f"匹配到起始模式，设置 last_begin_index 为: {last_begin_index}")
                    return common_header, last_begin_index
        return common_header, last_begin_index
    except Exception as e:
@@ -273,7 +273,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
    # 定义结束模式
    end_patterns = [
        regex.compile(
-            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|'
+            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[：:]清标报告',
            regex.MULTILINE
        ),
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@@ -1,13 +1,15 @@
 # -*- encoding:utf-8 -*-
 import json
 import re
 import fitz
 from PyPDF2 import PdfReader
 import textwrap
 from flask_app.general.doubao import read_txt_to_string
 from flask_app.general.json_utils import clean_json_string
 from flask_app.general.model_continue_query import continue_answer, process_continue_answers
 from flask_app.general.截取pdf通用函数 import create_get_text_function
 from flask_app.general.通义千问long import upload_file, qianwen_long_stream, qianwen_plus
-from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
+from flask_app.general.clean_pdf import extract_common_header, clean_page_content
 from flask_app.general.format_change import docx2pdf, pdf2docx
 import concurrent.futures
 from flask_app.general.doubao import doubao_model
@@ -18,8 +20,19 @@ def find_exists(truncate_file, required_keys):
    # if not truncate_file:
    #     return ["技术要求", "商务要求", "服务要求", "其他要求"]
    common_header = extract_common_header(truncate_file)  # 假设该函数已定义
-    pdf_document = PdfReader(truncate_file)
+    try:
-
+        pdf_document = PdfReader(truncate_file)
+        pdf_lib = 'pypdf2'
+    except Exception as e:
+        print(f"使用 PyPDF2 读取失败，切换到 fitz。错误信息: {e}")
+        pdf_document = fitz.open(truncate_file)
+        pdf_lib = 'fitz'
+    get_text = create_get_text_function(pdf_lib, pdf_document)
+    # 获取总页数
+    if pdf_lib == 'pypdf2':
+        total_pages = len(pdf_document.pages)
+    else:  # fitz
+        total_pages = pdf_document.page_count
    # 定义正则模式
    begin_pattern = re.compile(
        r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*'  # 匹配“第X章”或“第X部分”
@@ -36,13 +49,13 @@ def find_exists(truncate_file, required_keys):
    end_pattern = re.compile(
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
-    # 只处理第一页和最后一页
+    # 处理第一页和最后一页
-    first_page = pdf_document.pages[0].extract_text() or ""
+    first_page_text = get_text(0)
-    last_page = pdf_document.pages[-1].extract_text() or ""
+    last_page_text = get_text(total_pages - 1)
    # 清理页面内容
-    first_page_clean = clean_page_content(first_page, common_header)
+    first_page_clean = clean_page_content(first_page_text, common_header)
-    last_page_clean = clean_page_content(last_page, common_header)
+    last_page_clean = clean_page_content(last_page_text, common_header)
    # 在第一页寻找起始位置
    start_match = re.search(begin_pattern, first_page_clean)
@@ -63,9 +76,9 @@ def find_exists(truncate_file, required_keys):
    # 获取中间页面的内容
    middle_content = ""
-    if len(pdf_document.pages) > 2:
+    if total_pages > 2:
-        for page_num in range(1, len(pdf_document.pages) - 1):
+        for page_num in range(1, total_pages - 1):
-            page_text = pdf_document.pages[page_num].extract_text() or ""
+            page_text = get_text(page_num)
            cleaned_text = clean_page_content(page_text, common_header)
            middle_content += cleaned_text + "\n"
--- a/flask_app/货物标/基础信息解析货物标版.py
+++ b/flask_app/货物标/基础信息解析货物标版.py
@@ -67,12 +67,12 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invali
 if __name__ == "__main__":
    start_time=time.time()
    # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
-    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\ztbfile_merged_baseinfo.pdf"
+    merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx"
    # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
    clause_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\clause1.json'
-    invalid_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\invalid_del.docx'
+    invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx"
    # res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
-    res=combine_basic_info(merged_baseinfo_path,"",clause_path,invalid_path)
+    res=combine_basic_info(merged_baseinfo_path,"","",invalid_path)
    print("------------------------------------")
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time=time.time()
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -319,11 +319,11 @@ if __name__ == "__main__":
    logger = get_global_logger("123")
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp"
+    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\tmp"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 3  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 6-invalid_path
+    selection = 1  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 6-invalid_path
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
    print(generated_files)