10.17 小解析货物标

2024-10-17 19:07:57 +08:00 · 2024-10-17 19:07:57 +08:00 · abd972df3a
commit abd972df3a
parent 8255070f60
6 changed files with 233 additions and 90 deletions
--- a/flask_app/general/little_zbparse.py
+++ b/flask_app/general/little_zbparse.py
@ -11,7 +11,8 @@ from flask_app.main.基础信息整合 import judge_consortium_bidding
 from flask_app.main.多线程提问 import read_questions_from_file, multi_threading
 from flask_app.main.通义千问long import upload_file
 from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
-from flask_app.货物标.货物标截取pdf import truncate_pdf_specific
+from flask_app.货物标.货物标截取pdf import truncate_pdf_specific_goods
+from flask_app.main.截取pdf import truncate_pdf_specific_engineering
 from flask_app.main.post_processing import inner_post_processing

 def get_global_logger(unique_id):
@ -36,8 +37,8 @@ def merge(merged):

 def get_goods_baseinfo(baseinfo_file_path):
    file_id = upload_file(baseinfo_file_path)
-    # baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
-    baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
+    baseinfo_file_path='flask_app/static/提示词/基本信息货物标.txt'
+    # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息货物标.txt'
    questions = read_questions_from_file(baseinfo_file_path)
    more_query = "请你根据招标文件信息，回答以下问题：是否需要递交投标保证金（或磋商保证金）？请按json格式给我提供信息，键名分为'是否递交投标保证金'（或'是否递交磋商保证金'）,键值仅限于'是','否','未知',若存在矛盾信息，请回答'未知'。"
    questions.append(more_query)
@ -45,7 +46,7 @@ def get_goods_baseinfo(baseinfo_file_path):
    baseinfo_list = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []
    type,merged=merge(baseinfo_list.pop())
    if type:
-        judge_questions="根据招标文件第二章投标人须知，该项目投标保证金（或磋商保证金）的内容或要求是什么？请按json格式给我提供信息，外层键名为'投标保证金'（或'磋商保证金'），若需要以嵌套键值对返回结果，那么嵌套键名为你对相应要求的总结，而对应键值需要完全与原文保持一致。"
+        judge_questions=["根据招标文件第二章投标人须知，该项目投标保证金（或磋商保证金）的内容或要求是什么？请按json格式给我提供信息，外层键名为'投标保证金'（或'磋商保证金'），若需要以嵌套键值对返回结果，那么嵌套键名为你对相应要求的总结，而对应键值需要完全与原文保持一致。"]
        res2 = multi_threading(judge_questions, "", file_id, 2)  # 调用千问-long
        if not res2:
            print("基础信息整合： multi_threading error!")
@ -59,11 +60,13 @@ def get_goods_baseinfo(baseinfo_file_path):

 #货物标
 def little_parse_goods(output_folder,file_path):
-    files=truncate_pdf_specific(file_path,output_folder)
+    files=truncate_pdf_specific_goods(file_path,output_folder)
    baseinfo_list=get_goods_baseinfo(files[-1])
    aggregated_baseinfo = aggregate_basic_info_goods(baseinfo_list)
    return {"基础信息": aggregated_baseinfo}
-def little_parse_engineering(output_folder,downloaded_filepath):
+def little_parse_engineering(output_folder,file_path):
+    files=truncate_pdf_specific_engineering(file_path,output_folder)
+
    return True

 def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id):
@ -81,6 +84,7 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id):
        """
    global logger
    logger = get_global_logger(unique_id)
+    logger.info("zb_type:"+str(zb_type))
    # 根据文件类型处理文件路径
    if file_type == 1:  # docx
        docx_path = file_path
@ -119,8 +123,12 @@ if __name__ == "__main__":
    file_type = 2  # 1:docx 2:pdf 3:其他
    zb_type=2   #1:工程标 2：货物标
    input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
-    res=little_parse_main(output_folder, input_file, file_type, zb_type,"122334")
-    print(json.dumps(res, ensure_ascii=False, indent=4))
+    final_json_path=little_parse_main(output_folder, input_file, file_type, zb_type,"122334")
+    with open(final_json_path, 'r', encoding='utf-8') as f:
+        logger.info('final_json_path:' + final_json_path)
+        zbparse_data = json.load(f)
+        json_str = json.dumps(zbparse_data, ensure_ascii=False)
+        print(json_str)
    end_time = time.time()
    elapsed_time = end_time - start_time  # 计算耗时
    print(f"Function execution took {elapsed_time} seconds.")
--- a/flask_app/main/merge_pdfs.py
+++ b/flask_app/main/merge_pdfs.py
@ -0,0 +1,58 @@
+import os
+
+from PyPDF2 import PdfReader, PdfWriter
+#合并PDF
+def merge_pdfs(paths, output_path):
+    pdf_writer = PdfWriter()
+    last_page_text = None  # 用于存储上一个PDF的最后一页的文本
+
+    for path in paths:
+        pdf_reader = PdfReader(path)
+        pages = pdf_reader.pages
+        start_index = 0  # 从第一页开始添加
+
+        # 如果这不是第一个文件，并且有上一个文件的最后一页文本
+        if last_page_text is not None and len(pages) > 0:
+            current_first_page_text = pages[0].extract_text() if pages[0].extract_text() else ""
+            # 比较当前文件的第一页和上一个文件的最后一页的文本
+            if current_first_page_text == last_page_text:
+                start_index = 1  # 如果相同，跳过当前文件的第一页
+
+        # 添加当前PDF的页面到写入器
+        for page in range(start_index, len(pages)):
+            pdf_writer.add_page(pages[page])
+
+        # 更新last_page_text为当前PDF的最后一页的文本
+        if len(pages) > 0:
+            last_page_text = pages[-1].extract_text() if pages[-1].extract_text() else ""
+
+    # 写入合并后的PDF到文件
+    with open(output_path, 'wb') as out:
+        pdf_writer.write(out)
+
+def judge_file_exist(original_path, new_suffix):
+    # 提取目录路径和原始文件名
+    directory = os.path.dirname(original_path)
+    original_filename = os.path.basename(original_path)
+
+    # 替换文件名中的旧后缀为新后缀
+    # 假设原始文件名格式为 '2-招标文件_qualification.pdf'
+    # 需要替换 '_qualification' 部分为 '_qualification2'
+    new_filename = original_filename.replace("_qualification1", f"_{new_suffix}")
+    new_filename = new_filename.replace("_qualification2", f"_{new_suffix}")
+    # 生成新的文件路径
+    new_file_path = os.path.join(directory, new_filename)
+
+    # 检查新文件是否存在
+    if os.path.isfile(new_file_path):
+        return new_file_path
+    else:
+        return None
+
+def merge_and_cleanup(output_pdf_path, suffix_to_merge):
+    another_file_path = judge_file_exist(output_pdf_path, suffix_to_merge)
+    if another_file_path:
+        paths = [output_pdf_path, another_file_path]  # 需要合并的 PDF 文件路径
+        merge_pdfs(paths, output_pdf_path)
+        os.remove(another_file_path)
+        print(f"文件 {another_file_path} 已删除。")
--- a/flask_app/main/start_up.py
+++ b/flask_app/main/start_up.py
@ -177,7 +177,7 @@ def generate_response(final_json_path):
        zbparse_data = json.load(f)
        json_str = json.dumps(zbparse_data, ensure_ascii=False)
    return jsonify({
-        'message': 'File uploaded and processed successfully',
+        'message': 'Little Parse processed successfully',
        'filename': os.path.basename(final_json_path),
        'data': json_str
    })
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -1,6 +1,7 @@
 from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
+from flask_app.main.merge_pdfs import merge_pdfs
 def clean_page_content(text, common_header):
    # 首先删除抬头公共部分
    if common_header:  # 确保有公共抬头才进行替换
@ -52,31 +53,63 @@ def extract_common_header(pdf_path):

    return '\n'.join(common_headers)

+
 def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
-    """从原始PDF截取指定范围的页面并保存到新的PDF文件中"""
-    # 获取文件基本名称
-    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
-    # 构建输出文件路径
-    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
+    """
+    从原始PDF截取指定范围的页面并保存到新的PDF文件中。
+    如果 output_suffix 是 'notice'，则额外保存 start_page 之前的页面。

-    # 读取PDF文件
-    pdf_document = PdfReader(pdf_path)
-    output_doc = PdfWriter()
+    参数:
+        pdf_path (str): 原始PDF文件路径。
+        output_folder (str): 输出文件夹路径。
+        output_suffix (str): 输出文件的后缀，用于区分不同的提取。
+        start_page (int): 起始页码（0基）。
+        end_page (int): 结束页码（0基）。

-    # 检查起始和结束页码是否有效
-    if start_page is not None and end_page is not None and start_page <= end_page:
-        # 添加指定范围的页面到新的PDF文档中
+    返回:
+        str: 保存的PDF文件路径。如果提取失败，返回空字符串。
+    """
+    try:
+        # 获取文件基本名称
+        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+        # 构建主要输出文件路径
+        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
+
+        # 读取PDF文件
+        pdf_document = PdfReader(pdf_path)
+        total_pages = len(pdf_document.pages)
+
+        # 检查起始和结束页码是否有效
+        if start_page < 0 or end_page >= total_pages or start_page > end_page:
+            print(f"无效的页面范围: {start_page} 到 {end_page}")
+            return ""
+
+        # 如果 output_suffix 是 'notice'，保存 start_page 之前的页面
+        if output_suffix == 'notice' and start_page > 0:
+            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
+            before_doc = PdfWriter()
+            for page_num in range(0, start_page):
+                before_doc.add_page(pdf_document.pages[page_num])
+            with open(before_pdf_path, 'wb') as f_before:
+                before_doc.write(f_before)
+            print(f"已保存页面从 0 到 {start_page - 1} 为 {before_pdf_path}")
+
+        # 提取指定范围的页面
+        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])

        # 保存新的PDF文件
-        with open(output_pdf_path, 'wb') as f:
-            output_doc.write(f)
+        with open(output_pdf_path, 'wb') as f_output:
+            output_doc.write(f_output)

-        print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
-    else:
-        print("提供的页码范围无效。")
-    return output_pdf_path
+        print(f"{output_suffix} 已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
+        return output_pdf_path
+
+    except Exception as e:
+        print(f"Error in save_pages_to_new_pdf: {e}")
+        return ""  # 返回空字符串


 def extract_pages_twice(pdf_path, output_folder, output_suffix):
@ -260,14 +293,116 @@ def truncate_pdf_multiple(input_path, output_folder):
        truncate_files.extend(files)
    return truncate_files

+def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name):
+    """
+    合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件，
+    以及 truncate_files 中以指定后缀结尾的文件，按照指定顺序合并。
+
+    参数：
+    - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。
+    - truncate_files (list): 包含 PDF 文件路径的列表。
+    - output_path (str): 合并后的 PDF 文件保存路径。
+    - base_file_name (str): 用于匹配文件名的基础名称。
+    """
+    # 1. 获取 output_folder 中所有文件
+    try:
+        all_output_files = os.listdir(output_folder)
+    except FileNotFoundError:
+        print(f"输出文件夹 '{output_folder}' 未找到。")
+        return
+    except PermissionError:
+        print(f"没有权限访问输出文件夹 '{output_folder}'。")
+        return
+
+    # 2. 定义要选择的文件后缀及合并顺序，包括 before 文件
+    desired_suffixes = [
+        f'{base_file_name}_before.pdf',
+        f'{base_file_name}_notice.pdf',
+        f'{base_file_name}_tobidders_notice_table.pdf',
+        f'{base_file_name}_tobidders_notice.pdf'
+    ]
+
+    all_pdfs_to_merge = []
+
+    for suffix in desired_suffixes:
+        if suffix == f'{base_file_name}_before.pdf':
+            # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
+            matching_files = [
+                os.path.join(output_folder, f)
+                for f in all_output_files
+                if f.endswith(suffix)
+            ]
+        else:
+            # 从 truncate_files 中选择以指定后缀结尾的文件
+            matching_files = [f for f in truncate_files if f.endswith(suffix)]
+
+        if matching_files:
+            # 如果找到多个匹配的文件，按名称排序并添加
+            matching_files_sorted = sorted(matching_files)
+            all_pdfs_to_merge.extend(matching_files_sorted)
+            for f in matching_files_sorted:
+                print(f"选中文件: {f}")
+        else:
+            print(f"没有找到以 '{suffix}' 结尾的文件。")
+
+    print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
+
+    if not all_pdfs_to_merge:
+        print("没有找到要合并的 PDF 文件。")
+        return
+
+    # 调用 merge_pdfs 函数进行合并
+    merge_pdfs(all_pdfs_to_merge, output_path)
+    print(f"已成功合并 PDF 文件到 '{output_path}'。")
+
+def truncate_pdf_specific_engineering(pdf_path, output_folder):
+    """
+    处理 PDF 文件，选择 selection 为 5、1 和 3 的部分，并合并结果。
+
+    Args:
+        pdf_path (str): 要处理的 PDF 文件路径。
+        output_folder (str): 截取后的文件保存文件夹路径。
+
+    Returns:
+        list: 截取的文件路径列表，包括合并后的文件路径（如果有）。
+    """
+    try:
+        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
+        truncate_files = []
+        selections = [5, 1, 3]  # 仅处理 selection 5、1 和 3
+
+        for selection in selections:
+            files = truncate_pdf_main(pdf_path, output_folder, selection)
+            if files:
+                if isinstance(files, list):
+                    truncate_files.extend(files)
+                elif isinstance(files, str):
+                    truncate_files.append(files)
+
+        if truncate_files:
+            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
+            merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
+            truncate_files.append(merged_output_path)
+            print(f"已生成合并文件: {merged_output_path}")
+        else:
+            print(f"没有文件需要合并 for {pdf_path}")
+
+        return truncate_files
+
+    except Exception as e:
+        print(f"Error in truncate_pdf_specific_two: {e}")
+        return []  # 返回空列表表示失败
+

 # TODO:需要完善二次请求。目前invalid一定能返回 前附表  须知正文如果为空的话要额外处理一下，比如说就不进行跳转（见xx表） 开评定标这里也要考虑  如果评分表为空，也要处理。
 if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.pdf"
-    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件"
-    truncate_pdf_multiple(input_path,output_folder)
+    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output"
+    files=truncate_pdf_multiple(input_path,output_folder)
+    # files=truncate_pdf_specific(input_path,output_folder)
+    print(files)
    # selection = 3 # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # # print("生成的文件:", generated_files)
--- a/flask_app/货物标/货物标截取pdf.py
+++ b/flask_app/货物标/货物标截取pdf.py
@ -2,7 +2,7 @@ from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
 from flask_app.main.format_change import docx2pdf
-
+from flask_app.main.merge_pdfs import merge_and_cleanup,merge_pdfs

 def clean_page_content(text, common_header):
    # 首先删除抬头公共部分
@ -102,65 +102,6 @@ def convert_to_pdf(file_path):
        return docx2pdf(file_path)
    return file_path

-
-def judge_file_exist(original_path, new_suffix):
-    # 提取目录路径和原始文件名
-    directory = os.path.dirname(original_path)
-    original_filename = os.path.basename(original_path)
-
-    # 替换文件名中的旧后缀为新后缀
-    # 假设原始文件名格式为 '2-招标文件_qualification.pdf'
-    # 需要替换 '_qualification' 部分为 '_qualification2'
-    new_filename = original_filename.replace("_qualification1", f"_{new_suffix}")
-    new_filename = new_filename.replace("_qualification2", f"_{new_suffix}")
-    # 生成新的文件路径
-    new_file_path = os.path.join(directory, new_filename)
-
-    # 检查新文件是否存在
-    if os.path.isfile(new_file_path):
-        return new_file_path
-    else:
-        return None
-
-#合并PDF
-def merge_pdfs(paths, output_path):
-    pdf_writer = PdfWriter()
-    last_page_text = None  # 用于存储上一个PDF的最后一页的文本
-
-    for path in paths:
-        pdf_reader = PdfReader(path)
-        pages = pdf_reader.pages
-        start_index = 0  # 从第一页开始添加
-
-        # 如果这不是第一个文件，并且有上一个文件的最后一页文本
-        if last_page_text is not None and len(pages) > 0:
-            current_first_page_text = pages[0].extract_text() if pages[0].extract_text() else ""
-            # 比较当前文件的第一页和上一个文件的最后一页的文本
-            if current_first_page_text == last_page_text:
-                start_index = 1  # 如果相同，跳过当前文件的第一页
-
-        # 添加当前PDF的页面到写入器
-        for page in range(start_index, len(pages)):
-            pdf_writer.add_page(pages[page])
-
-        # 更新last_page_text为当前PDF的最后一页的文本
-        if len(pages) > 0:
-            last_page_text = pages[-1].extract_text() if pages[-1].extract_text() else ""
-
-    # 写入合并后的PDF到文件
-    with open(output_path, 'wb') as out:
-        pdf_writer.write(out)
-
-
-def merge_and_cleanup(output_pdf_path, suffix_to_merge):
-    another_file_path = judge_file_exist(output_pdf_path, suffix_to_merge)
-    if another_file_path:
-        paths = [output_pdf_path, another_file_path]  # 需要合并的 PDF 文件路径
-        merge_pdfs(paths, output_pdf_path)
-        os.remove(another_file_path)
-        print(f"文件 {another_file_path} 已删除。")
-
-
 def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    pdf_path = convert_to_pdf(file_path)
    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
@ -597,7 +538,7 @@ def truncate_pdf_multiple(pdf_path, output_folder):
    return truncate_files

 #小解析，只需要前三章内容
-def truncate_pdf_specific(pdf_path, output_folder):
+def truncate_pdf_specific_goods(pdf_path, output_folder):
    """
    处理 PDF 文件，选择 selection 为 4 和 5 的部分，并合并结果。

@ -636,7 +577,7 @@ if __name__ == "__main__":
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1).pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all"
    # files = truncate_pdf_multiple(input_path, output_folder)
-    files=truncate_pdf_specific(input_path,output_folder)
+    files=truncate_pdf_specific_goods(input_path,output_folder)
    print(files[-1])
    # selection = 1  # 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-公告
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@ -211,6 +211,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
    # deleteKnowledge(index)

 #TODO:目前的无效标这块的键值都删去空格了，所有的键名都删去空格
+#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
 if __name__ == "__main__":
    output_folder = "flask_app/static/output/zytest1"