zy123 2024-09-13 15:03:55 +08:00
parent 010eff9405
commit 79834efb5e
27 changed files with 534 additions and 384 deletions

View File

@@ -2,8 +2,6 @@ from docx import Document
 import re
 import os
-from flask import g
-

 def copy_docx(source_path):
     doc = Document(source_path)  # 打开源文档
@@ -45,7 +43,7 @@ def copy_docx(source_path):
             break
     new_doc.save(destination_path)  # 保存新文档
-    g.logger.info("docx截取docx成功")
+    print("docx截取docx成功")
 # 调用函数

View File

@@ -1,7 +1,6 @@
 import requests
 import mimetypes
-from flask import g

 def download_file(url, local_filename):
@@ -29,13 +28,13 @@ def download_file(url, local_filename):
         else:
             return full_filename, 3
     except requests.HTTPError as e:
-        g.logger.error(f"download: HTTP Error: {e}")
+        print(f"download: HTTP Error: {e}")
         return None
     except requests.RequestException as e:
-        g.logger.error(f"download: Error downloading the file: {e}")
+        print(f"download: Error downloading the file: {e}")
         return None
     except Exception as e:
-        g.logger.error(f"download: An error occurred: {e}")
+        print(f"download: An error occurred: {e}")
        return None

 if __name__ == '__main__':

View File

@@ -1,7 +1,6 @@
 import json
 import os
 import requests
-from flask import g
 from flask_app.main.download import download_file
@@ -21,14 +20,14 @@ def upload_file(file_path, url):
     # 检查响应状态码
     if response.status_code == 200:
-        g.logger.info("format_change 文件上传成功")
+        print("format_change 文件上传成功")
         receive_file_response = response.content.decode('utf-8')
         receive_file_json = json.loads(receive_file_response)
         receive_file_url = receive_file_json["data"]
     else:
-        g.logger.info(f"format_change 文件上传失败,状态码: {response.status_code}")
-        g.logger.info(f"format_change {response.text}")
+        print(f"format_change 文件上传失败,状态码: {response.status_code}")
+        print(f"format_change {response.text}")
     return receive_file_url
@@ -46,7 +45,7 @@ def pdf2docx(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
     local_path_out = os.path.join(folder, filename)  # 输出文件名
     downloaded_filepath, file_type = download_file(receive_download_url, local_path_out)
-    g.logger.info("format_change p2d:have downloaded file to:", downloaded_filepath)
+    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
     return downloaded_filepath

 def docx2pdf(local_path_in):
@@ -55,7 +54,7 @@ def docx2pdf(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
     local_path_out = os.path.join(folder, filename)  # 输出文件名
     downloaded_filepath, file_type = download_file(receive_download_url, local_path_out)
-    g.logger.info("format_change d2p:have downloaded file to:", downloaded_filepath)
+    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
     return downloaded_filepath

 if __name__ == '__main__':
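
Reviewer note: the two hunks above fix a latent bug as a side effect. The old `g.logger.info("...:", downloaded_filepath)` passed the path as a second positional argument, which `logging` treats as a %-format argument; with no `%s` in the message, formatting fails at emit time and the record is lost with a "--- Logging error ---" traceback. A standalone sketch of the difference (not project code):

import logging
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger("demo")

path = "out/ztbfile.docx"
# Old style: the extra positional is a %-format arg; with no %s in the
# message this raises at emit time ("--- Logging error ---", record lost).
log.info("have downloaded file to:", path)
# Correct alternatives:
log.info("have downloaded file to: %s", path)   # lazy %-formatting
log.info(f"have downloaded file to: {path}")    # what the commit switches to, via print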

View File

@@ -1,9 +1,6 @@
 import json
 import re
-from flask import g

 def extract_content_from_json(json_data):
     """提取 { 和 } 之间的内容,并将其解析为字典"""
     if not json_data.strip():
@@ -14,10 +11,10 @@ def extract_content_from_json(json_data):
             json_data = match.group(0)
             return json.loads(json_data)  # 返回字典
         except json.JSONDecodeError as e:
-            g.logger.info(f"json_utils: extract_content_from_json: JSON decode error: {e}")
+            print(f"json_utils: extract_content_from_json: JSON decode error: {e}")
             return {}
     else:
-        g.logger.info("json_utils: extract_content_from_json: No valid JSON content found.")
+        print("json_utils: extract_content_from_json: No valid JSON content found.")
         return {}

 def clean_json_string(json_string):
@@ -66,18 +63,18 @@ def add_keys_to_json(target_dict, source_dict):
         dict: 更新后的字典
     """
     if not target_dict:
-        g.logger.error("json_utils: Error: Target dictionary is empty.")
+        print("json_utils: Error: Target dictionary is empty.")
         return {}
     if len(target_dict) != 1:
-        g.logger.error("json_utils: Error: Target dictionary must contain exactly one top-level key.")
+        print("json_utils: Error: Target dictionary must contain exactly one top-level key.")
         return target_dict
     # 获取唯一的外层键
     target_key, existing_dict = next(iter(target_dict.items()))
     if not isinstance(existing_dict, dict):
-        g.logger.error(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
+        print(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
         return target_dict
     # 合并字典
@@ -95,7 +92,7 @@ def rename_outer_key(original_data, new_key):
     # 提取原始数据中的唯一外层值(假设只有一个外层键)
     if not original_data or not isinstance(original_data, dict):
-        g.logger.error("json_utils: Error: Invalid input or input is not a dictionary.")  # 如果输入无效或不是字典,则返回空字典
+        print("json_utils: Error: Invalid input or input is not a dictionary.")  # 如果输入无效或不是字典,则返回空字典
         return {}
     # 使用 next(iter(...)) 提取第一个键的值
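
From the guards in the hunk above, `add_keys_to_json` expects a target shaped like `{"外层键": {...}}` and merges `source_dict` into that single inner dict. A runnable sketch of the inferred behaviour; the final `update` step is an assumption read off the `# 合并字典` comment, not code visible in this diff:

def add_keys_to_json_sketch(target_dict, source_dict):
    # Mirrors the guards shown in the diff; the merge itself is inferred.
    if not target_dict:
        print("json_utils: Error: Target dictionary is empty.")
        return {}
    if len(target_dict) != 1:
        print("json_utils: Error: Target dictionary must contain exactly one top-level key.")
        return target_dict
    target_key, existing_dict = next(iter(target_dict.items()))
    if not isinstance(existing_dict, dict):
        print(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
        return target_dict
    existing_dict.update(source_dict)  # 合并字典 (assumed)
    return target_dict

print(add_keys_to_json_sketch(
    {"资格评审": {"资质条件": "具备有效营业执照"}},
    {"联合体投标": "不接受"},
))
# -> {'资格评审': {'资质条件': '具备有效营业执照', '联合体投标': '不接受'}}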

View File

@@ -1,6 +1,5 @@
 import logging
 import shutil
-import sys
 import time
 import uuid
 from datetime import datetime, timedelta
@@ -40,7 +39,8 @@ def before_request():
 def create_logger():
     unique_id = str(uuid.uuid4())
     g.unique_id = unique_id
-    output_folder = f"flask_app/static/output/{unique_id}"
+    # output_folder = f"flask_app/static/output/{unique_id}"
+    output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}"
     os.makedirs(output_folder, exist_ok=True)
     log_filename = "log.txt"
     log_path = os.path.join(output_folder, log_filename)
@@ -59,11 +59,12 @@ def create_logger():
 @app.route('/upload', methods=['POST'])
 def zbparse():
+    logger = g.logger
     file_url = validate_request()
     if isinstance(file_url, tuple):  # Check if the returned value is an error response
         return file_url
     try:
-        app.logger.info("starting parsing url:" + file_url)
+        logger.info("starting parsing url:" + file_url)
         final_json_path, output_folder = download_and_process_file(file_url)
         if not final_json_path:
             return jsonify({'error': 'File processing failed'}), 500
@@ -71,7 +72,7 @@ def zbparse():
         # remove_directory(output_folder)  # 然后删除文件夹
         return response  # 最后返回获取的响应
     except Exception as e:
-        app.logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
+        logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
         return jsonify({'error': str(e)}), 500
@@ -138,7 +139,8 @@ def validate_request():
 def download_and_process_file(file_url):
     logger = g.logger
     unique_id = g.unique_id
-    output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
+    # output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
+    output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}"
     filename = "ztbfile"
     downloaded_filename = os.path.join(output_folder, filename)
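
For context on the `logger = g.logger` line above: `create_logger` runs in `before_request`, attaches a logger keyed by the request's `unique_id` to Flask's `g`, and writes to a per-request log file. A minimal self-contained sketch of that pattern; the formatter and file layout are assumptions, not the project's exact code:

import logging
import os

def get_global_logger(unique_id=None):
    # logging.getLogger caches by name, so any module asking for the same
    # unique_id gets the very same logger object back.
    return logging.getLogger(unique_id) if unique_id else logging.getLogger()

def create_logger(unique_id, output_folder):
    logger = get_global_logger(unique_id)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # avoid stacking duplicate handlers per request
        os.makedirs(output_folder, exist_ok=True)
        handler = logging.FileHandler(os.path.join(output_folder, "log.txt"), encoding="utf-8")
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(handler)
    return logger

logger = create_logger("demo-uuid", "output/demo-uuid")
logger.info("starting 文件预处理...")  # lands in output/demo-uuid/log.txt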

View File

@@ -3,7 +3,6 @@ import os
 from docx import Document
 import json
-from flask import g

 def read_tables_from_docx(file_path):
@@ -89,13 +88,13 @@ def save_data_to_json(data, output_folder):
     """将数据保存到JSON文件中."""
     with open(output_filepath, 'w', encoding='utf-8') as file:
         json.dump(data, file, ensure_ascii=False, indent=4)
-    g.logger.info(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
+    print(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
     return output_filepath

 def extract_tables_main(path, output_folder):
     if not os.path.exists(path):
-        g.logger.error(f"table_content_extraction: The specified file does not exist: {path}")
+        print(f"table_content_extraction: The specified file does not exist: {path}")
         return ""
     # 读取文档表格数据
     table_data = read_tables_from_docx(path)

View File

@@ -1,7 +1,5 @@
 import json
-from flask import g
 from flask_app.main.json_utils import clean_json_string, nest_json_under_key, rename_outer_key, combine_json_results
 from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
 from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
@@ -32,9 +30,6 @@ def combine_basic_info(baseinfo_list):
     # 根据检测到的键动态调整 key_groups
     dynamic_key_handling(key_groups, relevant_keys_detected)
-    # 打印 key_groups 的内容,检查它们是否被正确更新
-    # g.logger.info("Updated key_groups after dynamic handling:")
-

     # 使用合并后的字典创建最终输出
     for group_name, keys in key_groups.items():
@@ -82,8 +77,7 @@ def judge_consortium_bidding(baseinfo_list):
 def project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # 投标人须知前附表
     # 调用大模型回答项目基础信息
     baseinfo_list = []
-    # baseinfo_file_path = '../static/提示词/前两章提问总结.txt'
-    baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'  # 替换为你的txt文件路径
+    baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'
     questions = read_questions_from_file(baseinfo_file_path)
     res1 = multi_threading(questions, knowledge_name)
     for _, response in res1:  # _占位代表ques;response[0]也是ques;response[1]是ans
@@ -91,13 +85,12 @@ def project_basic_info(knowledge_name, truncate0, output_folder, clause_path):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 baseinfo_list.append(response[1])
             else:
-                g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}")
+            print(f"基础信息整合: Error processing response for query index {_}: {e}")
     # 判断是否分包、是否需要递交投标保证金等
     chosen_numbers, merged = judge_whether_main(truncate0, output_folder)
     baseinfo_list.append(merged)
-    # judge_file_path = '../static/提示词/是否相关问题.txt'
     judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'
     judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
@@ -109,7 +102,7 @@ def project_basic_info(knowledge_name, truncate0, output_folder, clause_path):
     file_id = upload_file(truncate0)
     res2 = multi_threading(judge_questions, "", file_id, 2)  # 调用千问-long
     if not res2:
-        g.logger.error("基础信息整合: multi_threading error!")
+        print("基础信息整合: multi_threading error!")
     else:
         # 打印结果
         for question, response in res2:

View File

@@ -6,7 +6,6 @@ import concurrent.futures
 import time
 from dashscope import Assistants, Messages, Runs, Threads
-from flask import g
 from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
 from flask_app.main.通义千问long import qianwen_long, upload_file

 prompt = """
@@ -118,10 +117,10 @@ def pure_assistant():
 def llm_call(question, knowledge_name, file_id, result_queue, ans_index, llm_type):
     if llm_type == 1:
-        g.logger.info(f"rag_assistant! question:{question}")
+        print(f"rag_assistant! question:{question}")
         assistant = rag_assistant(knowledge_name)
     elif llm_type == 2:
-        g.logger.info(f"qianwen_long! question:{question}")
+        print(f"qianwen_long! question:{question}")
         qianwen_res = qianwen_long(file_id, question)
         result_queue.put((ans_index, (question, qianwen_res)))
         return
@@ -131,7 +130,7 @@ def llm_call(question, knowledge_name, file_id, result_queue, ans_index, llm_type):
     result_queue.put((ans_index, (question, ans)))  # 在队列中添加索引 (question, ans)

 def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
-    g.logger.info("多线程提问starting multi_threading...")
+    print("多线程提问starting multi_threading...")
     result_queue = queue.Queue()
     # 使用 ThreadPoolExecutor 管理线程
@@ -150,7 +149,7 @@ def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
         try:
             future.result()  # 可以用来捕获异常或确认任务完成
         except Exception as exc:
-            g.logger.error(f"Query {index} generated an exception: {exc}")
+            print(f"Query {index} generated an exception: {exc}")
     # 从队列中获取所有结果并按索引排序
     results = [None] * len(queries)
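
The hunks above show the fan-out pattern this module uses: each worker pushes `(ans_index, (question, answer))` into a shared queue, and the caller rebuilds the answers in submission order regardless of completion order. A stripped-down sketch of that ordering trick, with a stub in place of the real model calls:

import queue
from concurrent.futures import ThreadPoolExecutor

def llm_call_stub(question, result_queue, ans_index):
    # Stand-in for rag_assistant / qianwen_long; just echoes the question.
    result_queue.put((ans_index, (question, f"answer to: {question}")))

def multi_threading_sketch(queries):
    result_queue = queue.Queue()
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(llm_call_stub, q, result_queue, i)
                   for i, q in enumerate(queries)]
        for future in futures:
            future.result()  # surfaces any worker exception
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result  # completion order may differ; the index restores it
    return results

print(multi_threading_sketch(["q1", "q2", "q3"]))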

View File

@@ -3,8 +3,6 @@ import re
 import json
 import time
-from flask import g

 from flask_app.main.多线程提问 import multi_threading
 from flask_app.main.根据条款号整合json import process_and_merge_entries, process_and_merge2
 from flask_app.main.json_utils import extract_content_from_json
@@ -189,9 +187,9 @@ def process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_path):
             temp = extract_content_from_json(response[1])
             first_response_list.append(temp)
         else:
-            g.logger.error(f"形式响应评审Warning: Missing or incomplete response data for query index {_}.")
+            print(f"形式响应评审Warning: Missing or incomplete response data for query index {_}.")
     except Exception as e:
-        g.logger.error(f"形式响应评审Error processing response for query index {_}: {e}")
+        print(f"形式响应评审Error processing response for query index {_}: {e}")
     # Assume JSON file paths are defined or configured correctly
     # print(entries_with_numbers)  # [{'形式评审标准.多标段投标': '3.7.45'}]

View File

@@ -2,43 +2,47 @@ from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
-from flask import g
+
+def clean_page_content(text, common_header):
+    # 首先删除抬头公共部分
+    if common_header:  # 确保有公共抬头才进行替换
+        for header_line in common_header.split('\n'):
+            if header_line.strip():  # 只处理非空行
+                # 替换首次出现的完整行
+                text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
+    # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
+    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码,仅当紧跟非数字字符时
+    text = re.sub(r'\s+\d+\s*$', '', text)  # 删除结尾的页码
+    text = re.sub(r'\s*\/\s*\d+\s*', '', text)  # 删除形如 /129 的页码
+    return text

 def extract_common_header(pdf_path):
     pdf_document = PdfReader(pdf_path)
     headers = []
-    num_pages_to_read = 3  # 预读页数
-    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
+    start_page = 4  # 从第5页开始读取(索引为4)
+    num_pages_to_read = 3  # 连续读取3页
+    # 确保从第5页开始且总页数足够
+    for i in range(start_page, min(start_page + num_pages_to_read, len(pdf_document.pages))):
         page = pdf_document.pages[i]
-        text = page.extract_text()
-        if text:  # 确保页面有文本内容
-            first_line = text.strip().split('\n')[0]
-            headers.append(first_line)
+        text = page.extract_text() or ""
+        if text:
+            # 只取每页的前三行
+            first_lines = text.strip().split('\n')[:3]
+            headers.append(first_lines)
     if len(headers) < 2:
         return ""  # 如果没有足够的页来比较,返回空字符串
-    # 使用set交集来找出公共部分
-    common_header = set(headers[0].split()).intersection(*[set(header.split()) for header in headers[1:]])
-    common_header = ' '.join(common_header)
-    return common_header
+    # 寻找每一行中的公共部分
+    common_headers = []
+    for lines in zip(*headers):
+        # 在每一行中寻找公共单词
+        common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
+        if common_line:
+            common_headers.append(' '.join(common_line))
+    return '\n'.join(common_headers)

-def clean_page_content(text, common_header):
-    # 首先删除抬头公共部分
-    if common_header:  # 确保有公共抬头才进行替换
-        cleaned_text = text.replace(common_header, '', 1)  # 假设抬头出现在文本开头,只替换一次
-    else:
-        cleaned_text = text
-    # 删除页码
-    cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', cleaned_text)  # 删除开头的页码,仅当紧跟非数字字符时
-    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)  # 删除结尾的页码
-    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)  # 删除形如 /129 的页码
-    return cleaned_text

 def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
@@ -62,16 +66,16 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
     with open(output_pdf_path, 'wb') as f:
         output_doc.write(f)
-        g.logger.info(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
+        print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
     else:
-        g.logger.error("提供的页码范围无效。")
+        print("提供的页码范围无效。")
     return output_pdf_path

 def extract_pages_twice(pdf_path, output_folder, output_suffix):
     common_header = extract_common_header(pdf_path)
     last_begin_index = 0
-    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
+    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书')
     pdf_document = PdfReader(pdf_path)
     for i, page in enumerate(pdf_document.pages):
         text = page.extract_text()
@@ -108,7 +112,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
             break  # 找到结束页后退出循环
     if start_page is None or end_page is None:
-        g.logger.error(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
+        print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
         return ""
     else:
         return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@@ -157,7 +161,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
         if output_suffix == "qualification" or output_suffix == "invalid":
             return extract_pages_twice(pdf_path, output_folder, output_suffix)
         else:
-            g.logger.error(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
+            print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
             return ""
     else:
         return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@@ -185,7 +189,7 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
     if output_pdf_path and os.path.isfile(output_pdf_path):
         return [output_pdf_path]  # 以列表形式返回,以保持一致性
     else:
-        g.logger.error("提供的路径既不是文件夹也不是PDF文件。")
+        print("提供的路径既不是文件夹也不是PDF文件。")
         return []
@@ -224,7 +228,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
         output_suffix = "qualification"
     elif selection == 5:
         # 配置用于 "招标公告" 的正则表达式模式和短语
-        begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
+        begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书')
         begin_page = 0
         end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE)
         output_suffix = "notice"
@@ -235,7 +239,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
         end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|第二卷', re.MULTILINE)
         output_suffix = "invalid"
     else:
-        g.logger.error("无效的选择:请选择1-6")
+        print("无效的选择:请选择1-6")
         return None

 # Process the selected input
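
The rewritten `extract_common_header` above compares the first three lines of pages 5-7 line by line, instead of set-intersecting only the first line of pages 1-3, so multi-line headers survive and cover pages no longer pollute the result. A quick illustration of the per-line intersection on invented page headers (no PDF needed):

# Three pages' first-three-lines, as the rewritten function would collect them.
headers = [
    ["某某市政务服务中心 招标文件", "项目编号 ZB-2024-001", "第 12 页"],
    ["某某市政务服务中心 招标文件", "项目编号 ZB-2024-001", "第 13 页"],
    ["某某市政务服务中心 招标文件", "项目编号 ZB-2024-001", "第 14 页"],
]
common_headers = []
for lines in zip(*headers):
    common_line = set(lines[0].split()).intersection(*[set(l.split()) for l in lines[1:]])
    if common_line:
        common_headers.append(' '.join(common_line))
print('\n'.join(common_headers))
# Lines 1-2 intersect fully; line 3 still shares "第"/"页" across pages, a known
# limit of word-set intersection (it ignores order and keeps incidental overlaps).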

View File

@@ -4,8 +4,6 @@ import fitz
 import re
 import os
-from flask import g

 def extract_text_from_docx(file_path):
     doc = docx.Document(file_path)
@@ -129,7 +127,7 @@ def convert_to_json(file_path, start_word, end_phrases):
 def convert_clause_to_json(input_path, output_folder, type=1):
     if not os.path.exists(input_path):
-        g.logger.error(f"The specified file does not exist: {input_path}")
+        print(f"The specified file does not exist: {input_path}")
         return ""
     if type == 1:
         start_word = "投标人须知正文"

View File

@@ -4,8 +4,6 @@ import logging
 import os
 import time
-from flask import g

 from flask_app.main.截取pdf import truncate_pdf_multiple
 from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
@@ -20,16 +18,17 @@ from flask_app.main.商务标技术标整合 import combine_evaluation_standards
 from flask_app.main.format_change import pdf2docx, docx2pdf
 from flask_app.main.docx截取docx import copy_docx

-# def get_global_logger(unique_id):
-#     if unique_id is None:
-#         return logging.getLogger()  # 获取默认的日志器
-#     logger = logging.getLogger(unique_id)
-#     return logger
+def get_global_logger(unique_id):
+    if unique_id is None:
+        return logging.getLogger()  # 获取默认的日志器
+    logger = logging.getLogger(unique_id)
+    return logger
+
+logger = None

 # 可能有问题:pdf转docx导致打勾符号消失
 def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
-    g.logger.info("starting 文件预处理...")
+    logger.info("starting 文件预处理...")
     # 根据文件类型处理文件路径
     if file_type == 1:  # docx
         docx_path = downloaded_file_path
@@ -39,7 +38,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
         docx_path = pdf2docx(pdf_path)  # 将pdf转换为docx以供上传到知识库
     else:
         # 如果文件类型不是预期中的1或2,记录错误并返回None
-        g.logger.error("Unsupported file type provided. Preprocessing halted.")
+        logger.error("Unsupported file type provided. Preprocessing halted.")
         return None

     # 上传知识库
@@ -59,7 +58,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
     truncate1 = truncate_files[1]  # 评标办法前附表
     truncate3 = truncate_files[3]  # 资格审查表
     clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # 投标人须知正文条款pdf->json
-    g.logger.info("文件预处理done")
+    logger.info("文件预处理done")

     return {
         'input_file_path': downloaded_file_path,
@@ -94,59 +93,59 @@ def post_processing(data, includes):
     return result

 # 基本信息
 def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # 投标人须知前附表
-    g.logger.info("starting基础信息...")
+    logger.info("starting基础信息...")
     basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path)
-    g.logger.info("基础信息done")
+    logger.info("基础信息done")
     return basic_res

 # 形式、响应、资格评审
 def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder):
-    g.logger.info("starting资格审查...")
+    logger.info("starting资格审查...")
     review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath,
                                                     clause_path, input_file, output_folder)
-    g.logger.info("资格审查done")
+    logger.info("资格审查done")
     return review_standards_res

 # 评分细则
 def fetch_evaluation_standards(truncate1):  # 评标办法前附表
-    g.logger.info("starting商务标技术标...")
+    logger.info("starting商务标技术标...")
     evaluation_standards_res = combine_evaluation_standards(truncate1)
-    g.logger.info("商务标技术标done")
+    logger.info("商务标技术标done")
     return evaluation_standards_res

 # 无效、废标项解析
 def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
     # 废标项要求:千问
-    g.logger.info("starting无效标与废标...")
+    logger.info("starting无效标与废标...")
     find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
-    g.logger.info("无效标与废标done...")
+    logger.info("无效标与废标done...")
     return find_invalid_res

 # 投标文件要求
 def fetch_bidding_documents_requirements(clause_path):
-    g.logger.info("starting投标文件要求...")
+    logger.info("starting投标文件要求...")
     fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
     qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求")
-    g.logger.info("投标文件要求done...")
+    logger.info("投标文件要求done...")
     return qualify_nested_res

 # 开评定标流程
 def fetch_bid_opening(clause_path):
-    g.logger.info("starting开评定标流程...")
+    logger.info("starting开评定标流程...")
     fetch_bid_opening_json = extract_from_notice(clause_path, 2)
     qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程")
-    g.logger.info("开评定标流程done...")
+    logger.info("开评定标流程done...")
     return qualify_nested_res

 def main_processing(output_folder, downloaded_file_path, file_type, unique_id):  # file_type=1->docx file_type=2->pdf
-    # global global_logger
-    # global_logger = get_global_logger(unique_id)
+    global logger
+    logger = get_global_logger(unique_id)
     # Preprocess files and get necessary data paths and knowledge index
     processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
     if not processed_data:
@@ -180,7 +179,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
             result = futures[key].result()
             comprehensive_responses.append(result)
         except Exception as exc:
-            g.logger.error(f"Error processing {key}: {exc}")
+            logger.error(f"Error processing {key}: {exc}")
     # 合并 JSON 结果
     combined_final_result = combine_json_results(comprehensive_responses)
     includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
@@ -190,7 +189,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
     final_result_path = os.path.join(output_folder, "final_result.json")
     with open(final_result_path, 'w', encoding='utf-8') as file:
         json.dump(modified_json, file, ensure_ascii=False, indent=2)
-    g.logger.info("final_result.json has been saved")
+    logger.info("final_result.json has been saved")
     deleteKnowledge(processed_data['knowledge_index'])
     return final_result_path

View File

@@ -27,5 +27,5 @@ def extract_text_by_page(file_path):
             print(f"Page {page_num + 1} is empty or text could not be extracted.")
     return result

 if __name__ == '__main__':
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\ztb_20240903100337\\ztb_20240903100337_14-20.pdf"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1)(1).pdf"
     extract_text_by_page(file_path)

View File

@@ -2,9 +2,6 @@ import re
 import PyPDF2
 import json
-from flask import g

 def extract_key_value_pairs(text):
     # 更新正则表达式来包括对"团"的处理和行尾斜线
     pattern = r'\d+\.\d+\s*([\w\s\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
@@ -75,7 +72,7 @@ def read_pdf_and_judge_main(file_path, output_json_path):
     with open(output_json_path, "w", encoding="utf-8") as json_file:
         json.dump(all_data, json_file, ensure_ascii=False, indent=4)
-    g.logger.info(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")
+    print(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")

 if __name__ == "__main__":
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -3,9 +3,6 @@ import json
 import os.path
 import time
 import re
-from flask import g

 from flask_app.main.json_utils import combine_json_results, nest_json_under_key
 from flask_app.main.通义千问long import upload_file, qianwen_long
 from concurrent.futures import ThreadPoolExecutor
@@ -332,7 +329,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate3):
             results.append(future.result())

     # 禁止投标
-    g.logger.info("starting不得存在的情形...")
+    print("starting不得存在的情形...")
     forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
     results.append(forbidden_res)
@@ -340,7 +337,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate3):
     for d in results:
         combined_dict.update(d)
-    g.logger.info("无效标与废标done...")
+    print("无效标与废标done...")
     return nest_json_under_key(combined_dict, "无效标与废标项")

View File

@@ -1,7 +1,5 @@
 import os
 import uuid
-from flask import g

 from llama_index.readers.dashscope.base import DashScopeParse
 from llama_index.readers.dashscope.utils import ResultType
 from llama_index.indices.managed.dashscope import DashScopeCloudIndex
@@ -16,7 +14,7 @@ def addfileToKnowledge(filepath, knowledge_name):
         knowledge_name,
         verbose=True,
     )
-    g.logger.info("knowledge created successfully!!!")
+    print("knowledge created successfully!!!")
     # index = DashScopeCloudIndex(knowledge_name)
     # index._insert(documents)
     # return index, documents

View File

@@ -4,7 +4,6 @@ import os
 import re
 from PyPDF2 import PdfWriter, PdfReader
-from flask import g

 from flask_app.main.通义千问long import upload_file, qianwen_long
@@ -55,9 +54,9 @@ def extract_and_format_from_paths(json_paths, includes, excludes):
             # 将当前文件的结果添加到总结果列表
             all_formatted_results.extend(formatted_results)
         except FileNotFoundError:
-            g.logger.error(f"禁止投标情形: Error: The file '{path}' does not exist.")
+            print(f"禁止投标情形: Error: The file '{path}' does not exist.")
         except json.JSONDecodeError:
-            g.logger.error(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
+            print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
     return all_formatted_results
@@ -126,9 +125,9 @@ def merge_pdfs(paths, output_filename):
     if output_path:
         with open(output_path, 'wb') as out:
             pdf_writer.write(out)
-        g.logger.info(f"禁止投标情形: Merged PDF saved to {output_path}")
+        print(f"禁止投标情形: Merged PDF saved to {output_path}")
     else:
-        g.logger.error("禁止投标情形: No files to merge.")
+        print("禁止投标情形: No files to merge.")
     return output_path

 def process_string_list(string_list):
@@ -153,7 +152,7 @@ def process_string_list(string_list):
         actual_list = ast.literal_eval(formatted_list)
         return actual_list
     except SyntaxError as e:
-        g.logger.error(f"禁止投标情形: Error parsing list: {e}")
+        print(f"禁止投标情形: Error parsing list: {e}")
         return []
     else:
         # 如果没有匹配到内容,返回空列表
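
`process_string_list` (partially visible above) parses a model-returned list with `ast.literal_eval`, which only accepts Python literals and is therefore safer than `eval` on LLM output. A hedged sketch of that parsing step; the bracket-extraction regex is an assumption, since only the `except SyntaxError` branch appears in this diff:

import ast
import re

def parse_model_list(raw):
    # Pull out the first [...] span, then parse it as a Python literal.
    match = re.search(r'\[.*\]', raw, re.DOTALL)  # assumed extraction step
    if not match:
        return []  # 如果没有匹配到内容,返回空列表
    try:
        return ast.literal_eval(match.group(0))
    except SyntaxError as e:
        print(f"禁止投标情形: Error parsing list: {e}")
        return []

print(parse_model_list('模型回复:["不得存在围标串标", "不得转包"]'))
# -> ['不得存在围标串标', '不得转包']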

View File

@@ -14,5 +14,5 @@ def read_docx(file_path):

 if __name__ == "__main__":
-    file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx"
     read_docx(file_path)

View File

@@ -2,9 +2,6 @@
 # 资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的加入matching_keys列表,否则保留原字典
 import json
 import re
-from flask import g

 from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key
 from flask_app.main.多线程提问 import multi_threading, read_questions_from_file
 from flask_app.main.通义千问long import upload_file
@@ -19,7 +16,7 @@ def merge_dictionaries_under_common_key(dicts, common_key):
             # 使用字典解包来合并字典
             merged_dict[common_key].update(d[common_key])
         else:
-            g.logger.error(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
+            print(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
     return merged_dict

 def generate_qual_question(matching_keys_list):  # 这里假设资质、信誉与人员要求 要不都有、要不都没
@@ -73,15 +70,14 @@ def get_consortium_dict(knowledge_name):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 qualify_list.append(response[1])
             else:
-                g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
+            print(f"资格评审: Error processing response for query index {_}: {e}")
     consortium_dict = combine_json_results(qualify_list)
     return consortium_dict

 def get_all_dict(knowledge_name):
-    # qualification_review_file_path = '../static/提示词/资格评审.txt'
-    qualification_review_file_path = 'flask_app/static/提示词/资格评审问题.txt'
+    qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt'
     questions = read_questions_from_file(qualification_review_file_path)
     qualification_list = []
     res1 = multi_threading(questions, knowledge_name)
@@ -90,9 +86,9 @@ def get_all_dict(knowledge_name):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 qualification_list.append(response[1])
             else:
-                g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
+            print(f"资格评审: Error processing response for query index {_}: {e}")
     qualification_combined_res = combine_json_results(qualification_list)
     return {'资格评审': qualification_combined_res}

 def process_qualification(qualification_review, truncate3, knowledge_name):
@@ -101,14 +97,14 @@ def process_qualification(qualification_review, truncate3, knowledge_name):
     if not matching_keys_list:  # 此时要求全部写在评分办法前附表中,不需要额外提取。
         if not non_matching_dict:  # 古法提取
             if truncate3 != "":
-                g.logger.info("资格评审: type1")
+                print("资格评审: type1")
                 matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"]
                 ques = generate_qual_question(matching_keys_list)
                 file_id2 = upload_file(truncate3)
                 results2 = multi_threading(ques, "", file_id2, 2)  # 资格评审表调用qianwen-long
                 res_list = []
                 if not results2:
-                    g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
+                    print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
                 else:
                     # 打印结果
                     for question, response in results2:
@@ -119,11 +115,11 @@ def process_qualification(qualification_review, truncate3, knowledge_name):
                 updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)  # 合并字典
                 return updated_qualify_json
             else:
-                g.logger.info("资格评审: type2")
+                print("资格评审: type2")
                 return get_all_dict(knowledge_name)
         else:
-            g.logger.info("资格评审: type3")
+            print("资格评审: type3")
             new_non_matching_json = {'资格评审': non_matching_dict}
             substring = '联合体'
             found_key = any(substring in key for key in non_matching_dict.keys())  # 没有联合体投标,则需生成,防止重复
@@ -135,18 +131,18 @@ def process_qualification(qualification_review, truncate3, knowledge_name):
             return new_non_matching_json
     elif matching_keys_list and truncate3 == "":  # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
-        g.logger.info("资格评审: type4")
+        print("资格评审: type4")
        final_qualification = get_all_dict(knowledge_name)
        final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
        return final_qualify_json
     else:  # 大多数情况
-        g.logger.info("资格评审: type5")
+        print("资格评审: type5")
         user_querys = generate_qual_question(matching_keys_list)  # 生成提问->'附件:资格审查'
         file_id2 = upload_file(truncate3)
         results2 = multi_threading(user_querys, "", file_id2, 2)  # 资格评审表调用qianwen-long
         res_list = []
         if not results2:
-            g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
+            print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
         else:
             # 打印结果
             for question, response in results2:

View File

@@ -0,0 +1,9 @@
+import pypandoc
+
+def docx_to_pdf(docx_path, output_pdf_path):
+    output = pypandoc.convert_file(docx_path, 'pdf', outputfile=output_pdf_path)
+    assert output == ""
+
+docx_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx'
+output_pdf_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\output.pdf'
+docx_to_pdf(docx_path, output_pdf_path)
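
A caveat on the new pypandoc test above: pandoc cannot write PDF directly, so `convert_file(..., 'pdf', outputfile=...)` shells out to a PDF engine (pdflatex by default), and the machine needs pandoc plus such an engine installed. A hedged variant that makes the engine explicit; paths are placeholders, and xelatex is suggested only because it copes with CJK text given suitable fonts:

import pypandoc

# Select the PDF engine explicitly instead of relying on the pdflatex default.
output = pypandoc.convert_file(
    'input.docx', 'pdf',
    outputfile='output.pdf',
    extra_args=['--pdf-engine=xelatex'],  # assumes xelatex is installed
)
assert output == ""  # convert_file returns "" when writing to an output file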

View File

@@ -1,60 +0,0 @@
-import json
-from flask_app.货物标.货物标截取pdf import truncate_pdf_main
-from flask_app.main.format_change import docx2pdf, pdf2docx
-from flask_app.main.多线程提问 import multi_threading
-from flask_app.main.通义千问long import upload_file, qianwen_long
-from flask_app.main.json_utils import clean_json_string, combine_json_results
-
-def generate_key_paths(data, parent_key=''):
-    key_paths = []
-    for key, value in data.items():
-        current_key = f"{parent_key}.{key}" if parent_key else key
-        if isinstance(value, dict):
-            if value:  # 字典非空时,递归处理
-                key_paths.extend(generate_key_paths(value, current_key))
-            else:  # 字典为空时,直接添加键路径
-                key_paths.append(current_key)
-        else:
-            # 如果到达了末端,添加当前键路径
-            key_paths.append(current_key)
-    return key_paths
-
-# 获取采购清单
-def fetch_purchasing_list(file_path, output_folder, file_type):
-    global pdf_path, docx_path
-    if file_type == 1:
-        docx_path = file_path
-        pdf_path = docx2pdf(file_path)
-    elif file_type == 2:
-        pdf_path = file_path
-        docx_path = pdf2docx(file_path)
-    technical_requirements = []
-    truncate_path = truncate_pdf_main(pdf_path, output_folder, 1)
-    user_query1 = "这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
-    file_id = upload_file(truncate_path[0])
-    res = qianwen_long(file_id, user_query1)
-    cleaned_res = clean_json_string(res)
-    keys_list = generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
-    user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求,请以json格式返回结果,外层键名为\"{}\",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
-    queries = []
-    for key in keys_list:
-        # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key
-        new_query = user_query_template.format(key, key)
-        print(new_query)
-        queries.append(new_query)
-    results = multi_threading(queries, "", file_id, 2)
-    if not results:
-        print("error!")
-    else:
-        # 打印结果
-        for question, response in results:
-            technical_requirements.append(response)
-    technical_requirements_combined_res = combine_json_results(technical_requirements)
-    json_string = json.dumps(technical_requirements_combined_res, ensure_ascii=False, indent=4)
-    print(json_string)
-
-if __name__ == "__main__":
-    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
-    fetch_purchasing_list(file_path, output_folder, 1)

View File

@@ -12,7 +12,9 @@ def find_and_copy_files(input_folder, output_folder):
     for root, dirs, files in os.walk(input_folder):
         for file in files:
             # 检查文件名是否包含"招标"或"竞争性"并且文件格式正确
-            if ('竞争性' in file or '招标' in file or '磋商' in file) and file.endswith(supported_formats):
+            if ('响应' not in file and '投标' not in file) and \
+                    ('竞争性' in file or '招标文件' in file or '磋商' in file) and \
+                    file.endswith(supported_formats):
                 # 构造完整的文件路径
                 file_path = os.path.join(root, file)
                 # 构造输出路径
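
The tightened filter above first rejects anything that looks like a bid response ('响应'/'投标'), then requires a procurement keyword; note that '招标' alone no longer matches, only '招标文件'. A quick check against sample names (the filenames and `supported_formats` tuple are assumptions for illustration):

supported_formats = ('.pdf', '.docx', '.doc')  # assumed, mirroring the module

def matches(file):
    return ('响应' not in file and '投标' not in file) and \
           ('竞争性' in file or '招标文件' in file or '磋商' in file) and \
           file.endswith(supported_formats)

for name in ["招标文件.pdf", "投标文件.pdf", "磋商公告.docx", "招标公告.pdf", "竞争性谈判响应文件.docx"]:
    print(name, matches(name))
# 招标文件.pdf True; 投标文件.pdf False (contains 投标);
# 磋商公告.docx True; 招标公告.pdf False (招标 alone no longer qualifies);
# 竞争性谈判响应文件.docx False (contains 响应)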

View File

@@ -1,115 +1,33 @@
-import json
-import re
-
-def extract_and_format_from_paths(json_paths, includes, excludes):
-    """
-    从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。
-
-    参数:
-    json_paths (list): 包含多个 JSON 文件路径的列表。
-    includes (list): 包含要检查的关键词的列表。
-    excludes (list): 包含要排除的关键词的列表。
-
-    返回:
-    list: 包含所有文件中满足条件的格式化字符串列表。
-    """
-    all_formatted_results = []
-    # 遍历每个文件路径
-    for path in json_paths:
-        try:
-            with open(path, 'r', encoding='utf-8') as file:
-                # 加载 JSON 数据
-                json_data = json.load(file)
-            formatted_results = []
-            # 遍历 JSON 数据的每个键值对
-            for key, value in json_data.items():
-                if isinstance(value, dict):
-                    # 如果值是字典,检查嵌套字典的每个键值对
-                    for sub_key, sub_value in value.items():
-                        if any(include in sub_key for include in includes):
-                            # 如果子键包含关键词,格式化并添加到结果列表
-                            formatted_results.append(f"{sub_value}")
-                elif isinstance(value, str):  # clause
-                    # 检查是否包含任何 include 关键词
-                    for include in includes:
-                        if include in value:
-                            # 找到 include 之前的内容
-                            prefix = value.split(include)[0]
-                            # 检查 prefix 是否不包含任何 exclude 关键词
-                            if not any(exclude in prefix for exclude in excludes):
-                                # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表
-                                if '\n' in value:
-                                    value = value.split('\n', 1)[-1]
-                                formatted_results.append(value)
-                            break  # 找到一个符合条件的就跳出循环
-            # 将当前文件的结果添加到总结果列表
-            all_formatted_results.extend(formatted_results)
-        except FileNotFoundError:
-            print(f"Error: The file '{path}' does not exist.")
-        except json.JSONDecodeError:
-            print(f"Error: The file '{path}' contains invalid JSON.")
-    return all_formatted_results
-
-def extract_unique_items_from_texts(texts):
-    # 更新正则表达式以包括更广泛的序号类型,包括中文序号
-    pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|①|②|③|④|⑤|⑥|⑦|⑧|⑨|⑩|⑪|⑫)\s*')
-    intro_pattern = re.compile(r'^.*?[:]')
-    punctuation_pattern = re.compile(r'[;。,、..,:;!?]+$')
-    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
-    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')  # 检查是否含有至少2个连续的字母、数字或汉字
-    all_results = []
-    seen = set()
-    for text in texts:
-        # 去除文本中的制表符和换行符
-        text = text.replace('\t', '').replace('\n', '')
-        # 删除引导性的文本(直到冒号,但保留冒号后的内容)
-        text = intro_pattern.sub('', text)
-        # 替换URL为占位符,并保存URL以便后续还原
-        urls = []
-        def url_replacer(match):
-            urls.append(match.group(0))
-            return f"{{URL{len(urls)}}}"
-        text = url_pattern.sub(url_replacer, text)
-        # 使用数字和括号的模式分割文本
-        items = pattern.split(text)
-        for item in items:
-            cleaned_item = item.strip()
-            if cleaned_item:
-                # 进一步清理每个条目
-                cleaned_item = pattern.sub('', cleaned_item)
-                cleaned_item = punctuation_pattern.sub('', cleaned_item)
-                cleaned_item = cleaned_item.strip()
-                # 还原URL
-                for i, url in enumerate(urls, 1):
-                    cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)
-                # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符
-                if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
-                    seen.add(cleaned_item)
-                    all_results.append(cleaned_item)
-    return all_results
-
-# 使用上面定义的函数
-truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
-clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
-json_paths = [truncate_json_path, clause_path]  # 根据实际存放的路径来填写
-includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
-excludes = ["招标", "评标", "定标"]
-# 调用函数
-results = extract_and_format_from_paths(json_paths, includes, excludes)
-print(results)
-res = extract_unique_items_from_texts(results)
-print(res)
+def postprocess(data):
+    """转换字典中的值为列表,如果所有键对应的值都是'/', '{}'或'未知'"""
+    for key, value in data.items():
+        if all(v in ['/', '未知', {}] for v in value.values()):
+            data[key] = list(value.keys())
+    return data
+
+# 示例数据
+data = {
+    "第一包.耗材": {
+        "服务器": "未知",
+        "台式计算机": "未知",
+        "便携式计算机": "/",
+        "信息安全设备": {},
+        "喷墨打印机": "/",
+        "激光打印机": "/",
+        "针式打印机": "/",
+        "液晶显示器": "/",
+        "扫描仪": "/",
+        "基础软件": "/",
+        "信息安全软件": "/",
+        "复印机": "/",
+        "投影仪": "/",
+        "多功能一体机": "/",
+        "触控一体机": "/",
+        "碎纸机": "/",
+        "复印纸": "/"
+    }
+}
+
+# 转换字典
+converted_data = postprocess(data)
+print(converted_data)
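
One subtlety in `postprocess` above: a package collapses to a plain key list only when every value is '/', '未知' or an empty dict; for the sample data that holds, so '第一包.耗材' becomes its key list. A contrast sketch (the spec value is hypothetical):

# Once any item carries a real requirement, the package stays a dict
# instead of collapsing to a key list.
mixed = {
    "第一包.耗材": {
        "服务器": {"CPU": "≥2.1GHz 16核"},  # hypothetical spec
        "复印纸": "/",
    }
}

def postprocess(data):  # same logic as the file above, repeated for self-containment
    for key, value in data.items():
        if all(v in ['/', '未知', {}] for v in value.values()):
            data[key] = list(value.keys())
    return data

print(postprocess(mixed))  # unchanged: the nested dict fails the all(...) check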

View File

@@ -0,0 +1,128 @@
+# -*- encoding:utf-8 -*-
+import json
+import os
+
+from flask_app.main.多线程提问 import multi_threading
+from flask_app.main.通义千问long import upload_file, qianwen_long
+from flask_app.main.json_utils import clean_json_string, combine_json_results
+
+def generate_key_paths(data, parent_key=''):
+    key_paths = []
+    for key, value in data.items():
+        current_key = f"{parent_key}.{key}" if parent_key else key
+        if isinstance(value, dict):
+            if value:
+                # 检查字典中的值是否为字典、列表或字符串'未知'
+                contains_dict_list_or_unknown = any(isinstance(v, (dict, list)) or v == "未知" for v in value.values())
+                if contains_dict_list_or_unknown:
+                    # 递归生成键路径
+                    sub_paths = generate_key_paths(value, current_key)
+                    if sub_paths:
+                        # 如果子路径非空,则扩展
+                        key_paths.extend(sub_paths)
+                    else:
+                        # 当前字典内部为空或值全为"未知"
+                        key_paths.append(current_key)
+                else:
+                    # 字典中所有值都不是字典、列表或"未知",添加当前键
+                    key_paths.append(current_key)
+            else:
+                # 空字典,直接添加键路径
+                key_paths.append(current_key)
+        elif isinstance(value, list):
+            # 列表类型,添加包含列表的键的路径
+            if value:  # 只有当列表非空时才添加
+                key_paths.append(current_key)
+        elif value == "未知":
+            # 值为"未知",添加键路径
+            key_paths.append(current_key)
+    return key_paths
+
+def get_technical_requirements(truncate_file):
+    user_query1 = "这是一份货物标中采购要求部分的内容,请告诉我需要采购的系统(货物),如果有采购清单,请直接根据清单上的货物名称给出结果;若没有采购清单,你要从文中摘取需要采购的系统(货物),采购需求中可能包含层次关系,如大系统中包含若干子系统,你需要保留这种层次关系,给出系统(货物)名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
+    file_id = upload_file(truncate_file)
+    res = qianwen_long(file_id, user_query1)
+    print(res)
+    cleaned_res = clean_json_string(res)
+    keys_list = generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
+    user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的技术参数或采购要求和数量,请以json格式返回结果,外层键名为\"{}\",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
+    queries = []
+    for key in keys_list:
+        # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key
+        new_query = user_query_template.format(key, key)
+        print(new_query)
+        queries.append(new_query)
+    results = multi_threading(queries, "", file_id, 2)
+    technical_requirements = []
+    if not results:
+        print("error!未获得大模型的回答!")
+    else:
+        # 打印结果
+        for question, response in results:
+            technical_requirements.append(response)
+    technical_requirements_combined_res = combine_json_results(technical_requirements)
+    """根据所有键是否已添加处理技术要求"""
+    # 更新原始采购需求字典
+    combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res)
+    final_res = postprocess(cleaned_res['采购需求'])
+    print("更新后的采购需求处理完成.")
+    # 输出最终的 JSON 字符串
+    json_string = json.dumps(final_res, ensure_ascii=False, indent=4)
+    return json_string
+
+def combine_and_update_results(original_data, updates):
+    def recursive_update(data, key, value):
+        # 处理点分隔的键,递归定位并更新嵌套字典
+        keys = key.split('.')
+        for k in keys[:-1]:
+            data = data.setdefault(k, {})
+        if isinstance(value, dict) and isinstance(data.get(keys[-1], None), dict):
+            data[keys[-1]] = {**data.get(keys[-1], {}), **value}
+        else:
+            data[keys[-1]] = value
+    for key, value in updates.items():
+        recursive_update(original_data, key, value)
+    return original_data
+
+def postprocess(data):
+    """转换字典中的值为列表,如果所有键对应的值都是'/', '{}'或'未知'"""
+    for key, value in data.items():
+        if all(v in ['/', '未知', {}] for v in value.values()):
+            data[key] = list(value.keys())
+    return data
+
+def test_all_files_in_folder(input_folder, output_folder):
+    # 确保输出文件夹存在
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+    # 遍历指定文件夹中的所有文件
+    for filename in os.listdir(input_folder):
+        file_path = os.path.join(input_folder, filename)
+        # 检查是否是文件
+        if os.path.isfile(file_path):
+            print(f"处理文件: {file_path}")
+            # 调用函数处理文件
+            try:
+                json_result = get_technical_requirements(file_path)
+                # 定义输出文件的路径
+                output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.json')
+                # 保存JSON结果到文件
+                with open(output_file_path, 'w', encoding='utf-8') as json_file:
+                    json_file.write(json_result)
+                print(f"结果已保存到: {output_file_path}")
+            except Exception as e:
+                print(f"处理文件 {file_path} 时出错: {e}")
+
+if __name__ == "__main__":
+    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招标文件(107国道)_procurement.pdf"
+    res = get_technical_requirements(truncate_file)
+    print(res)
+    # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
+    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
+    # test_all_files_in_folder(input_folder, output_folder)
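
`generate_key_paths` in the new file above decides, per subtree, whether to recurse or to stop and emit a dotted path: it recurses only while a dict still contains nested dicts, lists, or the string "未知". A small demo of the paths it yields, using a condensed but behaviour-equivalent copy and invented sample data:

# Same traversal rules as generate_key_paths above, condensed for the demo.
def generate_key_paths(data, parent_key=''):
    key_paths = []
    for key, value in data.items():
        current_key = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, dict):
            if value and any(isinstance(v, (dict, list)) or v == "未知" for v in value.values()):
                key_paths.extend(generate_key_paths(value, current_key) or [current_key])
            else:
                key_paths.append(current_key)
        elif isinstance(value, list):
            if value:
                key_paths.append(current_key)
        elif value == "未知":
            key_paths.append(current_key)
    return key_paths

sample = {"监控系统": {"摄像机": "未知", "网络硬盘录像机": {}}, "显示屏": "未知"}
print(generate_key_paths(sample))
# -> ['监控系统.摄像机', '监控系统.网络硬盘录像机', '显示屏']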

View File

@@ -0,0 +1,24 @@
+import json
+
+from flask_app.货物标.货物标截取pdf import truncate_pdf_main
+from flask_app.main.format_change import docx2pdf, pdf2docx
+
+# 获取采购清单
+def fetch_purchasing_list(file_path, output_folder, file_type):
+    if file_type == 1:
+        docx_path = file_path
+        pdf_path = docx2pdf(file_path)
+    elif file_type == 2:
+        pdf_path = file_path
+        docx_path = pdf2docx(file_path)
+    else:
+        print("未传入指定格式的文件!")
+        return None
+    truncate_path = truncate_pdf_main(pdf_path, output_folder, 1)
+
+if __name__ == "__main__":
+    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
+    fetch_purchasing_list(file_path, output_folder, 1)

View File

@ -1,10 +1,13 @@
#这是一份货物标中采购要求部分的内容你需要摘取出采购清单一个大系统大项中可能包含多个小系统小项你需要保留这种层次关系给出名称和数量和单位请以json格式返回外层键名为"采购需求",如有未知内容,在对应键值处填"未知"。
#这是一份货物标中采购要求部分的内容你需要摘取出需要采购的系统货物一个大系统大项中可能包含多个小系统小项小系统中也可能包含多个货物你需要保留这种层次关系给出货物名称请以json格式返回外层键名为"采购需求",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填"未知"。
"这是一份货物标中采购要求部分的内容你需要摘取出需要采购的系统货物一个大系统大项中可能包含多个小系统小项你需要保留这种层次关系给出系统货物名称请以json格式返回外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
#这是一份货物标中采购要求部分的内容请你给出所需的设备名称以及设备的具体型号参数要求请以json格式返回结果外层键名为采购要求。
这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求和数量请以json格式返回结果外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。
#这是一份货物标中采购要求部分的内容,请你给出"网络硬盘录像机"的具体型号参数要求请以json格式返回结果外层键名为"网络硬盘录像机",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。
{
    "采购需求": {
@ -76,5 +79,71 @@
        }
    }
}
{
    "采购需求": {
        "第一包": {
            "办公电子设备": [
                "服务器",
                "台式计算机",
                "便携式计算机",
                "信息安全设备",
                "喷墨打印机",
                "激光打印机",
                "针式打印机",
                "液晶显示器",
                "扫描仪",
                "基础软件",
                "信息安全软件",
                "复印机",
                "投影仪",
                "多功能一体机",
                "触控一体机",
                "碎纸机"
            ],
            "软件": [
                "基础软件",
                "信息安全软件"
            ],
            "耗材": [
                "复印纸"
            ]
        },
        "第二包": {
            "办公家电": [
                "空调机"
            ]
        },
        "第三包": {
            "家具用具": [
                "床类",
                "台、桌类",
                "椅凳类",
                "沙发类",
                "柜类",
                "架类",
                "屏风类",
                "厨卫用具",
                "组合家具",
                "家用家具零配件",
                "其他家具用具"
            ]
        },
        "第四包": {
            "印刷服务": "未知"
        },
        "第五包": {
            "汽车维修和保养服务": "未知"
        },
        "第六包": {
            "会计服务": "未知"
        },
        "第七包": {
            "工程造价咨询服务": "未知"
        },
        "第八包": {
            "机动车保险服务": "未知"
        }
    }
}

View File

@ -1,91 +1,179 @@
from PyPDF2 import PdfReader, PdfWriter
import re  # regular expressions
import os  # file and folder operations
from flask_app.main.format_change import docx2pdf


def clean_page_content(text, common_header):
    # First strip the common page header, if one was detected
    if common_header:
        for header_line in common_header.split('\n'):
            if header_line.strip():  # only handle non-empty lines
                # Remove the first occurrence of the complete header line
                text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)

    # Remove page numbers, e.g. "89/129", in three steps
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # leading page number, only when followed by a non-digit
    text = re.sub(r'\s+\d+\s*$', '', text)        # trailing page number
    text = re.sub(r'\s*\/\s*\d+\s*', '', text)    # page numbers of the form /129
    return text
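
# A quick demonstration of the clean-up above (not part of the original commit); the header
# text and page content are made up.
def _demo_clean_page_content():
    common_header = "某某市政府采购项目招标文件"
    raw = "某某市政府采购项目招标文件\n12 第五章 采购需求 12"
    print(clean_page_content(raw, common_header))
    # -> "第五章 采购需求": the repeated header line and the leading/trailing page numbers are gone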

def extract_common_header(pdf_path):
    pdf_document = PdfReader(pdf_path)
    headers = []
    total_pages = len(pdf_document.pages)
    middle_page = total_pages // 2  # middle page of the document

    # Sample the middle page and its neighbours (3 pages, or fewer if the PDF is short)
    start_page = max(0, middle_page - 1)
    num_pages_to_read = 3
    for i in range(start_page, min(start_page + num_pages_to_read, total_pages)):
        page = pdf_document.pages[i]
        text = page.extract_text() or ""
        if text:
            # Keep only the first three lines of each page
            first_lines = text.strip().split('\n')[:3]
            headers.append(first_lines)

    if len(headers) < 2:
        return ""  # not enough pages to compare

    # Find the common part of each line across the sampled pages
    common_headers = []
    for lines in zip(*headers):
        # Words shared by this line on every sampled page
        common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
        if common_line:
            common_headers.append(' '.join(common_line))
    return '\n'.join(common_headers)

def is_pdf_or_doc(filename):
    # Is the file a PDF or a Word document?
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))


def convert_to_pdf(file_path):
    # Convert Word documents to PDF via docx2pdf; pass PDFs through unchanged
    if file_path.lower().endswith(('.doc', '.docx')):
        return docx2pdf(file_path)
    return file_path

def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    generated_files = []
    if os.path.isdir(input_path):
        # Process every PDF/Word file in the folder
        for file in os.listdir(input_path):
            file_path = os.path.join(input_path, file)
            if is_pdf_or_doc(file):
                pdf_path = convert_to_pdf(file_path)
                output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page,
                                                end_pattern, output_suffix)
                if output_pdf_path:
                    generated_files.append(output_pdf_path)
    elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
        # Process a single PDF file
        output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page,
                                        end_pattern, output_suffix)
        if output_pdf_path:
            generated_files.append(output_pdf_path)
    else:
        print("The given path is neither a folder nor a PDF file.")
    return generated_files

def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
    pdf_document = PdfReader(pdf_path)
    begin_page = 5
    start_page = None
    end_page = None
    # Skip pages that merely describe the structure of the bidding documents
    exclusion_pattern = re.compile(r'文件的构成|文件的组成')
    if output_suffix == "procurement":
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
            r'^[一二三四五六七八九十百千]+、\s*采购清单',
            re.MULTILINE
        )
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
            re.MULTILINE
        )
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if re.search(begin_pattern, cleaned_text) and i > begin_page and not re.search(exclusion_pattern, cleaned_text):
            start_page = i
        if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
            end_page = i
            break
    if start_page is None or end_page is None:
        print(f"twice: begin or end page not found in {pdf_path}!")
        return ""
    else:
        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
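
# A quick sanity check of the procurement begin/end patterns above against sample headings
# (not part of the original commit; the headings are made up).
def _demo_procurement_patterns():
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单',
        re.MULTILINE
    )
    end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
    print(bool(begin_pattern.search("第五章 采购需求")))   # True  (chapter heading with 采购)
    print(bool(begin_pattern.search("三、采购清单")))       # True  (numbered-list style heading)
    print(bool(begin_pattern.search("第一章 招标公告")))    # False (no keyword)
    print(bool(end_pattern.search("第六章 合同条款")))      # True  (the next chapter ends the slice)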

def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        start_page = None
        end_page = None
        for i in range(len(pdf_document.pages)):
            page = pdf_document.pages[i]
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
            if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
                start_page = i
            if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
                end_page = i
                break
        if start_page is None or end_page is None:
            if output_suffix == "procurement":
                # Fall back to the looser second-pass patterns
                return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
            else:
                print(f"Begin or end page not found in {pdf_path}!")
                return None
        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)
    print(f"Extracted pages {start_page} to {end_page} and saved them to {output_pdf_path}")
    return output_pdf_path

def truncate_pdf_main(input_path, output_folder, selection):
    if selection == 1:
        # Updated patterns that match both "第x章" and "第x部分", tolerating extra spaces and text
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*',
            # r'^[一二三四五六七八九十百千]+、\s*采购清单',
        )
        begin_page = 5
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        )
        output_suffix = "procurement"
    else:
        print("Invalid selection")
        return None
    # Dispatch to the shared processing function
    return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)

def truncate_pdf_multiple(input_path, output_folder):
@ -96,10 +184,10 @@ def truncate_pdf_multiple(input_path, output_folder):
    return truncate_files

if __name__ == "__main__":
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(107国道).pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    truncate_pdf_multiple(input_path, output_folder)
    # selection = 1  # e.g. 1 - Instructions-to-Bidders data sheet, 2 - evaluation method, 3 - Instructions-to-Bidders body, 4 - announcement through pre-contract sections
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("Generated files:", generated_files)