10.18小解析

2024-10-19 15:33:55 +08:00 · 2024-10-19 15:33:55 +08:00 · 91b4f1efc1
commit 91b4f1efc1
parent 389d0f7fad
9 changed files with 128 additions and 102 deletions
--- a/flask_app/main/download.py
+++ b/flask_app/main/download.py
@ -62,9 +62,14 @@ def download_file(url, local_filename):

 if __name__ == '__main__':
    # 测试下载的URL
-    test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1728751377&OSSAccessKeyId=TMP.3KeYhAGeJr2LGiNctSPvSmdSwxhrU8pbaDYRJcNCCgv8ijyWN613QahKb3nhXydfAvHaqpw4nTHXMzq7hmTHmNnPA77DgL&Signature=mwPHW8v7dVmHP1udTDL%2ByzllwCE%3D"
+    test_url ="http://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/444d68db95c3438ca481f029500f4f9b.pdf?Expires=1729325467&OSSAccessKeyId=LTAI5tJmL4Nvg95VjgUNvVL6&Signature=2%2Bax895ik7DoFs%2FHVYeN29%2BEiTo%3D&x-oss-process=doc%2Fedit"
    local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload'
-    file_path,file_type = download_file(test_url, local_file_name)
-    if file_path:
-        print(f"Downloaded file path: {file_path}")
+    downloaded = download_file(test_url, local_file_name)
+    if not downloaded:
+        print("下载文件失败或不支持的文件类型")
+    downloaded_filepath, file_type = downloaded
+    print(downloaded_filepath)
    print(file_type)
+    # 检查文件类型
+    if file_type == 4:
+        print("error")
--- a/flask_app/main/start_up.py
+++ b/flask_app/main/start_up.py
@ -257,7 +257,7 @@ def process_and_stream(file_url, zb_type):
    """
    logger = g.logger
    unique_id = g.unique_id
-    output_folder = f"flask_app/static/output1/{unique_id}"
+    output_folder = f"flask_app/static/output/{unique_id}"
    filename = "ztbfile"
    downloaded_filename = os.path.join(output_folder, filename)

--- a/flask_app/main/test.py
+++ b/flask_app/main/test.py
@ -1,36 +1,43 @@
-def second_query(baseinfo_list):
-    result_list = []
-    for item in baseinfo_list:
-        for key, value in item.items():
-            if isinstance(value, dict):
-                # 检查所有最内层的值是否都是 "未知"
-                if all(v == "未知" for v in value.values()):
-                    result_list.append(key)
-    return result_list
+import re

-# 示例用法
-baseinfo_list = [
-    {
-        "招标人联系方式": {
-            "名称": "未知",
-            "地址": "未知",
-        },
-        "项目名称": {
-            "名称": "项目A",
-            "地址": "未知",
-        }
-    },
-    {
-        "供应商信息": {
-            "名称": "未知",
-            "联系方式": "未知",
-        },
-        "合同详情": {
-            "金额": "100万",
-            "期限": "未知",
-        }
-    }
-]

-result = second_query(baseinfo_list)
-print(result)  # 输出: ['招标人联系方式', '供应商信息']
+def read_questions_from_file(file_path):
+    questions = []
+    current_question = ""
+    current_number = 0
+
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            line = line.strip()
+
+            if not line:  # 跳过空行
+                continue
+
+            if line.startswith('#'):  # 跳过以#开头的行
+                continue
+
+            # 检查是否是新的问题编号，例如 "1."
+            match = re.match(r'^(\d+)\.', line)
+            if match:
+                # 如果有之前的问题，保存它
+                if current_question:
+                    questions.append(current_question.strip())
+
+                # 开始新的问题
+                current_number = int(match.group(1))
+                # 提取问题内容，去掉编号和点
+                current_question = line.split('.', 1)[1].strip() + "\n"
+            else:
+                # 继续添加到当前问题
+                current_question += line + "\n"
+
+    # 添加最后一个问题（如果存在）
+    if current_question:
+        questions.append(current_question.strip())
+
+    return questions
+
+
+qualification_review_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\资格评审.txt'
+ques=read_questions_from_file(qualification_review_file_path)
+print(ques)
--- a/flask_app/main/基础信息整合快速版.py
+++ b/flask_app/main/基础信息整合快速版.py
@ -1,7 +1,8 @@
+import copy
 import json
 import threading
 import time
-
+import concurrent.futures
 from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key
 from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
 from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge, merge_json_to_list
@ -113,20 +114,6 @@ def judge_consortium_bidding(baseinfo_list):
    baseinfo_list[:] = updated_list
    return accept_bidding

-def generate_query(baseinfo_list):
-    questions_list = []
-    for item in baseinfo_list:
-        # print(json.dumps(item, ensure_ascii=False, indent=4))
-        for key, value in item.items():
-            if value == "未知" or (isinstance(value, dict) and all(v == "未知" for v in value.values())):
-                question = (
-                    f"根据该招标文件中的信息，{key}的内容是怎样的？"
-                    f"请按json格式给我提供信息，键名是'{key}'，"
-                    f"若存在未知信息，在对应的键值中填'未知'。"
-                )
-                questions_list.append(question)
-    return questions_list
-
 def update_baseinfo_lists(baseinfo_list1, baseinfo_list2):
    # 创建一个字典，用于存储 baseinfo_list1 中的所有键值对
    combined_dict = {}
@ -167,7 +154,18 @@ def process_judge_questions(judge_file_path, chosen_numbers, truncate0, baseinfo
        for question, response in res2:
            baseinfo_list1.append(clean_json_string(response))

-def process_questions_list(questions_list, truncate2):
+def process_baseinfo_list(baseinfo_list, truncate2):
+    questions_list = []
+    for item in baseinfo_list:
+        # print(json.dumps(item, ensure_ascii=False, indent=4))
+        for key, value in item.items():
+            if value == "未知" or (isinstance(value, dict) and all(v == "未知" for v in value.values())):
+                question = (
+                    f"根据该招标文件中的信息，{key}的内容是怎样的？"
+                    f"请按json格式给我提供信息，键名是'{key}'，"
+                    f"若存在未知信息，在对应的键值中填'未知'。"
+                )
+                questions_list.append(question)
    if questions_list:
        file_id = upload_file(truncate2)
        baseinfo_results = multi_threading(questions_list, "", file_id, 2)
@ -200,35 +198,27 @@ def combine_basic_info(merged_baseinfo_path,truncate0,truncate2, clause_path):
    baseinfo_list1 = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else []

    chosen_numbers, merged = merge_json_to_list(baseinfo_list1.pop())
+    baseinfo_list1_copy = copy.deepcopy(baseinfo_list1)
    baseinfo_list1.append(merged)
-
-    questions_list=generate_query(baseinfo_list1)
-
    # judge_file_path = 'flask_app/static/提示词/是否相关问题qianwen-long.txt'
    judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题qianwen-long.txt'
-    # 创建两个线程
-    thread1 = threading.Thread(target=process_judge_questions,
-                               args=(judge_file_path, chosen_numbers, truncate0, baseinfo_list1))
-    thread2 = threading.Thread(target=process_questions_list, args=(questions_list, truncate2))

-    # 启动线程
-    thread1.start()
-    thread2.start()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+        # 提交两个任务
+        future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, truncate0, baseinfo_list1)
+        future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, truncate2)

-    # 等待两个线程完成
-    thread1.join()
-    thread2.join()
-
-    # 处理结果
-    # baseinfo_list1 已经在 process_judge_questions 中被更新
-    baseinfo_list2 = process_questions_list(questions_list, truncate2)
+        # 等待两个任务完成并获取结果
+        future1.result()  # process_judge_questions 直接修改 baseinfo_list1，不需要返回值
+        baseinfo_list2 = future2.result()

+    # 如果需要，合并或处理 baseinfo_list1 和 baseinfo_list2
    updated_list = update_baseinfo_lists(baseinfo_list1, baseinfo_list2)

-    rebidding_situation = extract_from_notice(clause_path, 3)  # "重新招标, 不再招标和终止招标"需从投标人须知正文提取
+    rebidding_situation = extract_from_notice(clause_path, 3)
    update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
    updated_list.append(update_json)
-    aggregated_baseinfo = aggregate_basic_info_engineering(updated_list)  # 现在是一个字典
+    aggregated_baseinfo = aggregate_basic_info_engineering(updated_list)
    return {"基础信息": aggregated_baseinfo}

 #TODO:先不带投标人须知正文，如果是未知，再直接问正文，
--- a/flask_app/main/多线程提问.py
+++ b/flask_app/main/多线程提问.py
@ -40,10 +40,14 @@ def read_questions_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
+
            if not line:  # 跳过空行
                continue

-            # 检查是否是新的问题编号
+            if line.startswith('#'):  # 跳过以#开头的行
+                continue
+
+            # 检查是否是新的问题编号，例如 "1."
            match = re.match(r'^(\d+)\.', line)
            if match:
                # 如果有之前的问题，保存它
@ -52,12 +56,13 @@ def read_questions_from_file(file_path):

                # 开始新的问题
                current_number = int(match.group(1))
+                # 提取问题内容，去掉编号和点
                current_question = line.split('.', 1)[1].strip() + "\n"
            else:
                # 继续添加到当前问题
                current_question += line + "\n"

-    # 添加最后一个问题
+    # 添加最后一个问题（如果存在）
    if current_question:
        questions.append(current_question.strip())

--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -58,7 +58,7 @@ def extract_common_header(pdf_path):
    return '\n'.join(common_headers)


-def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
+def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header):
    """
    从原始PDF截取指定范围的页面并保存到新的PDF文件中。
    如果 output_suffix 是 'notice'，则额外保存 start_page 之前的页面。
@ -89,11 +89,25 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""

-        # 如果 output_suffix 是 'notice'，保存 start_page 之前的页面
+        # 如果 output_suffix 是 'notice'，保存 start_page 之前的页面(若遇到'目录'，则截取到'目录')
        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            before_doc = PdfWriter()
-            for page_num in range(0, start_page):
+            toc_page = -1
+
+            # 查找目录页
+            for page_num in range(min(start_page, total_pages)):
+                page_text = pdf_document.pages[page_num].extract_text()
+                cleaned_text = clean_page_content(page_text, common_header)
+                if re.search(r'目\s*录', cleaned_text, re.MULTILINE):
+                    toc_page = page_num
+                    break
+
+            # 确定截取的页数
+            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
+
+            # 提取页面
+            for page_num in range(pages_to_extract):
                before_doc.add_page(pdf_document.pages[page_num])
            with open(before_pdf_path, 'wb') as f_before:
                before_doc.write(f_before)
@ -159,7 +173,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
            print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中！")
            return ""
        else:
-            return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
+            return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)
    elif output_suffix == "invalid":
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
@ -167,7 +181,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
        total = int(total_pages * 2 / 3)
        start_page = last_begin_index
        end_page = min(90, total)
-        return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
+        return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)


 def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
@ -208,7 +222,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
            print(f"first: 未找到起始或结束页在文件 {pdf_path} 中！")
            return ""
    else:
-        return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
+        return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header)


 def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
@ -337,8 +351,7 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
    desired_suffixes = [
        f'{base_file_name}_before.pdf',
        f'{base_file_name}_notice.pdf',
-        f'{base_file_name}_tobidders_notice_table.pdf',
-        f'{base_file_name}_tobidders_notice.pdf'
+        f'{base_file_name}_tobidders_notice_table.pdf'
    ]

    all_pdfs_to_merge = []
@ -416,7 +429,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder,selections):
 if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
-    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
+    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest16.pdf"
    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output"
    files=truncate_pdf_multiple(input_path,output_folder)
    # selections = [5, 1]  # 仅处理 selection 5、1 和 3
--- a/flask_app/main/招标文件解析.py
+++ b/flask_app/main/招标文件解析.py
@ -289,7 +289,6 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_

 if __name__ == "__main__":
    output_folder = "flask_app/static/output/zytest1"
-
    # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf")
    # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf")
    # clause_path = convert_clause_to_json(truncate1, output_folder)
@ -299,7 +298,6 @@ if __name__ == "__main__":
    file_type = 1        #1:docx 2:pdf 3:其他
    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
    # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")
-    print("zy")

    preprocess_files(output_folder, input_file, file_type, "zytest1")
    end_time = time.time()
--- a/flask_app/main/资格审查模块.py
+++ b/flask_app/main/资格审查模块.py
@ -3,7 +3,7 @@ import os
 from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
 from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json
 from flask_app.main.形式响应评审 import process_reviews
-from flask_app.main.资格评审 import process_qualification
+from flask_app.main.资格评审old import process_qualification
 from flask_app.main.通义千问long import upload_file, qianwen_long
 from concurrent.futures import ThreadPoolExecutor

--- a/flask_app/货物标/货物标截取pdf.py
+++ b/flask_app/货物标/货物标截取pdf.py
@ -340,12 +340,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,

    return path1, path2

+
 def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
    try:
        exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
        pdf_document = PdfReader(pdf_path)
        patterns = None
        begin_page = 0
+        start_page = None
+        end_page = None
+
        if output_suffix == "procurement":
            patterns = [get_patterns_for_procurement()]
            begin_page = 5
@ -353,18 +357,20 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
            patterns = [get_patterns_for_evaluation_method()]
            begin_page = 5
        elif output_suffix == "qualification1":
-            patterns = [get_patterns_for_qualification()]  # This now returns a tuple of pattern pairs
+            patterns = [get_patterns_for_qualification()]
            begin_page = 5
        elif output_suffix == "notice":
            patterns = [get_patterns_for_notice()]
            begin_page = 0
-        # Try each set of patterns until a valid range is found
+
+        if patterns:
            for pattern_pair in patterns:
                start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
                                                             common_header,
                                                             exclusion_pattern, output_suffix)
                if start_page is not None and end_page is not None:
                    break
+
        if start_page is None or end_page is None:
            if output_suffix == "qualification1":
                print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！")
@ -373,14 +379,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
                if len(temp) > 0:
                    return temp[0]
                else:
-                    return ""  # 返回空字符串
+                    return ""
            else:
                print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！")
-                return ""  # 返回空字符串
-        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
+                return ""
+
+        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix,
+                                    )
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
-        return ""  # 返回空字符串
+        return ""


 # def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):