12.9 修复解析bug

2024-12-09 17:38:01 +08:00 · 2024-12-09 17:38:01 +08:00 · c0c7871767
commit c0c7871767
parent e5e63e400b
13 changed files with 258 additions and 178 deletions
--- a/flask_app/general/clean_pdf.py
+++ b/flask_app/general/clean_pdf.py
@ -76,7 +76,7 @@ def clean_page_content(text, common_header):
    text = re.sub(r'^第\d+页\s*', '', text)
    # 删除页码 eg:89/129   这个代码分三步走可以把89/129完全删除
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码，仅当紧跟非数字字符时  投标人须知这块， 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
-    text = re.sub(r'^\s*\/\s*(共\s*)?\d+\s*(页)?\s*', '', text)   #删除/123  /共123 /共123页 /123页
+    text = re.sub(r'^\s*\/?\s*(共\s*)?\d+\s*(页)?\s*', '', text)   #删除/123  /共123 /共123页 /123页
    text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text)  # 删除形如 '—2—' 或 '-2-' 的页码
    return text

--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@ -4,7 +4,7 @@ import re
 from functools import cmp_to_key

 from flask_app.general.json_utils import clean_json_string
-from flask_app.general.通义千问long import upload_file, qianwen_long
+from flask_app.general.通义千问long import upload_file, qianwen_long_stream


 def compare_headings(a, b):
@ -321,7 +321,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        #         }
        # """,
        1:"""
-            该招标文件中对投标文件的要求有哪些？请以json格式给我返回结果，外层键名为文中提到的有关投标文件的若干要求，对于各子要求，你可以用嵌套键值对组织回答，其中嵌套键名为你对相关子要求的总结或是原文中的标题，而最内层的键值应该完全与原文内容保持一致；若存在多个并列内容，那么键值为一个列表，列表中包含若干描述该要求的字符串，要求键值与原文内容一致，不得总结、删减。示例如下，仅供格式参考：
+            该招标文件中对投标文件的要求有哪些？请以json格式给我返回结果，外层键名为文中提到的有关投标文件的若干要求，对于各子要求，你可以用嵌套键值对组织回答，其中嵌套键名为你对相关子要求的总结或是原文中的标题，而最内层的键值应该完全与原文内容保持一致；若存在多个并列内容，那么键值为一个列表，列表中包含若干描述该要求的字符串，要求键值与原文内容一致，不得总结、删减。
+            其他要求：
+            1.若原文中为对应内容有表格表述，请将该表格用markdown语法返回，每行写在键值中的一个字符串中。
+            
+            示例如下，仅供格式参考：
            {
                 "编写要求":"编写要求xxx",
                 "格式要求":{
@ -355,7 +359,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        #     }
        # """,
        2:"""
-        该招标文件中开评定标要求(或磋商流程内容)是什么？请以json格式给我返回结果，外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求，对于各子要求，你可以用嵌套键值对组织回答，其中嵌套键名为你对相关子要求的总结或是原文中的标题，而最内层的键值应该完全与原文内容保持一致；若存在多个并列内容，那么键值为一个列表，列表中包含若干描述该要求的字符串，要求键值与原文内容一致，不得总结、删减。示例如下，仅供格式参考：
+        该招标文件中开评定标要求(或磋商流程内容)是什么？请以json格式给我返回结果，外层键名为文中提到的有关开评定标要求(或磋商流程内容)的若干要求，对于各子要求，你可以用嵌套键值对组织回答，其中嵌套键名为你对相关子要求的总结或是原文中的标题，而最内层的键值应该完全与原文内容保持一致；若存在多个并列内容，那么键值为一个列表，列表中包含若干描述该要求的字符串，要求键值与原文内容一致，不得总结、删减。
+        其他要求：
+            1.若原文中为对应内容有表格表述，请将该表格用markdown语法返回，每行写在键值中的一个字符串中。
+        
+        示例如下，仅供格式参考：
            {
                "开标":"招标文件关于项目开标的要求",
                "开标异议":"招标文件中关于开标异议的项",
@ -387,7 +395,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        return {"error": f"无效的 selection 值: {selection}. 请选择 1、2 或 3。"}
    # 调用大模型并处理响应
    try:
-        res = qianwen_long(file_id, user_query)
+        res = qianwen_long_stream(file_id, user_query)
        cleaned_res = clean_json_string(res)
        return cleaned_res
    except Exception as e:
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@ -51,15 +51,25 @@ def parse_text_by_heading(text):
    # 定义所有需要的正则表达式模式
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')   # 一、
    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*') # （一）
-    pattern_letter_initial = re.compile(r'^([A-Z])\.?\s*(.*)$')  # 初始扫描时匹配 A内容 或 A. 内容
-    pattern_letter = re.compile(r'^([A-Z])\.\s*(.*)$')  # 主循环中严格匹配 A. 内容
-    pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')    # 1、内容
+    pattern_letter_initial = re.compile(r'^([A-Z])[.．、]\s*(.*)$')  # 初始扫描时匹配 A内容 或 A. 内容
+    pattern_letter = re.compile(r'^([A-Z])[.．、]\s*(.*)$')  # 主循环中严格匹配 A. 内容
+    # pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$')    # 1、内容

    initial_heading_pattern = None
    special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括：','雷同认定','包括以下内容']  # 定义特殊章节关键词
    in_special_section = False  # 标志是否在特殊章节中
    lines = text.split('\n')

+    def get_current_number(key_chinese):  # Map a Chinese-numeral heading key ('一' .. '十五') to its integer value; returns -1 when it cannot be mapped.
+        chinese_to_number = {  # lookup table; only the numerals for 1 through 15 are covered
+            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
+            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
+            '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
+        }
+        if not key_chinese:  # None or empty key is treated as unknown
+            return -1
+        return chinese_to_number.get(key_chinese, -1)  # keys missing from the table also yield -1
+
    def check_year_pattern(line):
        # 检查是否为年份模式，如 '2014年'
        line_no_spaces = line.replace(' ', '')
@ -78,21 +88,26 @@ def parse_text_by_heading(text):
                return True
        return False

-    # 预先扫描前5行，检查是否有匹配任何标题模式
+    # 预先扫描前5行，检查是否有匹配任何标题模式 '一、总则' 这种
    first_five_lines = lines[:5]
    has_initial_heading_patterns = False
    for line in first_five_lines:
        line_stripped = line.strip().replace('．', '.')
+        if line_stripped.startswith("##"):
+            line_stripped = line_stripped[2:]  # Remove "##"
        if (pattern_numbered.match(line_stripped) or
            pattern_parentheses.match(line_stripped) or
-            pattern_letter_initial.match(line_stripped) or
-            pattern_arabic.match(line_stripped)):
+            pattern_letter_initial.match(line_stripped)
+            ):
            has_initial_heading_patterns = True
            break

    for i, line in enumerate(lines):
+        in_double_hash_mode = False
        line_stripped = line.strip().replace('．', '.')
-
+        if line_stripped.startswith("##"):
+            in_double_hash_mode = True
+            line_stripped = line_stripped[2:]  # Remove "##"
        # 如果在一个章节内，跳过年份模式的行
        if check_year_pattern(line_stripped) and current_key is not None:
            current_content.append(line_stripped)
@ -115,10 +130,10 @@ def parse_text_by_heading(text):
        dot_match = re.match(r'^[．.](\d+(?:[．.]\d+)*)\s*(.+)$', line_stripped)

        # 匹配以点号开头但不带数字的情况，例如 '.投标文件的制作和签署'
-        dot_text_match = re.match(r'^[．.]\s*(\D.+)$', line_stripped)
+        dot_text_match = re.match(r'^[．.、]\s*(\D.+)$', line_stripped)

        # 匹配不带点号的纯数字开头的情况，例如 '27xxxxx'
-        pure_number_match = re.match(r'^(\d+)([^.\d)(）].*)', line_stripped)  # 不允许出现右括号
+        pure_number_match = re.match(r'^(\d+)([^.\d)(）号条款].*)', line_stripped)  # 不允许出现右括号

        if match:
            new_key, line_content = match.groups()
@ -132,7 +147,7 @@ def parse_text_by_heading(text):
            # 保存中文标题的内容
            if current_key_chinese is not None:
                data[current_key_chinese] = current_value_chinese
-                current_key_chinese = None
+                # current_key_chinese = None

            # 处理临时标题与新主标题的关联
            if temp_title:
@ -156,6 +171,7 @@ def parse_text_by_heading(text):
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)

        elif dot_match:
+            if in_double_hash_mode:
                # 处理以点号开头并带有数字的情况
                sub_number, line_content = dot_match.groups()
                sub_number = sub_number.replace('．', '.')
@ -169,7 +185,11 @@ def parse_text_by_heading(text):
                    append_newline = True
                else:
                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
+            else:
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
+                                                       in_special_section)
        elif dot_text_match:
+            if in_double_hash_mode:
                # 处理以点号开头但不带数字的情况，自动生成下一个一级序号
                temp_content = dot_text_match.group(1).strip()
                if last_main_number:
@ -194,7 +214,9 @@ def parse_text_by_heading(text):
                append_newline = True  # 设置标志，下一次内容添加时需要 '\n'

                continue  # 跳过进一步处理该行
-
+            else:
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
+                                                       in_special_section)
        elif pure_number_match:
            # 处理不带点号的纯数字开头的情况
            new_key_candidate, line_content = pure_number_match.groups()
@ -217,6 +239,8 @@ def parse_text_by_heading(text):
                    temp_title = None  # 重置临时标题

                # 开始新的标题
+                if current_key_chinese is not None:
+                    data[current_key_chinese] = current_value_chinese
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string
@ -231,13 +255,13 @@ def parse_text_by_heading(text):
        else:
            # 根据预先设置的标志决定是否执行这部分代码
            if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
-                numbered_match = pattern_numbered.match(line_stripped)
-                parentheses_match = pattern_parentheses.match(line_stripped)
-                letter_match = pattern_letter.match(line_stripped)
-                arabic_match = pattern_arabic.match(line_stripped)
+                numbered_match = pattern_numbered.match(line_stripped)          #一、
+                parentheses_match = pattern_parentheses.match(line_stripped)     #（一）
+                letter_match = pattern_letter.match(line_stripped)            #A. 内容
+                # arabic_match = pattern_arabic.match(line_stripped)          #1、内容

                # 判断当前行是否匹配了任何标题模式
-                if numbered_match or parentheses_match or letter_match or arabic_match:
+                if numbered_match or parentheses_match or letter_match:
                    # 如果初始标题模式尚未设置，则记录当前匹配的标题模式
                    if initial_heading_pattern is None:
                        if numbered_match:
@ -246,8 +270,7 @@ def parse_text_by_heading(text):
                            initial_heading_pattern = 'parentheses'
                        elif letter_match:
                            initial_heading_pattern = 'letter'
-                        elif arabic_match:
-                            initial_heading_pattern = 'arabic'
+

                    # 确定当前匹配的标题模式
                    if numbered_match:
@ -256,44 +279,52 @@ def parse_text_by_heading(text):
                        current_heading_pattern = 'parentheses'
                    elif letter_match:
                        current_heading_pattern = 'letter'
-                    elif arabic_match:
-                        current_heading_pattern = 'arabic'
+

                    # 如果当前标题模式与初始标题模式一致，创建新的键值对
                    if current_heading_pattern == initial_heading_pattern:
+                        new_key_chinese = None
+                        new_value = None
                        # 保存之前的 key 的内容
-                        if current_key is not None:
-                            content_string = ''.join(current_content).strip()
-                            data[current_key] = data.get(current_key, '') + content_string
-                            current_content=[]
                        if current_key_chinese is not None:
                            data[current_key_chinese] = current_value_chinese
-                            current_key_chinese = None
-
+                        if current_key is not None:
+                            content_string = ''.join(current_content).strip()
+                            # 如果已经有内容,添加换行符
+                            if data.get(current_key):
+                                data[current_key] = data.get(current_key) + '\n' + content_string
+                            else:
+                                data[current_key] = content_string
+                            current_content = []
                        # 处理匹配到的标题
                        if current_heading_pattern == 'numbered':
-                            current_key_chinese = numbered_match.group(1)
-                            current_value_chinese = line_stripped[numbered_match.end():].lstrip('.．、,').replace(' ', '')
+                            new_key_chinese = numbered_match.group(1)
+                            new_value = line_stripped[numbered_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'parentheses':
-                            current_key_chinese = parentheses_match.group(1)
-                            current_value_chinese = line_stripped[parentheses_match.end():].lstrip('.．、,').replace(' ', '')
+                            new_key_chinese = parentheses_match.group(1)
+                            new_value = line_stripped[parentheses_match.end():].lstrip('.．、,').replace(' ', '')
                        elif current_heading_pattern == 'letter':
-                            # 字母标题处理
                            letter_key, letter_value = letter_match.groups()
                            letter_to_chinese = {
                                'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
                                'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
                                'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                            }
-                            current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
-                            current_value_chinese = letter_value.lstrip('.．、,').replace(' ', '')
-                        elif current_heading_pattern == 'arabic':
-                            arabic_key, arabic_value = arabic_match.groups()
-                            current_key = arabic_key.replace('、', '.')
-                            current_content = [arabic_value.replace(' ', '')]
-                            append_newline = True
-                            last_main_number = current_key.rstrip('.')
-                        continue
+                            new_key_chinese = letter_to_chinese.get(letter_key, letter_key)
+                            new_value = letter_value.lstrip('.．、,').replace(' ', '')
+                        # 比较数字大小
+                        current_number = get_current_number(current_key_chinese) if current_key_chinese else -1
+                        new_number = get_current_number(new_key_chinese)
+                        if new_number > current_number or current_key_chinese is None:
+                            # 设置新的键值对
+                            current_key_chinese = new_key_chinese
+                            current_value_chinese = new_value
+                            current_key=None
+                        else:
+                            # 如果新数字小于等于当前数字，将该行视为内容
+                            if line_stripped:
+                                append_newline = handle_content_append(current_content, line_stripped, append_newline,
+                                                                       keywords, in_special_section)
                    else:
                        # 当前标题模式与初始模式不一致，将该行视为内容
                        if line_stripped:
@ -349,6 +380,12 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
                end_index = matches[-1].start()
                cleaned_text = cleaned_text[:end_index]

+        lines = cleaned_text.split('\n')
+        for j, line in enumerate(lines):
+            if j == 0:
+                lines[j] = '##' + line
+        cleaned_text = '\n'.join(lines)
+
        all_pages_text.append(cleaned_text)

    # 合并所有页面的文本
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@ -12,9 +12,10 @@ def extract_text_by_page(file_path):
            page = reader.pages[page_num]
            text = page.extract_text()
            if text:
+                print(text)
                cleaned_text = clean_page_content(text,common_header)
                # cleaned_text=text
-                print(cleaned_text)
+                # print(cleaned_text)
                print("-----------------"+str(page_num))
                result += cleaned_text
                # print(f"Page {page_num + 1} Content:\n{cleaned_text}")
@ -119,7 +120,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
 if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
-    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
+    file_path = r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
--- a/flask_app/general/通义千问long.py
+++ b/flask_app/general/通义千问long.py
@ -13,6 +13,8 @@ from openai import OpenAI
 import os

 file_write_lock = threading.Lock()
+@sleep_and_retry
+@limits(calls=2, period=1)
 def upload_file(file_path,output_folder=""):
    """
    Uploads a file to DashScope and returns the file ID.
--- a/flask_app/test_case/test_函数并发限制.py
+++ b/flask_app/test_case/test_函数并发限制.py
@ -10,7 +10,7 @@ from functools import wraps
 def rate_limiter():
    pass  # 此函数仅用于限流控制，不执行任何操作

-# 创建一个共享的装饰器
+# 创建一个共享的装饰器，在被装饰函数 (func) 执行之前，先调用 rate_limiter()。
 def shared_rate_limit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
--- a/flask_app/test_case/test_正则表达式.py
+++ b/flask_app/test_case/test_正则表达式.py
@ -1,17 +1,12 @@
 import re
-
-# 合并后的正则表达式
-begin_pattern = re.compile(
-                        r'(?<!见)'  # 确保前面不是“见”
-                        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # 匹配“第X章”或“第X部分”
-                        r'[\u4e00-\u9fff、()（）]*?'  # 匹配允许的字符（中文、顿号、括号）
-                        r'(?=.*(?:磋商|谈判|评标|评定|评审))'  # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
-                        r'(?=.*(?:办法|方法))'  # 确保包含“办法”或“方法”
-                        r'[\u4e00-\u9fff、()（）]*\s*$'  # 继续匹配允许的字符直到行尾
-                        r'|\s*评标(办法|方法)前附表\s*$',  # 或匹配“评标办法前附表”或“评标方法前附表”
-                        re.MULTILINE
-                    )
-
+line_stripped="""1.采购人：陕西省某单位
+2、采购代理机构：陕西坤硕项目管理有限公司
+3、供应商：响应招标并且符合招标文件规定资格条件和参加投标竞
+争的法人、其他组织或者自然人
+"""
+pure_number_match = re.match(r'^(\d+)([^.\d)(）、].*)', line_stripped)  # leading digits must NOT be followed by '.', another digit, '(' , ')' , '）' or '、'
+if pure_number_match:  # the sample text starts with "1." so re.match (anchored at the string start) finds no match here
+    print("yes")

 # 测试字符串
 test_strings = [
@ -52,9 +47,9 @@ test_strings = [
    """
 ]

-for test_string in test_strings:
-    match = re.search(begin_pattern, test_string)
-    if match:
-        print("Matched Content:", match.group())  # 输出匹配的内容
-    else:
-        print("No match found.")
+# for test_string in test_strings:
+#     match = re.search(begin_pattern, test_string)
+#     if match:
+#         print("Matched Content:", match.group())  # 输出匹配的内容
+#     else:
+#         print("No match found.")
--- a/flask_app/工程标/商务评分技术评分整合.py
+++ b/flask_app/工程标/商务评分技术评分整合.py
@ -113,7 +113,9 @@ def combine_evaluation_standards(evaluation_method):
    # user_query_2 = (
    #     "根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，若对应内容中存在其他信息，在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它的评分要求。请以json格式返回结果，不要回答有关形式、资格、响应性评审标准的内容")
    user_query_2 = (
-           """根据该文档中的评标办法表格，请你列出该文件的技术评分，商务评分，投标报价评审以及它们对应的具体评分要求，请以json格式返回结果，最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项，外层键名为各评审因素，可能存在嵌套关系，但最内层键值为一个列表，列表中包含若干（可为一）描述该评审因素的评分及要求的字典，内层键名分别是'评分'和'要求'，若无评分，可删去'评分'键值对，'要求'中说明了该评审因素的评分标准；若这三大项评分中存在其他信息，则在相应评分大块内部新增键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这三大项，则返回文档中给定的评分内容（因素）以及它们的具体评分要求。
+           """你是一个对招投标业务非常熟悉的专家。根据该文档中的评标办法表格，请你列出该文件的技术评分，商务评分，投标报价评审以及它们对应的具体评分要求，请以json格式返回结果，最外层键名分别是'技术评分','商务评分','投标报价评审',请在这三大项评分中分别用若干键值对表示具体评分项，外层键名为各评审因素，可能存在嵌套关系，但最内层键值为一个列表，列表中包含若干（可为一）描述该评审因素的评分及要求的字典，内层键名分别是'评分'和'要求'，若无评分，可删去'评分'键值对，'要求'中说明了该评审因素的评分标准；若这三大项评分中存在其他信息，则在相应评分大块内部新增键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这三大项，则返回文档中给定的评分内容（因素）以及它们的具体评分要求。
+特殊情况处理：
+如果评分内容（因素）不是这三大项，则返回表格中给定的评分内容（因素）代替最外层键名'技术评分','商务评分','投标报价评分'，并且返回它们的具体评分要求。

 要求与指南：
 1. 请首先定位评分细则的表格，不要回答有关资格审查的内容，也不要从评标办法正文中提取回答
@ -121,6 +123,7 @@ def combine_evaluation_standards(evaluation_method):
 3. 如果该招标活动有多个包，则最外层键名为对应的包名,否则最外层键名为各大评分项
 4. 你无需将表格的单元格内的内容进行拆分，需要将它视为一个整体
 5. '评分'的键值不能是一个范围数字，如'0-5分'，应该是一个具体数字，如'5分'，或者是一个定性的指标如'合格制'
+6. 若表格中商务和技术评分混合一起，请你手动将它们区别，商务评分通常包含'售后服务'、'质量保证'、'业绩'、'企业人员'、'企业信用'等商务因素。

 以下为示例输出，仅供格式参考：
    {
--- a/flask_app/工程标/截取pdf.py
+++ b/flask_app/工程标/截取pdf.py
@ -210,7 +210,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
                    pass
                else:
                    start_page = i
-            if start_page is not None and re.search(end_pattern, cleaned_text):
+            if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern,cleaned_text):
                condition = i > start_page
                if condition:
                    is_invalid_condition = output_suffix == "invalid" and i > 30  # 这边默认无效投标至少有30页
@ -362,7 +362,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
                (
                    re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
                    # Alternative begin pattern
-                    re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
+                    re.compile(r'评标办法正文|评标方法正文|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
                # Alternative end pattern
                ),
                (
@ -591,17 +591,15 @@ if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
-    input_path=r"C:\Users\Administrator\Desktop\fsdownload\854fb19f-96d3-4b3e-b2ba-1a095344fd92\ztbfile.pdf"
-    output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3"
-    files=truncate_pdf_multiple(input_path,output_folder)
+    input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
+    output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all"
+    # files=truncate_pdf_multiple(input_path,output_folder)
+    # print(files)
    # selections = [4, 1]  # 仅处理 selection 4、1
    # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
-    for i in files:
-        print(type(i))
-        print(i)
-    # selection = 1  # 例如：1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
-    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    # print(generated_files)
+    selection = 2  # 例如：1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
+    generated_files = truncate_pdf_main(input_path, output_folder, selection)
+    print(generated_files)
    # print("生成的文件:", generated_files)
    end_time = time.time()
    print("耗时：" + str(end_time - start_time))
--- a/flask_app/工程标/提取json工程标版.py
+++ b/flask_app/工程标/提取json工程标版.py
@ -8,7 +8,7 @@ def convert_clause_to_json(file_path, output_folder, type=1):
    if type == 1:
        # 对于 type=1，使用原始字符串定义 start_word 和 end_pattern
        start_word = (
-            r'^\s*(?:[（(]?\s*[一二12]?\s*[)）]?\s*[、．.]*\s*)?(说\s*明|总\s*则)'
+            r'^\s*(?:[（(]?\s*[一二12]?\s*[)）]?\s*[、．.]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文)'
        )
        end_pattern = (
@ -94,7 +94,7 @@ def convert_clause_to_json(file_path, output_folder, type=1):

 if __name__ == "__main__":
    # file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
-    file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest4_tobidders_notice.pdf'
+    file_path=r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all\招标02_tobidders_notice.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -8,12 +8,15 @@ from flask_app.general.clean_pdf import clean_page_content, extract_common_heade
 from flask_app.general.format_change import docx2pdf
 from flask_app.general.merge_pdfs import merge_and_cleanup, merge_pdfs, merge_selected_pdfs_for_goods
 import concurrent.futures
+
+
 def get_global_logger(unique_id):
    """Return the logger to use for the given ``unique_id``.

    When ``unique_id`` is None the default logger returned by
    ``logging.getLogger()`` is used; otherwise the logger named
    ``unique_id`` is returned.
    """
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)

+
 # fitz库版本
 # def extract_common_header(pdf_path):
 #     doc = fitz.open(pdf_path)
@ -59,6 +62,7 @@ def convert_to_pdf(file_path):
        return docx2pdf(file_path)
    return file_path

+
 def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    pdf_path = convert_to_pdf(file_path)
    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
@ -91,7 +95,13 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
        if start_page is None and re.search(begin_pattern, cleaned_text):
            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                start_page = i
-        if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
+        if start_page is not None:
+            if output_suffix == "tobidders_notice":
+                if re.search(end_pattern, cleaned_text) and i > start_page:
+                    end_page = i
+                    break
+            else:
+                if re.search(end_pattern, cleaned_text) and i > start_page and not re.search(begin_pattern,cleaned_text):
                    end_page = i
                    break
    return start_page, end_page
@ -105,7 +115,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
        total_pages = len(pdf_document.pages) - 1  # 获取总页数

        if output_suffix == "tobidders_notice":
-            exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
+            exclusion_pattern = re.compile(
+                r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
            start_page, mid_page, end_page = extract_pages_tobidders_notice(
                pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
            )
@ -115,16 +126,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
            # if start_page is None or end_page is None or mid_page is None:
            #     print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
            #     return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
-
-            path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder, "tobidders_notice_part1")
-            path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder, "tobidders_notice_part2")
+            path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder,
+                                         "tobidders_notice_part1")
+            if mid_page != end_page:
+                path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
+                                         "tobidders_notice_part2")
+            else:
+                path2=path1
            return path1, path2

        else:
            # 原有的处理逻辑保持不变
            if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
-                exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
-            start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix)
+                exclusion_pattern = re.compile(
+                    r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
+            start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
+                                                         common_header, exclusion_pattern, output_suffix)
            # 针对 selection = 6 的特殊处理
            if output_suffix == "format":
                if start_page is None:
@ -151,6 +168,7 @@ def get_patterns_for_procurement():
    #     r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*',
    #     re.MULTILINE)
    begin_pattern = re.compile(
+        r'(?<!见)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # 匹配“第X章”或“第X部分”
        r'[\u4e00-\u9fff、()（）]*?'  # 匹配允许的字符
        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()（）]*?要求[\u4e00-\u9fff、()（）]*?\s*$|'  # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求”
@ -178,6 +196,8 @@ def get_patterns_for_evaluation_method():
    end_pattern = re.compile(
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
    return begin_pattern, end_pattern
+
+
 def get_patterns_for_notice():
    begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
    end_pattern = re.compile(
@ -186,6 +206,8 @@ def get_patterns_for_notice():
        re.MULTILINE
    )
    return begin_pattern, end_pattern
+
+
 def get_patterns_for_notice_twice():
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE
@ -197,6 +219,7 @@ def get_patterns_for_notice_twice():
    )
    return begin_pattern, end_pattern

+
 # def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
 #                                    exclusion_pattern):
 #     start_page = None
@ -294,7 +317,7 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

                            # 定义基础的 mid_pattern
                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
-                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则)'
+                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'

                            # 合并基础模式和额外模式
                            if additional_mid_pattern:
@ -316,14 +339,14 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

                            # 定义基础的 mid_pattern
                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
-                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则)'
+                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
                            combined_mid_pattern = re.compile(
                                rf'{base_mid_pattern}',
                                re.MULTILINE
                            )
                    else:
                        # 如果提供了固定的 end_pattern，则使用默认的 mid_pattern
-                        base_mid_pattern = r'.*[（(]?\s*[一二12]?[)）]?\s*[、．.]*\s*(说\s*明|总\s*则)\s*$'   #可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
+                        base_mid_pattern = r'.*[（(]?\s*[一二12]?[)）]?\s*[、．.]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$'  # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
                        combined_mid_pattern = re.compile(
                            rf'{base_mid_pattern}',
                            re.MULTILINE
@@ -375,10 +398,12 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h

    return start_page, mid_page, end_page

+
 def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
    output_suffix = "tobidders_notice"
    begin_pattern = re.compile(
-        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',re.MULTILINE
+        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',
+        re.MULTILINE
    )
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE  # 捕获中文部分
@@ -436,7 +461,6 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
        r'^(?:附录.*?[：:]|附件.*?[：:]|附表.*?[：:]|附件\s*\d+).*$',
        re.MULTILINE
    )
-
    # 结束匹配模式 - 章节标题
    end_pattern_chapter = re.compile(
        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
@@ -495,7 +519,8 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):

 def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
    try:
-        exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
+        exclusion_pattern = re.compile(
+            r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
        pdf_document = PdfReader(pdf_path)
        patterns = None
        start_page = None
@@ -568,10 +593,12 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
        print(f"Error in save_extracted_pages: {e}")
        return ""  # 返回空字符串

+
 def get_start_and_common_header(input_path):
    common_header = extract_common_header(input_path)
    last_begin_index = 0
-    begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',re.MULTILINE)
+    begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
+                               re.MULTILINE)
    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > 10:
@@ -631,11 +658,13 @@ def process_input(input_path, output_folder, selection, output_suffix):

        # 根据选择设置对应的模式和结束模式
        if selection == 1:
-            begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$', re.MULTILINE)
+            begin_pattern = re.compile(
+                r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$', re.MULTILINE)
            end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
            local_output_suffix = "notice"
        elif selection == 2:
-            begin_pattern = re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
+            begin_pattern = re.compile(
+                r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
            end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
            local_output_suffix = "evaluation_method"
        elif selection == 3:
@@ -720,6 +749,7 @@ def truncate_pdf_multiple(pdf_path, output_folder,logger):
    logger.info("已截取文件路径" + str(truncate_files))
    return truncate_files

+
 # 小解析，只需要前三章内容
 def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
    """
@@ -740,7 +770,9 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
    # 使用 ThreadPoolExecutor 进行多线程处理
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
        # 提交所有任务并保持 selection 顺序
-        future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default") for selection in selections}
+        future_to_selection = {
+            selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
+            for selection in selections}

        # 按 selection 顺序收集结果
        for selection in selections:
@@ -769,22 +801,23 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
        logger.warning(f"合并失败，没有生成merged_baseinfo for {pdf_path}")
    return truncate_files

+
 # TODO:交通智能系统和招标(1)(1)文件有问题  包头 绍兴     工程标中，判断是符合性审查之后，可以将它们设为同一章

 # ztbfile.pdf少资格评审  包头少符合性评审
 if __name__ == "__main__":
    logger = get_global_logger("123")
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
-    input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2020-安徽-安徽省生态环境厅电梯采购.pdf"
+    input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
-    output_folder=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2"
-    # files = truncate_pdf_multiple(input_path, output_folder,logger)
+    output_folder = r"C:\Users\Administrator\Desktop\招标文件-采购类\all"
+    files = truncate_pdf_multiple(input_path, output_folder,logger)
    # selections = [3,5]
    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
-    # print(files)
-    selection = 2# 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
-    generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    print(generated_files)
+    print(files)
+    # selection = 4  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
+    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
+    # print(generated_files)
--- a/flask_app/货物标/提取json货物标版.py
+++ b/flask_app/货物标/提取json货物标版.py
@@ -134,13 +134,14 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
-        start_word = r'^\s*(?:[（(]?\s*[一二12]?\s*[)）]?\s*[、．.]*\s*)?(说\s*明|总\s*则)'
+        start_word = r'^\s*(?:[（(]?\s*[一二12]?\s*[)）]?\s*[、．.]*\s*)?(说\s*明|总\s*则|名\s*词\s*解\s*释)'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*)$'
    else:
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    if file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
+        # print(text)
    else:
        raise ValueError("Unsupported file format")
    parsed_data = parse_text_by_heading(text)
@@ -174,13 +175,14 @@ def process_folder(input_folder, output_folder):
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")

-#TODO:招标文件111_tobidders_notice_part2.pdf   陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
-#TODO：19、竞争性磋商响应文件的加密 暂时没处理'19'缺失的情况
+#TODO:招标文件111_tobidders_notice_part2.pdf   陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf  唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf
+#TODO:2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf
 #TODO: .不予受理的情形 ，‘.后面必须跟中文或者空格’
 if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
-    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
+    file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
+    # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
+    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder,1)
--- a/flask_app/货物标/评分标准提取main.py
+++ b/flask_app/货物标/评分标准提取main.py
@@ -224,13 +224,14 @@ def combine_evaluation_standards(truncate_file):
            # user_query = "根据该文档中的评标办法前附表或者评分标准表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，外层键名分别为'技术评分','商务评分','投标报价'。如果评分内容不是这3个，则返回文档中给定的评分内容以及它的评分要求，都以json的格式返回结果，如果该采购活动有多个包，则最外层键名为对应的包名。请不要回答有关资格审查的内容"
            user_query = (
                """
-根据该文档中的评分标准表格中的内容，请你列出该文件的技术评分，商务评分，投标报价评审以及它们对应的具体评分要求，请以json格式返回结果，最外层键名分别是'技术评分','商务评分','投标报价评分',请在这三大项评分中分别用若干键值对表示具体评分项，外层键名为各评审因素，键值为一个列表，列表中包含若干（可为一）描述该评审因素的评分及要求的字典，内层键名分别是'评分'和'要求'，若无评分，可删去'评分'键值对，'要求'中说明了该评审因素的评分标准；若这三大项评分中存在其他信息，则在相应评分大块内部新增键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这三大项，则返回文档中给定的评分内容（因素）以及它们的具体评分要求。
+你是一个对招投标业务非常熟悉的专家。根据该文档中的评分标准表格中的内容，请你列出该文件的技术评分，商务评分，投标报价评审以及它们对应的具体评分要求，请以json格式返回结果，最外层键名分别是'技术评分','商务评分','投标报价评分',请在这三大项评分中分别用若干键值对表示具体评分项，外层键名为各评审因素，键值为一个列表，列表中包含若干（可为一）描述该评审因素的评分及要求的字典，内层键名分别是'评分'和'要求'，若无评分，可删去'评分'键值对，'要求'中说明了该评审因素的评分标准；若这三大项评分中存在其他信息，则在相应评分大块内部新增键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这三大项，则返回表格中给定的评分内容（因素）以及它们的具体评分要求。
 要求与指南：
 1.请首先定位评分细则的表格，不要回答有关资格审查、符合性审查的内容，也不要从评标办法正文中（表格外）提取回答 
-2.若大项的'xx评分'要求未在文中说明，则键名'xx评分'的键值设为'本项目无xx评分项'，例如"技术评分":"本项目无技术评分项" 
+2.若大项的'xx评分'要求未在文中说明，则键名'xx评分'的键值设为'本项目无xx评分项'，例如{"技术评分":"本项目无技术评分项"} 
 3. 如果该招标活动有多个包，则最外层键名为对应的包名,否则不需要 
 4.你无需将表格的单元格内的内容进行拆分，需要将它视为一个整体。
 5. '评分'的键值不能是一个范围数字，如'0-5分'，应该是一个具体数字，如'5分'，或者是一个定性的指标如'合格制'
+6. 若表格中商务和技术评分混合一起，请你手动将它们区别，商务评分通常包含'售后服务'、'质量保证'、'业绩'、'企业人员'、'企业信用'等商务因素。

 以下为示例输出，仅供格式参考：
    {
@@ -311,7 +312,7 @@ def combine_evaluation_standards(truncate_file):

 if __name__ == "__main__":
    start_time=time.time()
-    truncate_file=r"C:\Users\Administrator\Desktop\fsdownload\1ca1d27d-fc21-4697-8075-9027103df030\ztbfile_evaluation_method.pdf"
+    truncate_file=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2\2024-新疆-塔城地区公安局食药环分局快检实验室项目_evaluation_method.pdf"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件（统计局智能终端二次招标）_evaluation_method.pdf"
    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新（W改）_evaluation_method.pdf"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\2d481945-1f82-45a5-8e56-7fafea4a7793\\ztbfile_evaluation_method.pdf"