12.23 无效标废标更新

2024-12-23 10:15:49 +08:00 · 2024-12-23 10:15:49 +08:00 · bc0ad7f89d
commit bc0ad7f89d
parent 008df2fc4a
3 changed files with 143 additions and 37 deletions
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@ -325,7 +325,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达
                if any(exclude in data for exclude in excludes):
                    continue  # 如果包含任何排除字符串，跳过这个数据
                # 去掉开头的序号，eg:1 | (1) |（2） | 1. | 2．（全角点）| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
-                pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、)'
+                pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、|[A-Z][)）\.、．]?\s*)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
@ -355,7 +355,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达
            # print("*********")
            new_text_list = preprocess_text_list(text_list)
            # 用于处理结构化文本，清理掉不必要的序号，并将分割后的段落合并，最终形成更简洁和格式化的输出。
-            pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、)'
+            pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、|[A-Z][)）]\s+|[A-Z]\.\s*)'

            data = re.sub(pattern, '', new_text_list[0]).strip()  # 去除序号
            # 将修改后的第一个元素和剩余的元素连接起来
@ -593,8 +593,22 @@ def combine_find_invalid(invalid_docpath, output_dir):
        ),
        (
            r'不\s*得|禁\s*止\s*投\s*标',
-            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，每条信息规定了各方不得存在的情形，请回答：在这些信息中，投标人或中标人或供应商或联合体投标各方或磋商小组不得存在的情形或禁止投标的情形有哪些？不要返回主语是招标人或采购人或评标委员会的信息，请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果，示例返回为[1,4,6]，若情形不存在，返回[]。以下为需要考虑的注意事项：请返回包含实际内容的信息，若信息内容诸如'投标人不得存在的其他关联情形'这样笼统的表格，而未说明具体的情形，则无需添加这条信息。
-            文本内容：{full_text}
+            """以下是从招标文件中摘取的内容，文本内之间的信息以'...............'分割，每条信息规定了各方不得存在的情形。请根据以下要求进行筛选：
+**筛选要求**：
+1. **仅筛选**出主语为“投标人”、“中标人”、“供应商”、“联合体投标各方”或“磋商小组”或其他投标相关主体的条款。
+2. **排除**主语为“招标人”、“采购人”、“评标委员会”或其他非投标相关主体的条款。
+3. **仅包含**明确描述禁止情形的条款，不包含笼统或未具体说明情形的条款。若信息内容诸如'投标人不得存在的其他关联情形'这样笼统的内容，而未说明具体的情形，则无需添加这条信息。
+
+**示例**：
+- **符合条件**：
+  - `1. 投标人不得...` → 包含，返回序号 1。
+  - `6. 联合体投标各方不得...` → 包含，返回序号 6。
+- **不符合条件**：
+  - `14. 采购人不得...` → 主语为“采购人”，排除。
+  
+请根据上述筛选要求，阅读以下文本内容，并以 `[x,x,x]` 格式返回符合条件的条款序号，如果没有符合条件的条款，返回 `[]`。
+            
+文本内容：{full_text}
            """,
            os.path.join(output_dir, "temp3.txt"),
            "不得存在的情形"
--- a/flask_app/general/通义千问long.py
+++ b/flask_app/general/通义千问long.py
@ -207,26 +207,30 @@ def qianwen_long_stream(file_id, user_query, max_retries=2, backoff_factor=1.0,
            full_response = ""  # 用于存储完整的响应内容

            for chunk in completion:
-                print(chunk)
+                # print(chunk.model_dump_json())
                if hasattr(chunk, 'to_dict'):
                    chunk_data = chunk.to_dict()
                else:
                    chunk_data = json.loads(chunk.model_dump_json())
+
+                # 处理 usage 信息
                usage = chunk_data.get('usage')
                if usage is not None:
                    completion_tokens = usage.get('completion_tokens', 0)
+
+                # 处理 choices 信息
                choices = chunk_data.get('choices', [])
-                if not choices:
-                    continue
-                choice = choices[0]
-                delta = choice.get('delta', {})
-                content = delta.get('content', '')
-                if content:
-                    full_response += content
-                    # 实时打印内容（可以取消注释下面一行以实时输出）
-                    # print(content, end='', flush=True)
-                if choice.get('finish_reason'):
-                    break
+                if choices:
+                    choice = choices[0]
+                    delta = choice.get('delta', {})
+                    content = delta.get('content', '')
+                    if content:
+                        full_response += content
+                        # 实时打印内容（可以取消注释下面一行以实时输出）
+                        # print(content, end='', flush=True)
+                    if choice.get('finish_reason'):
+                        # 不再提前跳出循环，允许处理最后一个包含 usage 的块
+                        pass  # 或者记录 finish_reason 以供后续使用

            if need_extra:
                return full_response, completion_tokens
--- a/flask_app/test_case/test_正则表达式.py
+++ b/flask_app/test_case/test_正则表达式.py
@ -1,25 +1,113 @@
-import regex
+import re

-begin_pattern = regex.compile(
-            r'第[一二三四五六七八九十]+章\s*合同|[：:]清标报告|^第二卷',
-            regex.MULTILINE
-        )
+# Leading list/section markers that clean_data removes.
+# NOTE(review): clean_dict_datas in flask_app/general/无效标和废标公共代码.py still uses the
+# older pattern whose lone-letter alternative has an optional delimiter; port this fix there.
+_LEADING_MARKER_RE = re.compile(
+    r'^\s*(?:'
+    # (1) （2） （六） ((10)) -- digits or Chinese numerals inside one or more brackets
+    r'[（(]+\s*(?:\d+|[一二三四五六七八九十]+)\s*[)）]+'
+    # 1. 2、 3） 1.1 2.3.4 A1 C1.1 A.1.1. -- optional letter prefix, dotted digits, then delimiters/space
+    r'|(?:[A-Za-z][.．]?)?\d+(?:\.\s*\d+)*[\s.、．)）]+'
+    # 一、 and 二 、 -- Chinese numeral, optional space, enumeration comma
+    r'|[一二三四五六七八九十]+\s*、'
+    # A. B、 C) -- a lone capital letter counts as a marker only when a delimiter follows it;
+    # making the delimiter optional would eat the first letter of text such as "ABC公司".
+    r'|[A-Z]\s*[)）.、．]'
+    r')\s*'
+)
+
+
+def clean_data(data):
+    """Return *data* without its leading list/section marker and surrounding whitespace.
+
+    Args:
+        data: One clause of text, possibly prefixed with a marker such as
+            "1.", "(2)", "（六）", "一、", "A." or "A.1.1.".
+
+    Returns:
+        The stripped clause. Text that does not start with a recognised marker
+        is returned unchanged apart from whitespace stripping.
+    """
+    return _LEADING_MARKER_RE.sub('', data).strip()

-
-# 测试示例
-test_strings = [
-    '投标人须知正文',          # 匹配
-    '”投标人须知正文',         # 不匹配
-    '”  投标人须知正文',       # 不匹配
-    '与 投标人须知正文',         # 不匹配
-    '见 投标人须知正文',         # 不匹配
-    '“ 投标人须知正文',         # 不匹配
-    '供应商须知正文',          # 匹配
-    '谈判供应商须知正文'         # 匹配
+# Test cases for clean_data. Each entry holds:
+#   "description" - label printed in the report
+#   "input"       - raw clause, possibly starting with a list/section marker
+#   "expected"    - the clause after the marker has been removed
+test_cases = [
+    {
+        "description": "阿拉伯数字加顿号",
+        "input": "2 、单位负责人为同一人或者存在直接控股 、管理关系的不同投标人， 不得 参加本项目同一合同项下的政府采购活动。",
+        "expected": "单位负责人为同一人或者存在直接控股 、管理关系的不同投标人， 不得 参加本项目同一合同项下的政府采购活动。"
+    },
+    {
+        "description": "阿拉伯数字加点",
+        "input": "3. 为本采购项目提供整体设计 、规范编制或者项目管理 、监理 、检测等服务的， 不得再参加本项目的其他招标采购活动。",
+        "expected": "为本采购项目提供整体设计 、规范编制或者项目管理 、监理 、检测等服务的， 不得再参加本项目的其他招标采购活动。"
+    },
+    {
+        "description": "阿拉伯数字加圆括号",
+        "input": "(4) 详细说明",
+        "expected": "详细说明"
+    },
+    {
+        "description": "全角括号",
+        "input": "（5）项目概述",
+        "expected": "项目概述"
+    },
+    {
+        "description": "字母加点",
+        "input": "A. 项目简介",
+        "expected": "项目简介"
+    },
+    {
+        "description": "字母加顿号",
+        "input": "B、详细内容",
+        "expected": "详细内容"
+    },
+    {
+        "description": "中文数字加顿号",
+        "input": "一、项目背景",
+        "expected": "项目背景"
+    },
+    {
+        "description": "中文数字加空格和顿号",
+        "input": "二 、项目目标",
+        "expected": "项目目标"
+    },
+    {
+        "description": "字母加右括号",
+        "input": "C) 项目计划",
+        "expected": "项目计划"
+    },
+    {
+        "description": "没有序号",
+        "input": "这是一个没有序号的段落。",
+        "expected": "这是一个没有序号的段落。"
+    },
+    {
+        "description": "多个分隔符",
+        "input": "3  、  为本采购项目提供整体设计",
+        "expected": "为本采购项目提供整体设计"
+    },
+    {
+        "description": "复杂的编号格式",
+        "input": "A.1.1. 详细步骤",
+        "expected": "详细步骤"
+    },
+    {
+        "description": "全角括号加中文数字",
+        "input": "（六）第五部分",
+        "expected": "第五部分"
+    },
+    {
+        "description": "无空格的分隔符",
+        "input": "7、没有空格的情况",
+        "expected": "没有空格的情况"
+    },
+    {
+        "description": "分隔符前有空格",
+        "input": "8 、 有空格的情况",
+        "expected": "有空格的情况"
+    },
+    {
+        "description": "点和顿号混合的分隔符",
+        "input": "9. 、 混合分隔符",
+        "expected": "混合分隔符"
+    },
+    {
+        "description": "多重括号",
+        "input": "((10)) 多重括号",
+        "expected": "多重括号"
+    },
+    {
+        "description": "只有数字",
+        "input": "10 多重括号",
+        "expected": "多重括号"
+    },
+    # Negative cases: body text that merely starts with capital letters carries no marker,
+    # so nothing may be removed. They guard against a lone-letter alternative with an
+    # optional delimiter (`[A-Z][)）\.、．]?\s*`) eating the first character.
+    {
+        "description": "以大写字母开头的正文（无序号，不应被截断）",
+        "input": "ABC公司不得转包",
+        "expected": "ABC公司不得转包"
+    },
+    {
+        "description": "以英文缩写开头的正文（无序号，不应被截断）",
+        "input": "PPP项目的投标人不得存在关联关系",
+        "expected": "PPP项目的投标人不得存在关联关系"
+    }
 ]

-for s in test_strings:
-    if begin_pattern.search(s):
-        print(f"匹配: {s}")
-    else:
-        print(f"不匹配: {s}")
+# Run every case through clean_data, print the per-case report, and remember the
+# 1-based indices of the cases whose result differs from the expectation.
+failed_cases = []
+for idx, case in enumerate(test_cases, 1):
+    input_data = case["input"]
+    expected = case["expected"]
+    result = clean_data(input_data)
+    passed = result == expected
+    if not passed:
+        failed_cases.append(idx)
+    status = "通过" if passed else "失败"
+    print(f"测试 {idx}: {case['description']}")
+    print(f"原始数据: {input_data}")
+    print(f"预期结果: {expected}")
+    print(f"实际结果: {result}")
+    print(f"测试结果: {status}\n{'-'*50}\n")
+
+# A print-only harness always exits 0, so a regression would go unnoticed by any
+# caller that checks the exit status. Summarise the run and, only when executed as a
+# script, exit non-zero if anything failed. Importing the module still just prints.
+print(f"共 {len(test_cases)} 个用例，失败 {len(failed_cases)} 个：{failed_cases}")
+if failed_cases and __name__ == "__main__":
+    raise SystemExit(1)