10.9子空间调用问题/货物标提取json文件更加健壮

2024-10-09 20:51:33 +08:00 · 2024-10-09 20:51:33 +08:00 · 04040541e2
commit 04040541e2
parent dce1a793d2
6 changed files with 233 additions and 212 deletions
--- a/flask_app/main/test.py
+++ b/flask_app/main/test.py
@ -1,175 +1,166 @@
 import json
-import re
+
 data={
  "技术评分": {
    "工程概况": {
      "评分": "1分",
      "要求": "工程主要情况、各专业设计简介、工程施工条件描述准确清晰 1分；基本准确 0.5-0.9分，不准确得 0分。"
    },
    "施工部署": {
      "评分": "3分",
      "要求": "工程施工目标、主要施工内容、施工流水段划分、施工的重点和难点分析、工程管理的组织机构形式、项目经理部的工作岗位设置及其职责划分、新技术、新工艺部署及其技术和管理要求（如有）、主要分包工程施工单位的选择要求及管理方式（如有）等各项安排科学、合理、针对性强 2.8-3分；合理、可行 2.1-2.7分；欠合理，基本可行 1-2.0分；不可行，不能满足工程需要得 0分。"
    },
    "施工进度计划": {
      "评分": "3分",
      "要求": "网络图或横道图安排的进度计划科学、合理、针对性强 2.8-3分；合理、可行 1.6-2.7分；欠合理，基本可行 1-1.5分；不可行，不能满足招标文件要求 0分。"
    },
    "施工准备与资源配置计划": {
      "评分": "2分",
      "要求": "技术准备、现场准备和资金准备；资源配置计划包括劳动力配置计划、主要工程材料和设备配置计划、主要周转材料和施工机具配置计划等内容完备，合理、针对性强 1.9-2分；内容完备，可行 1.3-1.8分；内容欠完备，基本可行 0.8-1.2分；不可行 0分。"
    },
    "主要施工方案": {
      "评分": "5分",
      "要求": "主要分部、分项工程施工方案合理、可行；脚手架工程、起重吊装工程、临时用水用电工程、季节性施工等专项工程的施工方案（如有）有必要的验算和说明；对易发生质量通病、易出现安全问题、施工难度大、技术含量高的分项工程（工序）等有重点说明。科学、合理、针对性强 4.6-5分；合理、可行 3.1-4.5；欠合理，基本可行 2.0-3.0；不可行，不能满足工程需要 0分。"
    },
    "施工现场平面布置": {
      "评分": "2分",
      "要求": "拟建建（构）筑物的位置轮廓、尺寸、层数；加工设施、存储设施、办公和生活用房的位置和面积；布置垂直运输设施、供电设施、供水、供热设施、排水排污设施和临时施工道路；必备的安全、消防、保卫和环境保护设施；相邻的地上地下既有建（构）筑物及相关环境等要素齐全，现场布置合理 1.9-2分；现场布置可行 1.3-1.8分；现场布置基本可行 0.8-1.2分；现场布置不可行 0分。"
    },
    "主要施工管理计划": {
      "评分": "4分",
      "要求": "进度管理计划、质量管理计划、安全管理计划、环境管理计划、成本管理计划、治安保卫管理计划、合同管理计划，组织协调管理计划、成品保护管理计划、质量保修管理计划、人力资源、施工机具、材料设备等管理计划内容完备，合理、针对性强 3.5-4分；内容基本完备，可行 2.7-3.4分；内容欠完备，基本可行 2.0-2.6分；不可行 0分。"
    }
  },
  "商务评分": {
    "项目经理资格": {
      "学历": {
        "评分": "1分",
        "要求": "大学专科及以上 1分；中专 0.5分；其他 0分。"
      }
    },
    "技术负责人资格": {
      "职称": {
        "评分": "1分",
        "要求": "相关专业中级职称（工程师） 1分；其他 0分。"
      },
      "学历": {
        "评分": "1分",
        "要求": "大学专科及以上 1分；中专 0.5分；其他 0分。"
      }
    },
    "其他主要人员": {
      "评分": "2分",
      "要求": "人员配备合理，满足需要 1.6-2分；人员配备基本合理,基本满足需要 0.1-1.5分；人员配备不合理 0分。"
    },
    "认证体系": {
      "评分": "3分",
      "要求": "投标人具有有效的 IS09001(GB/T19001)质量管理体系认证证书、IS014001(GB/T24001)环境管理体系认证证书、OHSAD18001(GB/T28001)职业健康安全管理体系认证证书的，一项得 1分，三个得 3分(须提供认证证书的扫描件或复印件及在“全国认证认可信息公共服务平台”查询结果截图，未提供或不符合要求不得分。)"
    },
    "工程奖项": {
      "评分": "3分",
      "要求": "投标人企业近三年（2018-2020）连续获得过县级及以上行政主管部门表彰的得 3分；其他不得分。"
    },
    "信用": {
      "评分": "1分",
      "要求": "在信用中国查询未受到行政监督部门行政处罚的得 1分，被行政监督部门作出行政处罚的得 0分。提供网上查询截图（以开标当天“信用中国”网站查询为准），否则不得分。(行政处罚是指通过在“信用中国”查询投标人有行政处罚记录信息且在公示期内)"
    }
  },
  "投标报价评审标准": {
    "有效投标报价确定": {
      "评分": "未知",
      "要求": "低于招标控制价。"
    },
    "有效投标报价的排序": {
      "评分": "未知",
      "要求": "有效投标报价由低到高排序。"
    },
    "投标文件基础数据分析和整理": {
      "评分": "未知",
      "要求": "清标报告。"
    },
    "投标报价成本分析": {
      "评分": "未知",
      "要求": "清标报告。"
    },
    "投标竞争下浮率": {
      "评分": "未知",
      "要求": "取值：1%（投标竞争下浮率的值为整数，具体由招标人结合项目特点和需要在招标文件中明确）。"
    },
    "投标报价最终排序": {
      "评分": "未知",
      "要求": "有效投标报价由低到高排序。"
    },
    "投标报价不合格的情形": {
      "评分": "未知",
      "要求": "投标报价不合格的情形仅限于投标报价高于最高投标限价或低于成本的情形。投标报价文件评定为“合格”的，评标委员会应列出报价组成不合理、不平衡报价、签订合同前应注意和澄清的事项。评标委员会在不改变投标人投标文件实质性内容的前提下，对投标文件已标价工程量清单进行符合性检查、算术错误修正，对其报价进行合理性分析，投标报价一致或呈规律性差异，评标委员会应当进行比较分析，判定应当否决投标，并形成清标报告。"
    }
  }
 }
-def compare_headings(current, new):
+def remove_unknown_scores(data):
-    current_nums = [int(num) for num in current.split('.') if num.isdigit()]
+  if isinstance(data, dict):
-    new_nums = [int(num) for num in new.split('.') if num.isdigit()]
+    return {
-
+      k: remove_unknown_scores(v)
-    for c, n in zip(current_nums, new_nums):
+      for k, v in data.items()
-        if n > c:
+      if not (k == "评分" and v in ["未知", "/", ""])
-            return True
+    }
-        elif n < c:
+  elif isinstance(data, list):
-            return False
+    return [remove_unknown_scores(item) for item in data]
-
+  else:
    return len(new_nums) > len(current_nums)
 def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the text gathered so far.

    The pieces in ``content`` are concatenated and stripped.  The result is
    True when that text contains any entry of ``keywords`` or is at most
    ``max_length`` characters long; otherwise it is False.
    """
    joined = ''.join(content).strip()
    # Keyword hits are checked first, then the length limit.
    for marker in keywords:
        if marker in joined:
            return True
    return len(joined) <= max_length
 def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append ``line_content`` to ``current_content``, honouring a pending newline.

    When ``append_newline`` is truthy and ``should_add_newline`` approves the
    text collected so far, a newline entry is added before the new line.  The
    list is modified in place.

    Returns the updated flag: False once a pending newline request has been
    considered, otherwise the falsy value that was passed in.
    """
    if append_newline and should_add_newline(current_content, keywords):
        current_content.append('\n')
    current_content.append(line_content)
    # A pending request is consumed whether or not the newline was inserted.
    return False if append_newline else append_newline
 # def parse_text_by_heading(text):
 #     keywords = ['包含', '以下']
 #     data = {}
 #     current_key = None
 #     current_content = []
 #     append_newline = False
 #     skip_subheadings = False
 #
 #     lines = text.split('\n')
 #     for i, line in enumerate(lines):
 #         line_stripped = line.strip().replace('．', '.')
 #         # print(line_stripped)
 #         # 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号
 #         match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
 #         if not match:
 #             #匹配一级标题'1.'
 #             match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
 #
 #         if match:
 #             new_key, line_content = match.groups()     #new_key:1.1   line_content:本次竞争性磋商文件仅适用于本次竞争性磋商公告中所叙述的广水农商行门禁控制主机及基础验证设备采购项
 #             line_content = line_content.lstrip('.．、,')   #删除左边的若干个.．、,
 #
 #             # 检查是否包含关键词，决定是否跳过子标题
 #             if any(keyword in line_content for keyword in keywords):
 #                 skip_subheadings = True
 #             else:
 #                 skip_subheadings = False
 #
 #             # 检查是否应该更新当前键和内容
 #             if current_key is None or (compare_headings(current_key, new_key) and (
 #                     len(current_content) == 0 or current_content[-1][-1] != '第')):
 #                 if current_key is not None:
 #                     content_string = ''.join(current_content).strip()
 #                     data[current_key] = content_string.replace(' ', '')
 #                 current_key = new_key
 #                 current_content = [line_content]
 #                 append_newline = len(new_key.rstrip('.').split('.')) <= 2
 #             else:
 #                 append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
 #         else:
 #             if not skip_subheadings:
 #                 # 匹配中文数字标题，包括带括号和不带括号的情况
 #                 pattern_title = re.compile(
 #                     r'^\s*(?:[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
 #                 chinese_match = pattern_title.match(line_stripped)
 #
 #                 # 匹配字母标题，如 B.磋商说明
 #                 letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
 #
 #                 # 匹配阿拉伯数字开头后跟顿号的情况，如 "7、竞争性磋商采购文件的修改"
 #                 arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
 #
 #                 if chinese_match or letter_match or arabic_match:
 #                     if current_key is not None:
 #                         content_string = ''.join(current_content).strip()
 #                         data[current_key] = content_string.replace(' ', '')
 #                     if chinese_match:
 #                         pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
 #                         pattern_value = re.compile(r'[^\s、）)]+')
 #                         chinese_key_match = pattern_key.search(line_stripped)
 #                         chinese_value_match = pattern_value.search(
 #                             line_stripped[chinese_key_match.end():]) if chinese_key_match else None
 #                         if chinese_key_match and chinese_value_match:
 #                             current_key = chinese_key_match.group()
 #                             current_content = [chinese_value_match.group()]
 #                     elif letter_match:
 #                         letter_key, letter_value = letter_match.groups()
 #                         # 将字母转换为对应的中文数字
 #                         letter_to_chinese = {
 #                             'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
 #                             'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
 #                             'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
 #                         }
 #                         current_key = letter_to_chinese.get(letter_key, letter_key)
 #                         current_content = [letter_value]
 #                     elif arabic_match:
 #                         arabic_key, arabic_value = arabic_match.groups()
 #                         current_key = arabic_key.replace('、', '.')
 #                         current_content = [arabic_value]
 #
 #                     append_newline = True
 #                     continue
 #
 #             if line_stripped:
 #                 append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
 #
 #     if current_key is not None:
 #         content_string = ''.join(current_content).strip()
 #         data[current_key] = content_string.replace(' ', '')
 #
 #     return data
 def parse_text_by_heading(text):
    """Split clause text into a dict keyed by numeric heading.

    A line opens a new entry when it starts with a dotted number such as
    '1.1' or '2.2.3', or with a top-level number such as '1.'.  Later lines
    are appended to the current entry until a heading that
    ``compare_headings`` accepts as newer appears.  Spaces are removed from
    the stored values.

    Relies on the module-level ``re``, ``compare_headings`` and
    ``handle_content_append`` names.

    Args:
        text: Raw text; it is split on newline characters.

    Returns:
        dict mapping heading strings (e.g. '1.1', '3.') to their content.
    """
    keywords = ['包含', '以下']
    data = {}
    current_key = None
    current_content = []
    append_newline = False
    # Lines are handled one at a time, so stored pieces carry no line break of their own.
    for line in text.split('\n'):
        # Normalise the full-width dot so both dot styles match the patterns below.
        line_stripped = line.strip().replace('．', '.')
        # Headings with at least one dot between numbers: '1.1', '2.2.3', ...
        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
        if not match:
            # Top-level headings written as '1.' followed by some text.
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
        if not match:
            if line_stripped:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
            continue
        new_key, line_content = match.groups()
        line_content = line_content.lstrip('.．')
        # When the previous piece ends with '第', the number is presumably the tail of a
        # wrapped cross-reference ('...第' / '23.1 条...') and must not open a new entry.
        # endswith() replaces indexing [-1][-1], which raised IndexError when the previous
        # piece was an empty string (a heading line that carried no text).
        continues_reference = bool(current_content) and current_content[-1].endswith('第')
        if current_key is None or (compare_headings(current_key, new_key) and not continues_reference):
            if current_key is not None:
                # Flush the entry collected so far before starting the next one.
                data[current_key] = ''.join(current_content).strip().replace(' ', '')
            current_key = new_key
            current_content = [line_content]
            # Request a newline after headings of at most two levels ('1.' or '1.1').
            append_newline = len(new_key.rstrip('.').split('.')) <= 2
        else:
            # Not a newer heading: keep the whole line as body text of the current entry.
            append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
    if current_key is not None:
        # Flush the final entry.
        data[current_key] = ''.join(current_content).strip().replace(' ', '')
    return data
 def combine_technical_and_business(data, target_values):
  data = remove_unknown_scores(data)
  extracted_data = {}  # 根级别存储所有数据
  technical_found = False
  business_found = False
-# 测试文本
+  def extract_nested(data, parent_key='', is_technical=False, is_business=False):
-text = """
+    nonlocal technical_found, business_found
-22.3 从提交响应文件截止时间至磋商有效期期满这段时间，供应商不得修改第
+    if isinstance(data, dict):
-23.1 条的内容
+      for key, value in data.items():
-"""
+        current_key = f"{parent_key}.{key}" if parent_key else key
-res = parse_text_by_heading(text)
+        # 检查是否为技术标的内容
-print(json.dumps(res, ensure_ascii=False, indent=4))
+        if any(target in key for target in target_values):
          if not is_technical:
            extracted_data[key] = value
            technical_found = True
            continue
        # 默认其他所有内容都归为商务标
        else:
          if not is_business:
            if '商务评分' not in extracted_data:
              extracted_data['商务评分'] = {}
            extracted_data['商务评分'][key] = value
            business_found = True
            continue
        if isinstance(value, dict) or isinstance(value, list):
          extract_nested(value, current_key, is_technical, is_business)
    elif isinstance(data, list):
      for index, item in enumerate(data):
        extract_nested(item, f"{parent_key}[{index}]", is_technical, is_business)
  extract_nested(data)
  if not technical_found:
    extracted_data['技术评分'] = ''
  if not business_found:
    extracted_data['商务评分'] = ''
  return extracted_data
 # Keys containing any of these substrings are routed to the technical-score section.
 target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"]
 evaluation_standards_res=combine_technical_and_business(data,target_values1)
 # Pull the "技术评分" and "商务评分" sections out of the combined result (empty dict when absent).
 technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
 commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
 # Print both sections as indented JSON, keeping non-ASCII characters readable.
 print(json.dumps(technical_standards,ensure_ascii=False,indent=4))
 print(json.dumps(commercial_standards, ensure_ascii=False, indent=4))
--- a/flask_app/main/商务标技术标整合.py
+++ b/flask_app/main/商务标技术标整合.py
@ -53,8 +53,19 @@ from flask_app.main.通义千问long import upload_file, qianwen_long
 #         extracted_data['商务标'] = ''
 #
 #     return extracted_data
-
+def remove_unknown_scores(data):
    if isinstance(data, dict):
        return {
            k: remove_unknown_scores(v)
            for k, v in data.items()
            if not (k == "评分" and v in ["未知", "/", ""])
        }
    elif isinstance(data, list):
        return [remove_unknown_scores(item) for item in data]
    else:
        return data
 def combine_technical_and_business(data, target_values):
  data=remove_unknown_scores(data)
  extracted_data = {}  # 根级别存储所有数据
  technical_found = False
  business_found = False
@ -99,11 +110,22 @@ def combine_technical_and_business(data, target_values):
 def combine_evaluation_standards(truncate2):
    # 商务标、技术标评分项：千问
    file_id = upload_file(truncate2)
    user_query_2 = (
        "根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，若对应内容中存在其他信息，在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它的评分要求。请以json格式返回结果，不要回答有关形式、资格、响应性评审标准的内容")
    # user_query_2 = (
-    #         "根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，请以json格式返回结果，请在这三大块评分中分别用若干键值对表示评分要求，最内层的键名为'分数'及'要求'，若这三大块评分中存在其他信息，则在相应评分大块中新增嵌套键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它的评分要求。不要回答有关形式、资格、响应性评审标准的内容")
+    #     "根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，若对应内容中存在其他信息，在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它的评分要求。请以json格式返回结果，不要回答有关形式、资格、响应性评审标准的内容")
    user_query_2 = (
           """
           根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，请以json格式返回结果，请在这三大块评分中分别用若干键值对表示具体要求，其内层的键名为'评分'及'要求'，若这三大块评分中存在其他信息，则在相应评分大块中新增键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它们的具体评分要求。不要回答有关形式、资格、响应性评审标准的内容，若存在未知信息，填充'未知'。以下为示例输出：
           "技术评分": {
            "主要监理岗位的职责": {
                "评分": "4分",
                "要求": "1、总监理工程师的职责全面、清晰、合理得 1.2-2分；一般的1.2分。2、其他主要监理人员及岗位的职责全面、清晰、合理得 1.2-2分；一般的 1.2分。"
             }
            }
           """
           )
    evaluation_res = qianwen_long(file_id, user_query_2)
    print(evaluation_res)
    target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"]
    # target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
    # update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2)
@ -112,7 +134,7 @@ def combine_evaluation_standards(truncate2):
    # return evaluation_combined_res
    return update_json              #商务标技术标整合
 if __name__ == "__main__":
-    truncate2="C:\\Users\\Administrator\\Desktop\\fsdownload\d2a70f3d-5648-4971-9c1f-87d877285304\\ztbfile_evaluation_method.pdf"
+    truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\test4\\招标02_evaluation_method.pdf"
    evaluation_standards_res=combine_evaluation_standards(truncate2)
    # 从结果中提取"商务标"和"技术标"
    technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -265,8 +265,8 @@ def truncate_pdf_multiple(input_path, output_folder):
 if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
-    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest17.pdf"
+    input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标02.pdf"
-    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp"
+    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\test4"
    truncate_pdf_multiple(input_path,output_folder)
    # selection = 3 # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
--- a/flask_app/main/读取文件/按页读取pdf.py
+++ b/flask_app/main/读取文件/按页读取pdf.py
@ -146,6 +146,7 @@ def extract_text_by_page(file_path):
 if __name__ == '__main__':
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
-    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
+    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\竞争性谈判文件(3)_tobidders_notice_part2.pdf'
    res=extract_text_by_page(file_path)
    # print(res)磋商文件_tobidders_notice_part2.pdf
--- a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py
+++ b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py
@ -219,7 +219,7 @@ def extract_from_notice(clause_path, type):
    if type == 1:
        target_values = ["投标文件","响应文件","响应性文件"]
    elif type == 2:
-        target_values = ["开标", "评标", "定标","磋商程序","中标","程序"]
+        target_values = ["开标", "评标", "定标","磋商程序","中标","程序","步骤"]
    elif type == 3:
        target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
    elif type == 4:
@ -237,7 +237,7 @@ def extract_from_notice(clause_path, type):
 #TODO: 再审视一下zbtest20的处理是否合理
 if __name__ == "__main__":
-    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause26.json'
+    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\'
    try:
        res = extract_from_notice(file_path, 1)  # 可以改变此处的 type 参数测试不同的场景
        res2=json.dumps(res,ensure_ascii=False,indent=4)
--- a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
+++ b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
@ -86,13 +86,6 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
 #
 #     return full_text
 def chinese_to_arabic(chinese_num):
    """Translate a Chinese numeral string from '一' to '十五' into its integer value.

    Unrecognised numerals yield 0.
    """
    # Ordered so that each numeral's position (starting at 1) is its value.
    numerals = (
        '一', '二', '三', '四', '五',
        '六', '七', '八', '九', '十',
        '十一', '十二', '十三', '十四', '十五',
    )
    value_by_numeral = {numeral: value for value, numeral in enumerate(numerals, start=1)}
    if chinese_num in value_by_numeral:
        return value_by_numeral[chinese_num]
    return 0
 def compare_headings(current, new):
    # 使用过滤来确保只处理非空且为数字的部分
@ -141,6 +134,7 @@ def parse_text_by_heading(text):
    current_content = []
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    lines = text.split('\n')
    for i, line in enumerate(lines):
@ -149,6 +143,9 @@ def parse_text_by_heading(text):
        if not match:
            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
        # 新增：处理以点号开头的情况
        dot_match = re.match(r'^\.(\d+)\s*(.+)$', line_stripped)
        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('.．、,')
@ -172,6 +169,20 @@ def parse_text_by_heading(text):
                current_key = new_key
                current_content = [line_content]
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                last_main_number = new_key.split('.')[0]
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        elif dot_match:
            # 处理以点号开头的情况
            sub_number, line_content = dot_match.groups()
            if last_main_number:
                new_key = f"{last_main_number}.{sub_number}"
                if current_key is not None:
                    content_string = ''.join(current_content).strip()
                    data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
        else:
@ -190,12 +201,12 @@ def parse_text_by_heading(text):
                        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
                    if current_key_chinese is not None:
                        data[current_key_chinese] = current_value_chinese
-                        current_key_chinese = None
+                    current_key_chinese = None
                    # 处理中文标题
                    if chinese_match:
                        current_key_chinese = chinese_match.group(1) or chinese_match.group(2)
-                        current_value_chinese = line_stripped[chinese_match.end():].lstrip('.．、,')
+                        current_value_chinese = line_stripped[chinese_match.end():].lstrip('.．、,').replace(' ', '')
                        if current_key_chinese in data:
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
@ -208,7 +219,7 @@ def parse_text_by_heading(text):
                            'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
                        }
                        current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
-                        current_value_chinese = letter_value.lstrip('.．、,')
+                        current_value_chinese = letter_value.lstrip('.．、,').replace(' ', '')
                        if current_key_chinese in data:
                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
                            current_key_chinese = None
@ -217,7 +228,8 @@ def parse_text_by_heading(text):
                        arabic_key, arabic_value = arabic_match.groups()
                        current_key = arabic_key.replace('、', '.')
                        current_content = [arabic_value]
-                    append_newline = True
+                        append_newline = True
                        last_main_number = current_key.rstrip('.')
                    continue
            # 如果没有匹配标题，继续处理正文内容
@ -262,8 +274,8 @@ def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
-    file_name = "clause1.json" if type == 1 else "clause2.json"
+    # file_name = "clause1.json" if type == 1 else "clause2.json"
-    # file_name = f"clause{suffix_counter}.json"
+    file_name = f"clause{suffix_counter}.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
@ -275,39 +287,34 @@ def process_folder(input_folder, output_folder):
    # 获取输入文件夹中的所有文件，过滤掉不以 'part2' 结尾的文件
    files = [f for f in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, f)) and f.endswith('part2.pdf')]
    # 初始化后缀计数器
    suffix_counter = 1
    # 遍历文件并对每个文件进行处理
    for file_name in files:
        file_path = os.path.join(input_folder, file_name)
-        suffix = str(suffix_counter)  # 后缀为递增的数字
+        # 去掉文件扩展名
        file_name_without_extension = os.path.splitext(file_name)[0]
        try:
            # 调用 convert_clause_to_json，传递文件路径、输出文件夹和递增的后缀
-            output_path = convert_clause_to_json(file_path, output_folder, 1, suffix)
+            output_path = convert_clause_to_json(file_path, output_folder, 1, file_name_without_extension)
            print(f"Processed file: {file_name}, JSON saved to: {output_path}")
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")
        # 后缀递增
        suffix_counter += 1
 #TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
 #TODO:标题'一'应该越来越大，与'1.1'的逻辑分开；
 #TODO:911904c3-4d6e-47e0-acd8-b05e582209cb文件夹运行有问题，百炼出错了
 if __name__ == "__main__":
-    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
+    # # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
+    # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\竞争性谈判文件(3)_tobidders_notice_part2.pdf'
-    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
+    # # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
-    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2'
+    # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2'
-    try:
+    # try:
-        output_path = convert_clause_to_json(file_path,output_folder)
+    #     output_path = convert_clause_to_json(file_path,output_folder)
-        print(f"Final JSON result saved to: {output_path}")
+    #     print(f"Final JSON result saved to: {output_path}")
-    except ValueError as e:
+    # except ValueError as e:
-        print("Error:", e)
+    #     print("Error:", e)
-    # input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
+    input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
-    # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
+    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
-    #
+
-    # # 调用 process_folder 来处理整个文件夹中的所有文件
+    # 调用 process_folder 来处理整个文件夹中的所有文件
-    # process_folder(input_folder, output_folder)
+    process_folder(input_folder, output_folder)