9.27

2024-09-27 18:07:34 +08:00 · 2024-09-27 18:07:34 +08:00 · 2c036d8504
commit 2c036d8504
parent 1bc628fcf2
4 changed files with 216 additions and 61 deletions
--- a/flask_app/main/json_utils.py
+++ b/flask_app/main/json_utils.py
@ -1,11 +1,11 @@
 import json
 import re

-def extract_content_from_json(json_data):
-    """提取 { 和 } 之间的内容，并将其解析为字典"""
-    if not json_data.strip():
+def extract_content_from_json(string):
+    """输入字符串，提取 { 和 } 之间的内容，并将其解析为字典"""
+    if not string.strip():
        return {}
-    match = re.search(r'\{[\s\S]*\}', json_data)
+    match = re.search(r'\{[\s\S]*\}', string)
    if match:
        try:
            json_data = match.group(0)
--- a/flask_app/main/ttt.py
+++ b/flask_app/main/ttt.py
@ -1,7 +1,210 @@
-dict1 = {"a": {}}
-dict2 = {"b": {}}
+import json
+import docx
+import re
+import os
+from PyPDF2 import PdfReader
+from flask_app.main.截取pdf import clean_page_content,extract_common_header

-# 使用 update() 方法
-dict1.update(dict2)
+def extract_text_from_docx(file_path):
+    """Return the text of every paragraph of a .docx file, joined by newlines."""
+    paragraphs = docx.Document(file_path).paragraphs
+    return '\n'.join([paragraph.text for paragraph in paragraphs])

-print(dict1)
+
+def extract_text_from_pdf(file_path):
+    """Extract the cleaned text of every page of a PDF as a single string.
+
+    The header returned by ``extract_common_header`` is computed once for the
+    file and handed to ``clean_page_content`` together with each page's text.
+    Every cleaned page is followed by a newline in the result.
+    """
+    common_header = extract_common_header(file_path)
+    cleaned_pages = []
+    for page in PdfReader(file_path).pages:
+        # Call extract_text() only once per page and fall back to an empty
+        # string when it yields nothing.
+        page_text = page.extract_text() or ""
+        cleaned_pages.append(clean_page_content(page_text, common_header) + "\n")
+    # Join once at the end instead of growing a string page by page.
+    return "".join(cleaned_pages)
+
+def extract_section(text, start_pattern, end_phrases):
+    """Return the part of ``text`` between a start pattern and an end pattern.
+
+    The section begins right after the first match of ``start_pattern``; an
+    empty string is returned when that pattern does not match at all. The end
+    patterns are tried in list order against the text that follows the start
+    match (with ``re.MULTILINE``), and the first pattern that matches anywhere
+    marks the end of the section. When none matches, everything up to the end
+    of ``text`` is returned.
+    """
+    opening = re.search(start_pattern, text)
+    if opening is None:
+        return ""
+
+    # Everything after the start match; end patterns are searched in here only.
+    remainder = text[opening.end():]
+
+    # Lazily evaluated, so patterns after the first hit are never searched.
+    hits = (re.search(phrase, remainder, flags=re.MULTILINE) for phrase in end_phrases)
+    closing = next((hit for hit in hits if hit), None)
+    if closing is None:
+        return remainder
+    return remainder[:closing.start()]
+
+def compare_headings(current, new):
+    """Tell whether heading number ``new`` comes after ``current``.
+
+    Both headings are split on '.', and only the purely numeric pieces are kept
+    and converted to int. The resulting sequences are compared position by
+    position; when one is a prefix of the other, the longer one (a deeper
+    sub-heading) counts as later. Equal headings return False.
+    """
+    def numeric_parts(heading):
+        return [int(piece) for piece in heading.split('.') if piece.isdigit()]
+
+    current_parts = numeric_parts(current)
+    new_parts = numeric_parts(new)
+    # List comparison is lexicographic, and a longer list beats its own prefix.
+    return new_parts > current_parts
+
+
+def should_add_newline(content, keywords, max_length=20):
+    """Decide whether a line break should follow the text collected so far.
+
+    ``content`` is an iterable of string pieces; they are concatenated and
+    stripped. The answer is True when the result contains any of ``keywords``
+    or is at most ``max_length`` characters long.
+    """
+    joined = ''.join(content).strip()
+    for keyword in keywords:
+        if keyword in joined:
+            return True
+    return len(joined) <= max_length
+
+def handle_content_append(current_content, line_content, append_newline, keywords):
+    """Append ``line_content`` to ``current_content`` in place and return the updated flag.
+
+    When ``append_newline`` is truthy the flag is consumed: a newline element
+    is inserted before the new piece if ``should_add_newline`` approves of the
+    text collected so far, and False is returned. Otherwise the piece is simply
+    appended and the flag is returned unchanged.
+    """
+    next_flag = append_newline
+    if append_newline:
+        # The pending line break applies to this append only.
+        next_flag = False
+        if should_add_newline(current_content, keywords):
+            current_content.append('\n')
+    current_content.append(line_content)
+    return next_flag
+
+# Headings whose key has exactly two dot-separated parts (e.g. '1.1', and also
+# '1.') get extra handling: the line that follows is preceded by a line break
+# when the text collected so far contains one of the keywords or is at most 20
+# characters long; otherwise it is joined without a break.
+def parse_text_by_heading(text):
+    """Split clause text into a dict mapping heading numbers to their content.
+
+    A line starts a new section when it begins with a dotted number such as
+    '1.1' / '2.2.3' (or a number followed by a dot and some text, such as
+    '1. xxx'), that number sorts after the current heading according to
+    ``compare_headings``, and the text collected so far does not end with '第'.
+    A numbered line that fails those checks has its text (without the number)
+    appended to the current section; non-empty unnumbered lines are appended
+    as they are. Lines seen before the first heading are discarded once that
+    heading is found.
+
+    Stored values have surrounding whitespace stripped and every space removed.
+
+    Returns:
+        dict: heading key (e.g. '1.1' or '1.') -> section text.
+    """
+    keywords = ['包含', '以下']
+    data = {}
+    current_key = None
+    current_content = []
+    append_newline = False
+
+    for line in text.split('\n'):
+        line_stripped = line.strip()
+        # Dotted headings such as '1.1' or '2.2.3'. The lookbehind right after
+        # '^' cannot fail at the start of the string, so it has no effect here.
+        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
+        if not match:
+            # First-level headings such as '1. xxx' (text after the dot is required).
+            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
+
+        if not match:
+            if line_stripped:
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+            continue
+
+        new_key, line_content = match.groups()
+        line_content = line_content.lstrip('.')
+        # A number directly after a trailing '第' is treated as running text
+        # (presumably a split clause reference), not as a heading. endswith()
+        # is used so an empty previous piece (a heading number alone on its
+        # line) no longer raises IndexError.
+        after_di = bool(current_content) and current_content[-1].endswith('第')
+        if current_key is None or (compare_headings(current_key, new_key) and not after_di):
+            if current_key is not None:
+                # Store the finished section: strip the ends, drop all spaces.
+                data[current_key] = ''.join(current_content).strip().replace(' ', '')
+            current_key = new_key
+            current_content = [line_content]
+            # Only two-part keys request a line break before the next line.
+            append_newline = len(new_key.split('.')) == 2
+        else:
+            append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
+
+    if current_key is not None:
+        # Store the last section.
+        data[current_key] = ''.join(current_content).strip().replace(' ', '')
+
+    return data
+
+def convert_to_json(file_path, start_word, end_phrases):
+    """Read a .docx or .pdf file and parse one section of it into a heading dict.
+
+    The text is loaded with the reader that matches the file extension, cut
+    down with ``extract_section(text, start_word, end_phrases)`` and then
+    parsed by ``parse_text_by_heading``.
+
+    Raises:
+        ValueError: if ``file_path`` ends with neither '.docx' nor '.pdf'.
+    """
+    if file_path.endswith('.docx'):
+        reader = extract_text_from_docx
+    elif file_path.endswith('.pdf'):
+        reader = extract_text_from_pdf
+    else:
+        raise ValueError("Unsupported file format")
+
+    section = extract_section(reader(file_path), start_word, end_phrases)
+    return parse_text_by_heading(section)
+
+def convert_clause_to_json(input_path,output_folder,type=1):
+    """Extract numbered clauses from a tender document and save them as JSON.
+
+    Args:
+        input_path: path of the .docx / .pdf file to read.
+        output_folder: folder that receives the JSON file; created if missing.
+        type: 1 extracts the section starting at "投标人须知正文" into
+            'clause1.json'; any other value extracts the section starting at
+            the tender-announcement patterns into 'clause2.json'. (The name
+            shadows the builtin but is kept for existing keyword callers.)
+
+    Returns:
+        The path of the written JSON file, or "" when ``input_path`` does not exist.
+
+    Raises:
+        ValueError: propagated from ``convert_to_json`` for unsupported file types.
+    """
+    if not os.path.exists(input_path):
+        print(f"The specified file does not exist: {input_path}")
+        return ""
+    if type == 1:
+        start_word = "投标人须知正文"
+        end_phrases = [
+            r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
+            r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
+        ]
+    else:
+        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号：'
+        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']
+    result = convert_to_json(input_path, start_word, end_phrases)
+    file_name = "clause1.json" if type == 1 else "clause2.json"
+    # Make sure the target folder exists so the parsed result is not lost to a
+    # FileNotFoundError at write time.
+    if output_folder:
+        os.makedirs(output_folder, exist_ok=True)
+    output_path = os.path.join(output_folder, file_name)
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(result, f, indent=4, ensure_ascii=False)
+    # Split over-long first-level entries in the file that was just written.
+    post_process_json(output_path)
+    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
+    return output_path
+
+def post_process_json(json_file_path):
+    """Split over-long first-level entries (keys such as '5.') of a clause JSON file in place.
+
+    For every key of the form '<digits>.' whose value contains a newline, the
+    text before the first newline is the title and the rest is the body. When
+    the body contains numbered items ('1.' / '1、'), the entry is replaced by:
+
+    * '<n>.'      -> the title
+    * '<n>.0'     -> the title, followed by any body text that precedes the first item
+    * '<n>.0.<k>' -> the k-th numbered item, counted from 1 in order of appearance
+
+    Every other entry, including first-level entries whose body has no numbered
+    items, is copied unchanged so that no text is lost. The result overwrites
+    ``json_file_path``.
+    """
+    with open(json_file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+
+    processed_data = {}
+
+    for key, value in data.items():
+        # Only first-level keys (like '5.') with a multi-line value are split.
+        if not (re.match(r'^\d+\.\s*$', key) and '\n' in value):
+            processed_data[key] = value
+            continue
+
+        title, content = value.split('\n', 1)
+        title = title.strip()
+
+        # With one capture group, re.split returns
+        # [leading text, marker, item, marker, item, ...].
+        sub_sections = re.split(r'(\d+[\.\、])\s*', content)
+        if len(sub_sections) == 1:
+            # No numbered items: nothing to split, keep the entry as it is.
+            processed_data[key] = value
+            continue
+
+        leading_text = sub_sections[0].strip()
+        sub_key = f"{key.rstrip('.')}." + "0"  # '5.' -> '5.0'
+
+        processed_data[key] = title
+        # Keep text found before the first numbered item instead of dropping it.
+        processed_data[sub_key] = f"{title}\n{leading_text}" if leading_text else title
+
+        # Items sit at indexes 2, 4, ...; they are renumbered from 1 regardless
+        # of the number written in the text.
+        for sub_count, sub_content in enumerate(sub_sections[2::2], start=1):
+            processed_data[f"{sub_key}.{sub_count}"] = sub_content.strip()
+
+    # Write the modified data back to the same JSON file.
+    with open(json_file_path, 'w', encoding='utf-8') as file:
+        json.dump(processed_data, file, ensure_ascii=False, indent=4)
+
+if __name__ == "__main__":
+    # Manual test run; the paths below are hard-coded to a local Windows desktop.
+    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
+    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（广水市教育局封闭管理）_qualification1.pdf'
+    # Earlier variant of the start/end patterns, kept for reference:
+    # start_word = "投标人须知正文"
+    # end_phrases = [
+    #     r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
+    #     r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
+    # ]
+    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
+    try:
+        # type defaults to 1, so the result is written to clause1.json in output_folder.
+        output_path = convert_clause_to_json(file_path,output_folder)
+        print(f"Final JSON result saved to: {output_path}")
+    except ValueError as e:
+        # convert_to_json raises ValueError for files that are neither .docx nor .pdf.
+        print("Error:", e)
--- a/flask_app/货物标/test.py
+++ b/flask_app/货物标/test.py
@ -3,11 +3,6 @@ import re
 #这个字典可能有嵌套，你需要遍历里面的键名，对键名作判断，而不是键值，具体是这样的：如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除，重新组织成一个字典格式的数据，你可以考虑用字符串列表来保持部分平级的数据
 #对于同级的键，如果数量>1且键名都统一，那么将键名去掉，用列表保持它们的键值
 #对于同一个字典中，可能存在若干键值对，若它们的键值都是""或者"/" 你就将它们的键值删去，它们的键名用字符串列表保存
-def is_numeric_key(key):
-    # 这个正则表达式匹配由数字、点、括号中的数字或单个字母（小写或大写）组成的字符串，
-    # 字母后跟数字，或数字后跟字母，单个字母后跟点，但不能是字母-数字-字母的组合
-    pattern = r'^[\d.]+$|^\(\d+\)$|^（\d+）$|^[a-zA-Z]$|^[a-zA-Z]\d+$|^\d+[a-zA-Z]$|^[a-zA-Z]\.$'
-    return re.match(pattern, key) is not None

 #zbtest20也有问题
 def contains_number_or_index(key, value):
@ -47,56 +42,13 @@ def preprocess_dict(data):
        return [preprocess_dict(item) for item in data]
    else:
        return data
-def process_dict(data):
-    if not isinstance(data, dict):
-        return data

-    result = {}
-    numeric_keys = []
-    non_numeric_keys = {}
-
-    for key, value in data.items():
-        if is_numeric_key(key):
-            numeric_keys.append((key, value))
-        else:
-            non_numeric_keys[key] = value
-
-    if numeric_keys:
-        result['items'] = [process_dict(item[1]) for item in sorted(numeric_keys)]
-
-    for key, value in non_numeric_keys.items():
-        if isinstance(value, list):
-            processed_list = []
-            for item in value:
-                if isinstance(item, dict):
-                    # 处理字典中只有一个键值对的情况
-                    if len(item) == 1:
-                        processed_item = process_dict(list(item.values())[0])
-                    else:
-                        processed_item = process_dict(item)
-                else:
-                    processed_item = process_dict(item)
-
-                # 如果处理后的项是只包含一个元素的列表，则展平它
-                if isinstance(processed_item, list) and len(processed_item) == 1:
-                    processed_item = processed_item[0]
-
-                processed_list.append(processed_item)
-
-            result[key] = processed_list
-        else:
-            result[key] = process_dict(value)
-
-    if len(result) == 1 and 'items' in result:
-        return result['items']
-
-    return result

 # 测试代码
-#TODO:同一层全部都是数字才成功删除
+#TODO:同一层全部都是数字才成功删除,没需求了
 input_data = {
    "符合性审查": {
-        "说明": "1",
+        "说明": "1ha",
        "www":"哈哈",
        "审查标准": [
            {
--- a/flask_app/货物标/货物标截取pdf.py
+++ b/flask_app/货物标/货物标截取pdf.py
@ -388,9 +388,9 @@ def truncate_pdf_multiple(input_path, output_folder):
    return truncate_files


-# TODO:交通智能系统和招标(1)(1)文件有问题
+# TODO:交通智能系统和招标(1)(1)文件有问题；selection=4 的时候 exclusion 有问题
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
+    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
    # truncate_pdf_multiple(input_path,output_folder)
    selection = 4  # 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2  4.投标人须知前附表