9.28

2024-09-28 17:38:53 +08:00 · 2024-09-28 17:38:53 +08:00 · 6637086547
commit 6637086547
parent 2c036d8504
8 changed files with 271 additions and 312 deletions
--- a/flask_app/main/table_content_extraction.py
+++ b/flask_app/main/table_content_extraction.py
@ -107,6 +107,6 @@ def extract_tables_main(path, output_folder):


 if __name__ == "__main__":
-    path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\ztbfile_tobidders_notice_table.docx'
-    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp"  # 前附表json文件
+    path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件（一中多媒体报告厅教学设备）_tobidders_notice_part1.docx'
+    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp"  # 前附表json文件
    extract_tables_main(path, output_folder)
--- a/flask_app/main/ttt.py
+++ b/flask_app/main/ttt.py
@ -1,210 +0,0 @@
-import json
-import docx
-import re
-import os
-from PyPDF2 import PdfReader
-from flask_app.main.截取pdf import clean_page_content,extract_common_header
-
-def extract_text_from_docx(file_path):
-    doc = docx.Document(file_path)
-    return '\n'.join([para.text for para in doc.paragraphs])
-
-
-def extract_text_from_pdf(file_path):
-    # 从PDF文件中提取文本
-    common_header = extract_common_header(file_path)
-    pdf_document = PdfReader(file_path)
-    text = ""
-    # 遍历每一页
-    for page in pdf_document.pages:
-        # 提取当前页面的文本
-        page_text = page.extract_text() if page.extract_text() else ""
-        # 清洗页面文本
-        page_text = clean_page_content(page_text, common_header)
-        # 将清洗后的文本添加到总文本中
-        text += page_text+"\n"
-    return text
-
-def extract_section(text, start_pattern, end_phrases):
-    # 查找开始模式
-    start_match = re.search(start_pattern, text)
-    if not start_match:
-        return ""  # 如果没有找到匹配的开始模式，返回空字符串
-    start_index = start_match.end()  # 从匹配的结束位置开始
-
-    # 初始化结束索引为文本总长度
-    end_index = len(text)
-
-    # 遍历所有结束短语，查找第一个出现的结束短语
-    for phrase in end_phrases:
-        match = re.search(phrase, text[start_index:], flags=re.MULTILINE)
-        if match:
-            end_index = start_index + match.start()  # 更新结束索引为匹配到的开始位置
-            break  # 找到第一个匹配后立即停止搜索
-
-    # 提取并返回从开始模式后到结束模式前的内容
-    return text[start_index:end_index]
-
-def compare_headings(current, new):
-    # 使用过滤来确保只处理非空且为数字的部分
-    current_nums = [int(num) for num in current.split('.') if num.isdigit()]
-    new_nums = [int(num) for num in new.split('.') if num.isdigit()]
-
-    # 比较数字序列以确定标题的层次关系
-    for c, n in zip(current_nums, new_nums):
-        if n > c:
-            return True
-        elif n < c:
-            return False
-
-    # 如果新标题有更多层次，认为是新的子章节
-    return len(new_nums) > len(current_nums)
-
-
-def should_add_newline(content, keywords, max_length=20):
-    content_str = ''.join(content).strip()
-    return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
-
-def handle_content_append(current_content, line_content, append_newline, keywords):
-    if append_newline:
-        if should_add_newline(current_content, keywords):
-            current_content.append('\n')  # 添加换行符
-        append_newline = False
-    current_content.append(line_content)
-    return append_newline
-
-#对二级标题如x.x进行额外处理：如果当前处理内容包含keywords中的内容，则必须保留换行符/如果当前内容字数大于20，不保留换行。
-def parse_text_by_heading(text):
-    keywords = ['包含', '以下']
-    data = {}
-    current_key = None
-    current_content = []
-    append_newline = False
-
-    lines = text.split('\n')
-    for i, line in enumerate(lines):
-        line_stripped = line.strip()
-        # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题，并确保其前后没有字母或括号
-        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
-        if not match:
-            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
-
-        if match:
-            new_key, line_content = match.groups()
-            line_content = line_content.lstrip('.')
-            # 检查是否应该更新当前键和内容
-            if current_key is None or (compare_headings(current_key, new_key) and (
-                    len(current_content) == 0 or current_content[-1][-1] != '第')):
-                if current_key is not None:
-                    # 将之前的内容保存到data中，保留第一个换行符，后续的换行符转为空字符
-                    content_string = ''.join(current_content).strip()
-                    data[current_key] = content_string.replace(' ', '')
-                current_key = new_key
-                current_content = [line_content]
-                # 只有当标题级别为两级（如 1.1）时，才设置 append_newline 为 True
-                append_newline = len(new_key.split('.')) == 2
-            else:
-                append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
-        else:
-            if line_stripped:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
-
-    if current_key is not None:
-        # 保存最后一部分内容
-        content_string = ''.join(current_content).strip()
-        data[current_key] = content_string.replace(' ', '')
-
-    return data
-
-def convert_to_json(file_path, start_word, end_phrases):
-    if file_path.endswith('.docx'):
-        text = extract_text_from_docx(file_path)
-    elif file_path.endswith('.pdf'):
-        text = extract_text_from_pdf(file_path)
-    else:
-        raise ValueError("Unsupported file format")
-    # 提取从 start_word 开始到 end_phrases 结束的内容
-    text = extract_section(text, start_word, end_phrases)
-    # print(text)
-    parsed_data = parse_text_by_heading(text)
-    return parsed_data
-
-def convert_clause_to_json(input_path,output_folder,type=1):
-    if not os.path.exists(input_path):
-        print(f"The specified file does not exist: {input_path}")
-        return ""
-    if type==1:
-        start_word = "投标人须知正文"
-        end_phrases = [
-            r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
-            r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
-        ]
-    else:
-        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号：'
-        end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
-    result = convert_to_json(input_path, start_word, end_phrases)
-    file_name = "clause1.json" if type == 1 else "clause2.json"
-    output_path = os.path.join(output_folder, file_name)
-    with open(output_path, 'w', encoding='utf-8') as f:
-        json.dump(result, f, indent=4, ensure_ascii=False)
-    post_process_json(output_path)
-    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
-    return output_path
-
-def post_process_json(json_file_path):   #处理一级标题如'5.1'过长的内容 zbtest20
-    # 读取 JSON 文件
-    with open(json_file_path, 'r', encoding='utf-8') as file:
-        data = json.load(file)
-
-    processed_data = {}
-
-    for key, value in data.items():
-        # 检查是否是一级标题（如 '5.'），并且其值包含 '\n'
-        if re.match(r'^\d+\.\s*$', key) and '\n' in value:
-            # 分割标题和正文
-            title, content = value.split('\n', 1)
-
-            # 添加原来的标题作为 '5.0'，其值为原来标题的内容（即 title）
-            processed_data[key] = title.strip()
-            sub_key = f"{key.rstrip('.')}." + "0"  # 自动生成 '5.0'，与 '5.' 一致，保证点号的存在
-
-            processed_data[sub_key] = title.strip()
-
-            # 初始化计数器
-            sub_count = 1
-
-            # 根据子序号 '1.' 或 '1、' 进行分割
-            sub_sections = re.split(r'(\d+[\.\、])\s*', content)
-
-            current_sub_content = ""
-            for i in range(1, len(sub_sections), 2):
-                sub_number = sub_sections[i].strip()  # 获取子序号
-                sub_content = sub_sections[i + 1].strip()  # 获取内容
-
-                # 生成三级标题，如 '5.0.1', '5.0.2'
-                sub_key_with_number = f"{sub_key}.{sub_count}"
-                processed_data[sub_key_with_number] = sub_content
-                sub_count += 1
-
-        else:
-            # 如果没有分割需求，保留原数据
-            processed_data[key] = value
-
-    # 将修改后的数据重新写入到原来的 JSON 文件中
-    with open(json_file_path, 'w', encoding='utf-8') as file:
-        json.dump(processed_data, file, ensure_ascii=False, indent=4)
-
-if __name__ == "__main__":
-    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（广水市教育局封闭管理）_qualification1.pdf'
-    # start_word = "投标人须知正文"
-    # end_phrases = [
-    #     r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
-    #     r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
-    # ]
-    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
-    try:
-        output_path = convert_clause_to_json(file_path,output_folder)
-        print(f"Final JSON result saved to: {output_path}")
-    except ValueError as e:
-        print("Error:", e)
--- a/flask_app/main/投标人须知正文提取指定内容.py
+++ b/flask_app/main/投标人须知正文提取指定内容.py
@ -143,7 +143,7 @@ def extract_from_notice(clause_path, type):
        return transformed_data


-# 假设原始数据文件路径
+#TODO:考虑5.1这种二级标题后面换行符，但是仍然存在5.1.1情况时该怎么办  再审视一下zbtest20的处理是否合理
 if __name__ == "__main__":
    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json'
    # file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause1.json'
--- a/flask_app/main/投标人须知正文条款提取成json文件.py
+++ b/flask_app/main/投标人须知正文条款提取成json文件.py
@ -10,21 +10,53 @@ def extract_text_from_docx(file_path):
    return '\n'.join([para.text for para in doc.paragraphs])


-def extract_text_from_pdf(file_path):
+# def extract_text_from_pdf(file_path):
+#     # 从PDF文件中提取文本
+#     common_header = extract_common_header(file_path)
+#     pdf_document = PdfReader(file_path)
+#     text = ""
+#     # 遍历每一页
+#     for page in pdf_document.pages:
+#         # 提取当前页面的文本
+#         page_text = page.extract_text() if page.extract_text() else ""
+#         # 清洗页面文本
+#         page_text = clean_page_content(page_text, common_header)
+#         # 将清洗后的文本添加到总文本中
+#         text += page_text+"\n"
+#     return text
+def extract_text_from_pdf(file_path, start_word, end_pattern):
    # 从PDF文件中提取文本
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
-    text = ""
-    # 遍历每一页
-    for page in pdf_document.pages:
-        # 提取当前页面的文本
+    all_pages_text = []
+    start_index = None
+    # 处理所有页面
+    for i, page in enumerate(pdf_document.pages):
        page_text = page.extract_text() if page.extract_text() else ""
-        # 清洗页面文本
-        page_text = clean_page_content(page_text, common_header)
-        # 将清洗后的文本添加到总文本中
-        text += page_text+"\n"
-    return text
+        cleaned_text = clean_page_content(page_text, common_header)

+        # 在第一页查找开始位置
+        if i == 0 and start_index is None:
+            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
+            if start_match:
+                start_index = start_match.start()
+                cleaned_text = cleaned_text[start_index:]
+
+        # 在最后一页查找结束位置
+        if i == len(pdf_document.pages) - 1:
+            for pattern in end_pattern:
+                matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
+                if matches:
+                    end_index = matches[-1].start()
+                    cleaned_text = cleaned_text[:end_index]
+                    break
+
+        all_pages_text.append(cleaned_text)
+
+    # 合并所有页面的文本
+    full_text = "\n".join(all_pages_text)
+    # print(full_text)
+    return full_text
 def extract_section(text, start_pattern, end_phrases):
    # 查找开始模式
    start_match = re.search(start_pattern, text)
@ -80,9 +112,8 @@ def parse_text_by_heading(text):
    current_key = None
    current_content = []
    append_newline = False
-
    lines = text.split('\n')
-    for i, line in enumerate(lines):
+    for i, line in enumerate(lines):    #由于本身就是按行读取处理，因此保存的时候不带'\n'
        line_stripped = line.strip()
        # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题，并确保其前后没有字母或括号
        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
@ -94,11 +125,12 @@ def parse_text_by_heading(text):
            line_content = line_content.lstrip('.')
            # 检查是否应该更新当前键和内容
            if current_key is None or (compare_headings(current_key, new_key) and (
-                    len(current_content) == 0 or current_content[-1][-1] != '第')):
+                    len(current_content) == 0 or current_content[-1][-1] != '第')):   #current_content： 内容占两行则是： ['本招标项目已具备招标条件，现对本项目施工监理进行招标，项目概况见本', '章附件一。']
                if current_key is not None:
                    # 将之前的内容保存到data中，保留第一个换行符，后续的换行符转为空字符
                    content_string = ''.join(current_content).strip()
                    data[current_key] = content_string.replace(' ', '')
+
                current_key = new_key
                current_content = [line_content]
                # 只有当标题级别为两级（如 1.1）时，才设置 append_newline 为 True
@ -120,11 +152,11 @@ def convert_to_json(file_path, start_word, end_phrases):
    if file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    elif file_path.endswith('.pdf'):
-        text = extract_text_from_pdf(file_path)
+        text = extract_text_from_pdf(file_path,start_word,end_phrases)
    else:
        raise ValueError("Unsupported file format")
    # 提取从 start_word 开始到 end_phrases 结束的内容
-    text = extract_section(text, start_word, end_phrases)
+    # text = extract_section(text, start_word, end_phrases)
    # print(text)
    parsed_data = parse_text_by_heading(text)
    return parsed_data
@ -136,13 +168,18 @@ def convert_clause_to_json(input_path,output_folder,type=1):
    if type==1:
        start_word = "投标人须知正文"
        end_phrases = [
-            r'^第[一二三四五六七八九十]+章\s*评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
-            r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
+            r'^第[一二三四五六七八九十]+章\s*评标办法',
+            r'^评标办法前附表',
+            r'^附(?:录|件|表)(?:一)?[:：]'
        ]
    else:
        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号：'
        end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
    result = convert_to_json(input_path, start_word, end_phrases)
+    # 检查输出文件夹是否存在，如果不存在则创建
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
@ -155,9 +192,7 @@ def post_process_json(json_file_path):   #处理一级标题如'5.1'过长的内
    # 读取 JSON 文件
    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
-
    processed_data = {}
-
    for key, value in data.items():
        # 检查是否是一级标题（如 '5.'），并且其值包含 '\n'
        if re.match(r'^\d+\.\s*$', key) and '\n' in value:
@ -195,14 +230,14 @@ def post_process_json(json_file_path):   #处理一级标题如'5.1'过长的内
        json.dump(processed_data, file, ensure_ascii=False, indent=4)

 if __name__ == "__main__":
-    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（广水市教育局封闭管理）_qualification1.pdf'
+    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf'
+    # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件（一中多媒体报告厅教学设备）_tobidders_notice_part1.pdf'
    # start_word = "投标人须知正文"
    # end_phrases = [
    #     r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
    #     r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
    # ]
-    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
+    output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)
        print(f"Final JSON result saved to: {output_path}")
--- a/flask_app/main/读取文件/按页读取pdf.py
+++ b/flask_app/main/读取文件/按页读取pdf.py
@ -72,6 +72,6 @@ def extract_text_by_page(file_path):
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
        return result
 if __name__ == '__main__':
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1)(1).pdf"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件（一中多媒体报告厅教学设备）_tobidders_notice_part2.pdf"
    res=extract_text_by_page(file_path)
    # print(res)
--- a/flask_app/货物标/test.py
+++ b/flask_app/货物标/test.py
@ -1,72 +1,14 @@
-import json
-import re
-#这个字典可能有嵌套，你需要遍历里面的键名，对键名作判断，而不是键值，具体是这样的：如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除，重新组织成一个字典格式的数据，你可以考虑用字符串列表来保持部分平级的数据
-#对于同级的键，如果数量>1且键名都统一，那么将键名去掉，用列表保持它们的键值
-#对于同一个字典中，可能存在若干键值对，若它们的键值都是""或者"/" 你就将它们的键值删去，它们的键名用字符串列表保存
+def test_append_newline():
+    def check_append_newline(key):
+        append_newline = len(key.split('.')) == 2
+        return append_newline

-#zbtest20也有问题
-def contains_number_or_index(key, value):
-    # 判断值是否是数字或数字字符串
-    is_number = isinstance(value, (int, float)) or (isinstance(value, str) and value.isdigit())
-    # 判断键是否包含 "序号"
-    contains_index = '序号' in key
-    # 判断值中是否包含数字
-    contains_digit = isinstance(value, str) and re.search(r'\d+', value)
-    # 判断值中是否包含中文字符
-    contains_chinese = isinstance(value, str) and re.search(r'[\u4e00-\u9fff]', value)
-    # 如果值中包含数字但也有中文字符，则保留（返回 False）
-    if contains_digit and contains_chinese:
-        return False
-    # 如果值是数字或包含数字，且不包含中文字符，或者键包含 "序号"，返回 True
-    return is_number or contains_index or contains_digit
+    # 测试用例
+    test_cases = ["1.1", "1."]

-#对于同一个字典中，可能存在若干键值对，若它们的键值都是""或者"/" 你就将它们的键值删去，它们的键名用字符串列表保存
-#如果键名是"序号"或者键值中全是数字，删去序号
-def preprocess_dict(data):
-    if isinstance(data, dict):
-        if len(data) > 1:
-            # 检查是否所有值都是 "" 或 "/"
-            if all(v == "" or v == "/" for v in data.values()):
-                return list(data.keys())
-            else:
-                processed = {}
-                for k, v in data.items():
-                    if not contains_number_or_index(k, v):
-                        processed_v = preprocess_dict(v)
-                        if processed_v != "":  # 只添加非空值
-                            processed[k] = processed_v
-                return processed
-        else:
-            return {k: preprocess_dict(v) for k, v in data.items()}
-    elif isinstance(data, list):
-        return [preprocess_dict(item) for item in data]
-    else:
-        return data
+    for case in test_cases:
+        result = check_append_newline(case)
+        print(f"序号 '{case}': append_newline = {result}")

-
-# 测试代码
-#TODO:同一层全部都是数字才成功删除,没需求了
-input_data = {
-    "符合性审查": {
-        "说明": "1ha",
-        "www":"哈哈",
-        "审查标准": [
-            {
-                "序号": 1,
-                "内容": "投标总报价超过项目（分包）预算金额或最高限价的；"
-            },
-            {
-                "序号": 2,
-                "内容": "《投标书》、《法定代表人授权书》、《开标一览表（含明细）》未提供或不符合招标文件要求的；"
-            },
-            {
-                "序号": 3,
-                "内容": "工期（服务期限）、质保期不符合招标文件要求的；"
-            },
-        ]
-    }
-}
-pred=preprocess_dict(input_data)
-print(json.dumps(pred, ensure_ascii=False, indent=4))
-# processed_data = process_dict(pred)
-# print(json.dumps(processed_data, ensure_ascii=False, indent=4))
+# 运行测试
+test_append_newline()
--- a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
+++ b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
@ -0,0 +1,184 @@
+import json
+import docx
+import re
+import os
+from PyPDF2 import PdfReader
+from flask_app.main.截取pdf import clean_page_content,extract_common_header
+
+def extract_text_from_docx(file_path):
+    """Return the text of a .docx file, one paragraph per line.
+
+    Args:
+        file_path: Path of the Word document to open with ``docx.Document``.
+
+    Returns:
+        The ``text`` of every paragraph in document order, joined by newlines.
+    """
+    document = docx.Document(file_path)
+    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
+    return '\n'.join(paragraph_texts)
+
+
+def extract_text_from_pdf(file_path, start_word, end_pattern):
+    """Extract the cleaned text of a PDF, trimmed to the section of interest.
+
+    Every page is passed through ``clean_page_content`` together with the
+    header returned by ``extract_common_header``. The first page is cut so
+    that it starts at the first match of ``start_word``; the last page is cut
+    so that it ends just before the last match of the first pattern in
+    ``end_pattern`` that matches at all. Pages in between are kept whole.
+
+    Args:
+        file_path: Path of the PDF file to read.
+        start_word: Regex searched (with ``re.MULTILINE``) on the first page.
+            If it does not match, the first page is kept whole.
+        end_pattern: Iterable of regexes tried in order on the last page.
+            If none of them matches, the last page is kept whole.
+
+    Returns:
+        The per-page texts joined with a newline between pages.
+    """
+    common_header = extract_common_header(file_path)
+    pdf_document = PdfReader(file_path)
+    last_page_index = len(pdf_document.pages) - 1
+    all_pages_text = []
+
+    for i, page in enumerate(pdf_document.pages):
+        # Extract once per page instead of twice; `or ""` covers pages for
+        # which the extractor returns nothing.
+        page_text = page.extract_text() or ""
+        cleaned_text = clean_page_content(page_text, common_header)
+
+        # Drop everything before the start marker on the first page.
+        if i == 0:
+            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
+            if start_match:
+                cleaned_text = cleaned_text[start_match.start():]
+
+        # Drop everything from the end marker onwards on the last page.
+        # Only the first pattern that matches is used, at its last occurrence.
+        if i == last_page_index:
+            for pattern in end_pattern:
+                matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
+                if matches:
+                    cleaned_text = cleaned_text[:matches[-1].start()]
+                    break
+
+        all_pages_text.append(cleaned_text)
+
+    return "\n".join(all_pages_text)
+
+def compare_headings(current, new):
+    """Tell whether heading number ``new`` comes after heading ``current``.
+
+    Both arguments are dotted heading numbers such as ``'1.1'`` or ``'2.'``.
+    Only the dot-separated parts that consist purely of digits are compared.
+
+    Args:
+        current: Heading number of the section being collected.
+        new: Heading number found on the line being examined.
+
+    Returns:
+        True when ``new`` is larger at the first differing level, or when all
+        shared levels are equal and ``new`` has more levels (a sub-section);
+        False otherwise, including when both are identical.
+    """
+    def numeric_parts(heading):
+        # Empty and non-numeric components (e.g. after a trailing dot) are skipped.
+        return [int(part) for part in heading.split('.') if part.isdigit()]
+
+    current_parts = numeric_parts(current)
+    new_parts = numeric_parts(new)
+    # Lists compare lexicographically: the first differing level decides, and
+    # when one list is a prefix of the other the longer one is the greater.
+    return new_parts > current_parts
+
+
+def should_add_newline(content, keywords, max_length=20):
+    """Decide whether a line break should follow the text collected so far.
+
+    Args:
+        content: Iterable of string fragments; they are concatenated and
+            stripped of surrounding whitespace before being inspected.
+        keywords: Substrings that force a line break when present.
+        max_length: Texts of at most this many characters also get a break.
+
+    Returns:
+        True if the joined text contains any keyword or is no longer than
+        ``max_length``; False otherwise.
+    """
+    joined = ''.join(content).strip()
+    for keyword in keywords:
+        if keyword in joined:
+            return True
+    return len(joined) <= max_length
+
+def handle_content_append(current_content, line_content, append_newline, keywords):
+    """Append one line to the current section, preceded by a break if one is pending.
+
+    Args:
+        current_content: List of fragments of the section being built; it is
+            modified in place.
+        line_content: Text to append.
+        append_newline: Whether a line break is still pending for this section.
+        keywords: Forwarded to ``should_add_newline``.
+
+    Returns:
+        The updated pending flag: the incoming value when nothing was pending,
+        otherwise False because the pending break has now been handled.
+    """
+    if not append_newline:
+        current_content.append(line_content)
+        return append_newline
+
+    # A pending break is only materialised when the text gathered so far
+    # qualifies; either way the flag is consumed by this call.
+    if should_add_newline(current_content, keywords):
+        current_content.append('\n')
+    current_content.append(line_content)
+    return False
+
+"""
+保存换行符的具体逻辑：
+
+对于二级标题（如 1.1），如果其后的内容包含关键词或内容较短（<=20字符），会在内容前添加一个换行符。
+这个换行符会被保留在 current_content 列表中。
+当处理下一个标题时，之前的内容（包括可能存在的换行符）会被合并并保存到 data 字典中。
+"""
+#提取json主函数
+def parse_text_by_heading(text):
+    """Split clause text into a ``{heading: content}`` dictionary.
+
+    The text is processed line by line. Two kinds of headings are recognised:
+
+    * Chinese-numeral chapter headings such as "一、说明": stored as
+      ``{"一": "说明"}``. They also close the numbered clause in progress.
+    * Numbered headings such as "1.", "1.1" or "2.2.3": every following line
+      is appended to that heading until a later heading number is found.
+
+    Args:
+        text: The text to parse, with lines separated by newlines.
+
+    Returns:
+        A dict mapping each heading to its content. Numbered-clause content
+        has all spaces removed; a line break may be kept after the first
+        line of a clause (see ``handle_content_append``).
+    """
+    keywords = ['包含', '以下']
+    data = {}
+    current_key = None
+    current_content = []
+    append_newline = False
+
+    def flush_current():
+        # Store the clause collected so far, if any, under its heading.
+        if current_key is not None:
+            content_string = ''.join(current_content).strip()
+            data[current_key] = content_string.replace(' ', '')
+
+    # Lines are handled one at a time, so fragments are stored without '\n'.
+    for line in text.split('\n'):
+        line_stripped = line.strip()
+
+        # Chinese-numeral chapter heading, e.g. "一、说明".
+        chinese_match = re.match(r'^([一二三四五六七八九十]+、)\s*(.+)$', line_stripped)
+        if chinese_match:
+            # Save the clause in progress before starting a new chapter;
+            # otherwise the last clause of every chapter would be dropped.
+            flush_current()
+            chinese_key, chinese_value = chinese_match.groups()
+            chinese_key = chinese_key.rstrip('、')  # drop the enumeration comma
+            data[chinese_key] = chinese_value
+            current_key = None
+            current_content = []
+            continue
+
+        # Multi-level numbers such as '1.1' or '2.2.3' first, then the
+        # single-level form '1.' followed by some text.
+        match = re.match(r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
+        if not match:
+            match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
+
+        if match:
+            new_key, line_content = match.groups()
+            line_content = line_content.lstrip('.')
+            # A number that follows a fragment ending in '第' is treated as
+            # part of the running text rather than as a heading. `endswith`
+            # is used because the previous fragment may be an empty string
+            # (a heading line with no text after the number).
+            follows_di = bool(current_content) and current_content[-1].endswith('第')
+            if current_key is None or (
+                    compare_headings(current_key, new_key) and not follows_di):
+                flush_current()
+                current_key = new_key
+                current_content = [line_content]
+                # A break may be kept after the heading's first line when the
+                # key splits into two parts on '.'. That is true for '1.1'
+                # and also for '1.', which splits into ['1', ''].
+                append_newline = len(new_key.split('.')) == 2
+            else:
+                append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
+        elif line_stripped:
+            append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+
+    # Save the final clause.
+    flush_current()
+    return data
+
+# def convert_to_json(file_path, start_word, end_phrases):
+#     if file_path.endswith('.docx'):
+#         text = extract_text_from_docx(file_path)
+#     elif file_path.endswith('.pdf'):
+#         text = extract_text_from_pdf(file_path,start_word,end_phrases)
+#     else:
+#         raise ValueError("Unsupported file format")
+#     # print(text)
+#     parsed_data = parse_text_by_heading(text)
+#     return parsed_data
+
+def convert_clause_to_json(input_path,output_folder,type=1):
+    """Extract the clauses of a PDF section and save them as a JSON file.
+
+    Args:
+        input_path: Path of the PDF to read.
+        output_folder: Folder that receives the JSON file; it is created if
+            it does not exist.
+        type: 1 selects the start/end patterns for the section that opens
+            with a "说明"/"总则" heading and writes ``clause1.json``; any
+            other value selects the tender-announcement patterns and writes
+            ``clause2.json``.
+
+    Returns:
+        The path of the written JSON file, or an empty string when
+        ``input_path`` does not exist.
+    """
+    if not os.path.exists(input_path):
+        print(f"The specified file does not exist: {input_path}")
+        return ""
+    if type == 1:
+        start_word = r'^\s*[（(]?\s*[一1]\s*[)）]?\s*[、.]*\s*(说\s*明|总\s*则)'
+        end_pattern = [r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+']
+    else:
+        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号：'
+        end_pattern=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表']
+    # Read the file named by this function's own argument. The module-level
+    # `file_path` only exists when this file is run as a script.
+    text = extract_text_from_pdf(input_path, start_word, end_pattern)
+    result = parse_text_by_heading(text)
+    # Create the output folder when it is missing.
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+        print(f"Created output folder: {output_folder}")
+    file_name = "clause1.json" if type == 1 else "clause2.json"
+    output_path = os.path.join(output_folder, file_name)
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(result, f, indent=4, ensure_ascii=False)
+    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
+    return output_path
+
+
+
+if __name__ == "__main__":
+    # Manual run against a local sample file; both paths are machine-specific
+    # and must be adjusted before running elsewhere.
+    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
+    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件（一中多媒体报告厅教学设备）_tobidders_notice_part2.pdf'
+    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
+    try:
+        # `type` is left at its default of 1, so the result is written to clause1.json.
+        output_path = convert_clause_to_json(file_path,output_folder)
+        print(f"Final JSON result saved to: {output_path}")
+    # NOTE(review): the only explicit `raise ValueError` in this file sits in the
+    # commented-out convert_to_json — confirm whether this handler is still needed.
+    except ValueError as e:
+        print("Error:", e)
--- a/flask_app/货物标/货物标截取pdf.py
+++ b/flask_app/货物标/货物标截取pdf.py
@ -161,12 +161,16 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
    return generated_files


-def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None):
+def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,output_suffix="normal"):
    start_page = None
    end_page = None
    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
+        if output_suffix=="tobidders_notice":
+            if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and start_page is not None:
+                continue
+        else:
            if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
                continue
        if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
@ -183,8 +187,9 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
        pdf_document = PdfReader(pdf_path)
        exclusion_pattern = None
        if output_suffix == "tobidders_notice":
+            exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
            start_page, mid_page, end_page = extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern,
-                                                                            begin_page, common_header)
+                                                                            begin_page, common_header,exclusion_pattern)
            if start_page is None or mid_page is None or end_page is None:
                print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
                return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header)
@ -208,13 +213,15 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
        print(f"Error processing {pdf_path}: {e}")
        return None

-def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
+def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern):
    start_page = None
    mid_page = None
    end_page = None
    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
+        if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
+            continue
        if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
            start_page = i
        if start_page is not None and mid_page is None and re.search(
@ -266,6 +273,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    )
    pdf_document = PdfReader(pdf_path)
+    exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
    # 提取第一部分
    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header)
    if start_page1 is None or end_page1 is None:
@ -273,7 +281,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
        return None, None
    # 提取第二部分
    start_page2 = end_page1  # 第二部分的开始页就是第一部分的结束页
-    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header)
+    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,exclusion_pattern)
    if end_page2 is None:
        print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中！")
        return None, None
@ -390,7 +398,7 @@ def truncate_pdf_multiple(input_path, output_folder):

 # TODO:交通智能系统和招标(1)(1)文件有问题  sele=4的时候excludsion有问题
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）.pdf"
+    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
    # truncate_pdf_multiple(input_path,output_folder)
    selection = 4  # 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2  4.投标人须知前附表