From f8be23deaacc9b2dd0ce8b24c34bf22d9bde179a Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Thu, 26 Sep 2024 10:22:03 +0800
Subject: [PATCH] =?UTF-8?q?9.25=20=E8=B5=84=E6=A0=BC=E5=AE=A1=E6=9F=A5?=
 =?UTF-8?q?=E8=BF=98=E5=B7=AE=E8=B7=B3=E8=BD=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/main/start_up.py                    |   2 +-
 .../main/投标人须知正文条款提取成json文件.py  |   4 +-
 .../投标人须知正文条款提取成json文件.py       | 141 ++++++++++++++++++
 flask_app/货物标/提示词/prompt1.txt           |   2 +-
 flask_app/货物标/货物标截取pdf.py             |   2 +-
 flask_app/货物标/资格审查main.py              |   2 +-
 6 files changed, 147 insertions(+), 6 deletions(-)
 create mode 100644 flask_app/货物标/投标人须知正文条款提取成json文件.py

diff --git a/flask_app/main/start_up.py b/flask_app/main/start_up.py
index d70df61..9b11239 100644
--- a/flask_app/main/start_up.py
+++ b/flask_app/main/start_up.py
@@ -147,7 +147,7 @@ def process_and_stream(file_url):
         for outer_key, inner_dict in parsed_data.items():
             if isinstance(inner_dict, dict):
                 combined_data.update(inner_dict)
-
+    logger.info(json.dumps(combined_data, ensure_ascii=False,indent=4))
     # 等待所有数据都处理完后，发送整合后的完整数据
     complete_response = {
         'message': 'Combined data',
diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py
index ac36918..c2bd3fa 100644
--- a/flask_app/main/投标人须知正文条款提取成json文件.py
+++ b/flask_app/main/投标人须知正文条款提取成json文件.py
@@ -196,13 +196,13 @@ def post_process_json(json_file_path):   #处理一级标题如'5.1'过长的内
 
 if __name__ == "__main__":
     # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf'
+    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（广水市教育局封闭管理）_qualification1.pdf'
     # start_word = "投标人须知正文"
     # end_phrases = [
     #     r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录：', r'^附录一：', r'^附件：', r'^附件一：',
     #     r'^附表：', r'^附表一：', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
     # ]
-    output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp'
+    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
     try:
         output_path = convert_clause_to_json(file_path,output_folder)
         print(f"Final JSON result saved to: {output_path}")
diff --git a/flask_app/货物标/投标人须知正文条款提取成json文件.py b/flask_app/货物标/投标人须知正文条款提取成json文件.py
new file mode 100644
index 0000000..3962254
--- /dev/null
+++ b/flask_app/货物标/投标人须知正文条款提取成json文件.py
@@ -0,0 +1,141 @@
+import json
+import docx
+import re
+import os
+from PyPDF2 import PdfReader
+from flask_app.main.截取pdf import clean_page_content, extract_common_header
+
+def extract_text_from_docx(file_path):
+    """Return the text of every paragraph of the .docx at file_path, joined by newlines."""
+    return '\n'.join(paragraph.text for paragraph in docx.Document(file_path).paragraphs)
+
+
+def extract_text_from_pdf(file_path):
+    """Extract the text of every page of a PDF, cleaned page by page.
+
+    Each page's text is passed through clean_page_content together with the
+    header returned by extract_common_header, and a newline is appended after
+    every page.
+    """
+    common_header = extract_common_header(file_path)
+    cleaned_pages = []
+    for page in PdfReader(file_path).pages:
+        # Extract once per page (not twice); use "" when nothing comes back.
+        page_text = page.extract_text() or ""
+        cleaned_pages.append(clean_page_content(page_text, common_header) + "\n")
+    return "".join(cleaned_pages)
+
+def extract_section(text, start_pattern, end_phrases):
+    """Return the part of text between start_pattern and an end phrase.
+
+    The section starts right after the first start_pattern match. End phrases
+    are tried in list order (not by position in the text); the first phrase
+    that matches anywhere after the start cuts the section at its match
+    start. With no matching end phrase the section runs to the end of text,
+    and with no start match an empty string is returned.
+    """
+    opening = re.search(start_pattern, text)
+    if opening is None:
+        return ""
+    begin = opening.end()
+    remainder = text[begin:]
+    hits = (re.search(phrase, remainder, flags=re.MULTILINE) for phrase in end_phrases)
+    first_hit = next((hit for hit in hits if hit), None)
+    if first_hit is None:
+        return remainder
+    return remainder[:first_hit.start()]
+
+def should_add_newline(content, keywords, max_length=20):
+    joined = ''.join(content).strip()  # merge the fragments; outer whitespace is not counted
+    return any(word in joined for word in keywords) or len(joined) <= max_length
+
+def handle_content_append(current_content, line_content, append_newline, keywords):
+    """Append line_content to current_content, preceded by a newline when a break is pending and allowed."""
+    # "Allowed" is decided by should_add_newline from the content gathered so far.
+    if append_newline and should_add_newline(current_content, keywords):
+        current_content.append('\n')
+    current_content.append(line_content)
+    return False if append_newline else append_newline  # a pending break is consumed by this line
+
+# Extra handling for two-part headings such as x.x: the line break after the heading is kept when the content so far contains one of the keywords or is at most 20 characters long; longer content without a keyword gets no break.
+def parse_text_by_heading(text):
+    """Map each numbered heading in text to the content that follows it.
+
+    A line starts a new entry when, after stripping, it begins with a dotted
+    number such as '1.1' or '2.2.3', or with a single number followed by a
+    dot such as '1.'. Other non-empty lines are concatenated onto the current
+    entry, and all spaces are removed from the stored content.
+    """
+    keywords = ['包含', '以下']
+    dotted_heading = r'^(?<![a-zA-Z（(])(\d+(?:\.\d+)+)\s*(.*)'
+    single_heading = r'^(\d+\.)\s*(.+)$'
+    sections = {}
+    heading = None
+    fragments = []
+    newline_pending = False
+
+    def store_current():
+        # Commit the fragments gathered for the open heading, if there is one.
+        if heading is not None:
+            sections[heading] = ''.join(fragments).strip().replace(' ', '')
+
+    for raw_line in text.split('\n'):
+        stripped = raw_line.strip()
+        found = re.match(dotted_heading, stripped) or re.match(single_heading, stripped)
+        if found is None:
+            if stripped:
+                newline_pending = handle_content_append(fragments, stripped, newline_pending, keywords)
+            continue
+        store_current()
+        heading, first_text = found.groups()
+        # Drop dots left between the number and the text, e.g. '1.1. xxx'.
+        fragments = [first_text.lstrip('.')]
+        # Keys with exactly two dot-separated parts ('1.1', and also '1.') arm the break.
+        newline_pending = len(heading.split('.')) == 2
+
+    store_current()
+    return sections
+
+def convert_to_json(file_path, start_word, end_phrases):
+    """Parse the start_word..end_phrases section of a .docx/.pdf file into a heading dict.
+
+    Raises ValueError for any other file extension (the check ignores case)."""
+    lowered = file_path.lower()  # accept upper/mixed-case extensions such as '.PDF'
+    if lowered.endswith('.docx'):
+        text = extract_text_from_docx(file_path)
+    elif lowered.endswith('.pdf'):
+        text = extract_text_from_pdf(file_path)
+    else:
+        raise ValueError("Unsupported file format")
+    return parse_text_by_heading(extract_section(text, start_word, end_phrases))
+
+def convert_clause_to_json(input_path, output_folder, type=1):
+    """Write the clauses parsed from input_path to clause1.json (type 1) or clause2.json (otherwise) in output_folder.
+
+    Returns the output path, or "" if input_path does not exist; output_folder is created when missing."""
+    if not os.path.exists(input_path):
+        print(f"The specified file does not exist: {input_path}")
+        return ""
+    if type == 1:
+        start_word, end_phrases = "第四章", ["第五章"]
+    else:
+        start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号：'
+        end_phrases = [r'第[一二三四五六七八九十]+章\s*投标人须知', r'投标人须知前附表']
+    result = convert_to_json(input_path, start_word, end_phrases)
+    os.makedirs(output_folder or ".", exist_ok=True)  # open() below fails if the folder is missing
+    output_path = os.path.join(output_folder, "clause1.json" if type == 1 else "clause2.json")
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(result, f, indent=4, ensure_ascii=False)
+    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
+    return output_path
+
+
+if __name__ == "__main__":  # manual run against developer-local sample files
+    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
+    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（广水市教育局封闭管理）_qualification1.pdf'
+    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp'
+    try:
+        output_path = convert_clause_to_json(file_path, output_folder)
+        print(f"Final JSON result saved to: {output_path}")
+    except ValueError as e:  # raised by convert_to_json for unsupported file extensions
+        print("Error:", e)
diff --git a/flask_app/货物标/提示词/prompt1.txt b/flask_app/货物标/提示词/prompt1.txt
index ad1f4fa..23b38d5 100644
--- a/flask_app/货物标/提示词/prompt1.txt
+++ b/flask_app/货物标/提示词/prompt1.txt
@@ -13,7 +13,7 @@
 这是一份货物标中采购要求部分的内容，请告诉我商务要求和其他要求是什么，请以json格式返回结果，外层键名分别是"商务要求"和"其他要求",内层键值对中的键是你对该要求的总结，而值需要完全与原文保持一致，不可擅自总结删减，注意你无需回答具体设备的技术要求，若相关要求不存在，在键值中填"未知"。
 
 "该招标文件中规定的资格性审查标准是怎样的？请以json格式给出，外层为'资格性审查'，对于原文中的序号，你需要捕获它们之间的层级关系并生成嵌套键值对，你的回答无需包含序号，但其余内容要与原文一致，不可擅自总结删减，也不要回答符合性审查的内容。"
-
+该招标文件中规定的资格性审查标准是怎样的？请以json格式给出，外层为'资格性审查'，你的回答要与原文一致，不可擅自总结删减，也不要回答符合性审查的内容。
 "该招标文件中规定的资格性审查标准是怎样的？请以json格式给出，外层为'资格性审查'，对于原文中的序号，你仅需要捕获它们之间的层级关系并根据序号后的内容生成嵌套键值对，若多个内容位于同一层级，你应用字符串列表作为键值保存这些内容，你的回答需删去这些序号，但其余内容要与原文一致，不可擅自总结删减，也不要回答符合性审查的内容。"
 
 "根据该文档中的评标办法前附表，请你列出该文件的技术标，商务标，投标报价评审标准以及它们对应的具体评分要求，若对应内容中存在其他信息，在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个，则返回文档中给定的评分内容以及它的评分要求，都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
diff --git a/flask_app/货物标/货物标截取pdf.py b/flask_app/货物标/货物标截取pdf.py
index 64ee4f5..2038a24 100644
--- a/flask_app/货物标/货物标截取pdf.py
+++ b/flask_app/货物标/货物标截取pdf.py
@@ -321,7 +321,7 @@ def truncate_pdf_multiple(input_path, output_folder):
 
 # TODO:交通智能系统和招标(1)(1)文件有问题
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\622二次视频会议磋商文件(1).pdf"
+    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
     output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
     # truncate_pdf_multiple(input_path,output_folder)
     selection = 3  # 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查
diff --git a/flask_app/货物标/资格审查main.py b/flask_app/货物标/资格审查main.py
index 2bb488c..cd8a7c5 100644
--- a/flask_app/货物标/资格审查main.py
+++ b/flask_app/货物标/资格审查main.py
@@ -113,6 +113,6 @@ def qualification_review(truncate_file):
     return combined_res
 
 if __name__ == "__main__":
-    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件（广水市教育局封闭管理）_qualification1.pdf"
+    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\622二次视频会议磋商文件(1)_qualification2.pdf"
     res=qualification_review(truncate_file)
     print(json.dumps(res,ensure_ascii=False, indent=4))
\ No newline at end of file