11.13

2024-11-13 09:41:37 +08:00 · 2024-11-13 09:41:37 +08:00 · 6e7da080bc
commit 6e7da080bc
parent 59b7bed441
4 changed files with 12 additions and 4 deletions
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@@ -44,8 +44,8 @@ def parse_text_by_heading(text):
    skip_subheadings = False
    last_main_number = None
    temp_title = None  # 临时存储以点号开头但不带数字的标题
-    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
-    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')
+    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')   #一、
+    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*') #（一）
    initial_heading_pattern = None
    special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括：']  # 定义特殊章节关键词
    in_special_section = False  # 标志是否在特殊章节中
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -36,12 +36,19 @@ def read_tables_from_docx(file_path):
 def preprocess_paragraphs(paragraphs):
    processed = []  # 初始化处理后的段落列表
    index = 0
+    # 定义两个新的正则表达式模式
+    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
+    pattern_parentheses = re.compile(r'^\s*[（(]\s*([一二三四五六七八九十]{1,2})\s*[)）]\s*')
    #排除遇到表格、填空的情况
    def has_long_spaces(text, max_space_count=5):
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
    while index < len(paragraphs):
        current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
-
+        # 检查当前段落是否匹配任一排除模式
+        if pattern_numbered.match(current_text) or pattern_parentheses.match(current_text):
+            # 如果匹配，则跳过当前段落，不添加到processed列表中
+            index += 1
+            continue
        # 检查当前段落是否为空
        if current_text == '':
            # 确保有前一个和后一个段落
--- a/flask_app/main/无效标和废标和禁止投标整合.py
+++ b/flask_app/main/无效标和废标和禁止投标整合.py
@@ -100,6 +100,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: ""}

+#TODO：增加提取逻辑 else里面的内容
 def combine_find_invalid(file_path, output_folder, qualification):
    os.makedirs(output_folder, exist_ok=True)
    queries = [
--- a/flask_app/main/禁止投标情形.py
+++ b/flask_app/main/禁止投标情形.py
@@ -136,7 +136,7 @@ def merge_pdfs(paths, output_filename):
    return output_path


-def find_forbidden(truncate_json_path, clause_path, qualification=""):
+def find_forbidden(qualification=""):
    try:
        if qualification:
            file_id = upload_file(qualification)