diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py
new file mode 100644
index 0000000..3c4d3f3
--- /dev/null
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -0,0 +1,374 @@
+import re
+
+from docx import Document
+# 完整读取文件中所有表格(适合pdf转docx价格便宜的情况,优先推荐,内容完整)
+def read_tables_from_docx(file_path):
+    # 尝试打开文档
+    try:
+        doc = Document(file_path)
+    except Exception as e:
+        print(f"Error opening file: {e}")
+        return []
+
+    # 初始化列表来保存符合条件的单元格内容
+    cell_contents = []
+
+    # 读取文档中的所有表格
+    if not doc.tables:
+        print("No tables found in the document.")
+        return []
+
+    # 遍历文档中的每个表格
+    for table_idx, table in enumerate(doc.tables):
+        # 遍历表格中的每一行
+        for row_idx, row in enumerate(table.rows):
+            # 遍历每一行中的单元格
+            for cell in row.cells:
+                cell_text = cell.text.strip()  # 去除单元格内容前后空白
+                if len(cell_text) > 8:  # 检查文字数量是否大于8
+                    cell_contents.append(cell_text)
+
+    # 返回符合条件的单元格内容
+    return cell_contents
+
+
+#处理跨页的段落
+def preprocess_paragraphs(paragraphs):
+    processed = []  # 初始化处理后的段落列表
+    index = 0
+    #排除遇到表格、填空的情况
+    def has_long_spaces(text, max_space_count=5):
+        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
+    while index < len(paragraphs):
+        current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
+
+        # 检查当前段落是否为空
+        if current_text == '':
+            # 确保有前一个和后一个段落
+            if 0 < index < len(paragraphs) - 1:
+                prev_text = paragraphs[index - 1].text.strip()  # 获取前一个段落的文本
+                next_text = paragraphs[index + 1].text.strip()  # 获取后一个段落的文本
+
+                # **新增部分:检查前一个段落和后一个段落都非空** 内部没有超过5个连续空格
+                if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
+                    # 检查前一个段落的文本是否不以标点符号结尾
+                    if not prev_text.endswith((',', ',', '。', '!', '?')):
+                        # 定义列表项的模式
+                        list_item_pattern = r'^\s*([(\(]\d+[)\)]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)'
+
+                        # 检查后一个段落是否以列表模式开头
+                        is_next_list = re.match(list_item_pattern, next_text)
+
+                        # 如果后一个段落不是列表项,且前一个段落长度大于30
+                        if not is_next_list and len(prev_text) > 30:
+                            # 合并前一个和后一个段落的文本
+                            merged_text = prev_text + ' ' + next_text  # 为了可读性添加空格
+
+                            if processed:
+                                # 更新处理后的最后一个段落
+                                processed[-1] = merged_text
+                            else:
+                                # 如果列表为空,直接添加合并后的文本
+                                processed.append(merged_text)
+
+                            # 跳过下一个段落,因为它已经被合并
+                            index += 2
+                            continue
+            # 如果没有满足条件,不进行合并,跳过当前空段落
+        else:
+            # 非空段落,添加到处理后的列表中
+            processed.append(current_text)
+
+        index += 1
+
+    return processed
+
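A minimal usage sketch for the two helpers above, assuming the new module is importable from the project root; sample.docx is an illustrative file name, not a path used by this PR:

    from docx import Document
    from flask_app.general.无效标和废标公共代码 import read_tables_from_docx, preprocess_paragraphs

    doc = Document('sample.docx')                        # hypothetical input document
    paragraphs = preprocess_paragraphs(doc.paragraphs)   # paragraphs re-joined across page breaks
    table_cells = read_tables_from_docx('sample.docx')   # table cells longer than 8 characters
    print(len(paragraphs), len(table_cells))

preprocess_paragraphs only merges across an empty paragraph when the previous paragraph is long (> 30 chars), does not end in sentence punctuation, has no long runs of spaces (which would suggest a table or fill-in blank), and the next paragraph is not a list item, so ordinary blank lines between sections are left alone.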
+#如果当前段落有序号,则向下匹配直到遇到相同的序号样式
+#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
+def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
+    from collections import OrderedDict
+    from docx import Document
+    import re
+
+    if isinstance(keywords, str):
+        keywords = [keywords]
+
+    doc = Document(doc_path)
+    extracted_paragraphs = OrderedDict()
+    continue_collecting = False
+    current_section_pattern = None
+    active_key = None
+
+    def match_keywords(text, patterns):
+        # 首先检查关键词是否匹配
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE)
+            if match:
+                # 如果当前的模式是 '不\s*得',则额外检查是否匹配 '不得分'
+                if '不\s*得' in pattern:  # 使用字符串匹配检查模式
+                    post_match_text = text[match.end():].strip()
+                    if post_match_text.startswith("分"):
+                        continue  # 如果是"不得分",跳过这个匹配
+                return True
+        return False
+
+    def extract_from_text(text, current_index):
+        nonlocal continue_collecting, current_section_pattern, active_key
+        if text == "":
+            return current_index
+
+        if continue_collecting:
+            if current_section_pattern and re.match(current_section_pattern, text):
+                continue_collecting = False
+                active_key = None
+            else:
+                if active_key is not None:
+                    extracted_paragraphs[active_key].append(text)
+            return current_index
+
+        if match_keywords(text, keywords):
+            active_key = text
+            extracted_paragraphs[active_key] = [text]
+            if match_keywords(text, follow_up_keywords):
+                continue_collecting = True
+                section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text)  # 修改后的正则,支持 '数字 、' 和 '数字.'
+                if section_number:
+                    current_section_number = section_number.group(1)
+                    level_count = current_section_number.count('.')
+
+                    # Pattern to match current level, e.g., 3.4.5
+                    pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+'
+                    # Generate patterns for next section at same level and parent level
+                    parts = current_section_number.split('.')
+                    matched_patterns = [pattern]  # start with the full pattern
+
+                    # Next section at same level
+                    parts[-1] = str(int(parts[-1]) + 1)
+                    # next_pattern = r'^' + r'\.\s*'.join(parts) #11.12以前版本
+                    next_pattern = r'^' + r'\s*[..]\s*'.join(parts)
+                    matched_patterns.append(next_pattern)
+
+                    # Parent section (if applicable)
+                    if len(parts) > 1:
+                        parent_section_parts = parts[:-1]
+                        parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
+                        parent_pattern = r'^' + r'\s*[..]\s*'.join(parent_section_parts)
+                        matched_patterns.append(parent_pattern)
+
+                    # 添加对 '数字 、' 格式的支持
+                    digit_comma_pattern = r'^\d+\s*、'
+                    matched_patterns.append(digit_comma_pattern)
+                    # Combine the patterns
+                    combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
+                    current_section_pattern = re.compile(combined_pattern)
+
+                else:
+                    found_next_number = False
+                    current_section_pattern = None
+
+                    while current_index < len(doc.paragraphs) - 1:
+                        current_index += 1
+                        next_text = doc.paragraphs[current_index].text.strip()
+                        if not found_next_number:
+                            # 修改后的正则,支持 '数字 、' 格式
+                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)',
+                                                           next_text)
+                            if next_section_number:
+                                found_next_number = True
+                                if next_section_number.group(1):
+                                    section_parts = next_section_number.group(1).split('.')
+                                    dynamic_pattern = r'^' + r'[..]'.join(
+                                        [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
+                                elif next_section_number.group(2):
+                                    dynamic_pattern = r'^[\(\(]\d+[\)\)]'
+                                elif next_section_number.group(3):
+                                    dynamic_pattern = r'^\d+\s*、'
+                                current_section_pattern = re.compile(dynamic_pattern)
+                        if current_section_pattern and re.match(current_section_pattern, next_text):
+                            extracted_paragraphs[active_key].append(next_text)
+                        else:
+                            continue_collecting = False
+                            active_key = None
+                            break
+
+        return current_index
+
+    processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
+    index = 0
+    while index < len(processed_paragraphs):
+        index = extract_from_text(processed_paragraphs[index].strip(), index)
+        index += 1
+
+    return extracted_paragraphs
+
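For reference, a sketch of how extract_text_with_keywords is driven later in this PR (handle_query passes arguments of the same shape; the document path here is illustrative). The keyword patterns are written with \s* between characters so they still match when PDF-to-docx conversion scatters spaces inside a word:

    from flask_app.general.无效标和废标公共代码 import extract_text_with_keywords

    keywords = [r'无\s*效\s*投\s*标|废\s*标']
    follow_up = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
    # OrderedDict: {matched paragraph -> [that paragraph plus the clauses collected under it]}
    sections = extract_text_with_keywords('ztbfile_invalid.docx', keywords, follow_up)
    for title, clauses in sections.items():
        print(title, len(clauses))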
+"""
+eg:
+text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"]
+new_text_list = ["这是第一句。", "1. 接下来是第二句!", "(3) 最后一句。"]
+"""
+def preprocess_text_list(text_list):
+    new_text_list = []
+    # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字
+    split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))')
+    for text in text_list:
+        # 使用正则表达式检查并拆分元素
+        parts = split_pattern.split(text)
+        new_text_list.extend(part.strip() for part in parts if part.strip())  # 添加非空字符串检查
+
+    return new_text_list
+
+def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达式提取到的东西格式化
+    all_texts1 = []
+    all_texts2 = []
+    # 定义用于分割句子的正则表达式,包括中文和西文的结束标点
+    split_pattern = r'(?<=[。!?\!\?])'
+
+    for key, text_list in extracted_contents.items():
+        if len(text_list) == 1:
+            # print(text_list)
+            # print("------------------")
+            for data in text_list:
+                # print(data)
+                # 检查是否包含任何需要排除的字符串
+                if any(exclude in data for exclude in excludes):
+                    continue  # 如果包含任何排除字符串,跳过这个数据
+                # 去掉开头的序号,eg:1 | (1) |(2) | 1. | 2.(全角点)| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
+                pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
+                data = re.sub(pattern, '', data).strip()
+                keyword_match = re.search(keywords, data)
+                if keyword_match:
+                    # 从关键词位置开始查找结束标点符号
+                    start_pos = keyword_match.start()
+                    # 截取从关键词开始到后面的内容
+                    substring = data[start_pos:]
+                    # 按定义的结束标点分割
+                    sentences = re.split(split_pattern, substring, 1)
+                    if len(sentences) > 0 and sentences[0]:
+                        # 只取第一句,保留标点
+                        cleaned_text = data[:start_pos] + sentences[0]  # eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。
+                        # 经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。
+                    else:
+                        cleaned_text = data  # 如果没有标点,使用整个字符串
+                else:
+                    # 如果没有找到关键词,保留原文本
+                    cleaned_text = data
+                # 删除空格
+                cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '')
+                # 如果长度大于8,则添加到结果列表
+                if len(cleaned_text_no_spaces) > 8:
+                    all_texts1.append(cleaned_text_no_spaces)
+
+        else:
+            # print(text_list)
+            # print("*********")
+            new_text_list = preprocess_text_list(text_list)
+            # 用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。
+            pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
+
+            data = re.sub(pattern, '', new_text_list[0]).strip()  # 去除序号
+            # 将修改后的第一个元素和剩余的元素连接起来
+            new_text_list[0] = data  # 更新列表中的第一个元素
+            joined_text = "\n".join(new_text_list)  # 如果列表中有多个元素,则连接它们
+            # 删除空格
+            joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '')
+            all_texts2.append(joined_text_no_spaces)  # 将每个列表的内容添加到 all_texts 中
+
+    return all_texts1, all_texts2  # all_texts1要额外用gpt all_text2直接返回结果
+
+
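The eg: block above preprocess_text_list can be checked directly; a minimal sketch, assuming Python 3.7+ (re.split is applied to a zero-width lookaround pattern, which older versions reject):

    from flask_app.general.无效标和废标公共代码 import preprocess_text_list

    text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"]
    print(preprocess_text_list(text_list))
    # -> ['这是第一句。', '1. 接下来是第二句!', '(3) 最后一句。']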
+def extract_table_with_keywords(data, keywords, follow_up_keywords):
+    """遍历列表中的每个元素,查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。"""
+    sentences1 = []  # 保存没有后续关键词的情况
+    sentences2 = []  # 保存有后续关键词的情况
+
+    # 编译关键词的正则表达式,提高匹配性能
+    keywords_pattern = re.compile(keywords, re.IGNORECASE)
+    follow_up_patterns = [re.compile(fu, re.IGNORECASE) for fu in follow_up_keywords]
+
+    # 检查 keywords 模式串中是否包含 '无效报价' 的关键词(按字面子串比较;模式文本里的 \s* 不是空白字符,用 re.search 匹配不到)
+    check_invalid_bidding = '无\s*效\s*报\s*价' in keywords
+
+    # 定义用于提取括号内容的正则表达式,支持中英文括号
+    bracket_pattern = re.compile(r'[((][^(()))]+[))]')
+
+    # 遍历列表中的每个字符串元素
+    for item in data:
+        # 只有在 keywords 包含 '无效报价' 时,才检查 "无效报价"
+        if check_invalid_bidding and re.search(r'无\s*效\s*报\s*价', item, re.IGNORECASE):
+            sentences1.append(item.strip())
+            continue
+
+        # 先检查 item 是否包含任意关键词,如果不包含,则跳过分割
+        if not keywords_pattern.search(item):
+            continue
+
+        # 1. 先提取并替换括号内容
+        bracket_contents = []
+
+        def replace_bracket_content(match):
+            bracket_contents.append(match.group(0))  # 保存括号内容
+            return f"<BRACKET_{len(bracket_contents) - 1}>"  # 使用带编号的占位符替换括号内容
+
+        item_with_placeholders = bracket_pattern.sub(replace_bracket_content, item)
+
+        # 2. 分割句子,保证句子完整性(按标点符号和序号分割)
+        split_sentences = re.split(
+            r'(?<=[。!?!?\?;])|'  # 在中文句号、感叹号、问号或分号后面分割
+            r'(?=\d+[..]\d+)|'  # 在类似1.1的数字序号前分割
+            r'(?=\d+\s(?![号条款节章项例页段部步点年月日时分秒]))|'  # 数字后面跟空格且空格后面不是指定关键字时分割
+            r'(?=\d+[、..])|'  # 在数字后直接跟顿号、半角点号或全角点号时分割
+            r'(?=[A-Za-z][..]\s*)|'  # 在字母加点(如A.、a.)前分割
+            r'(?=[A-Za-z]+\s*\d+\s*(?:[..]\s*\d+)*)|'  # 在可选字母加数字或多级编号前分割
+            r'(?=[一二三四五六七八九十]+、)',  # 在中文数字加顿号(如一、二、)前分割
+            item_with_placeholders
+        )
+
+        # 3. 还原括号内容
+        split_sentences = [re.sub(r"<BRACKET_(\d+)>", lambda m: bracket_contents[int(m.group(1))], s) for s in
+                           split_sentences]
+
+        # 接下来是处理包含和不包含后续关键词的情况
+        i = 0
+        while i < len(split_sentences):
+            sentence = split_sentences[i].strip()
+            if keywords_pattern.search(sentence):
+                # 处理 "不\s*得" 的特殊情况,跳过包含 "不得分" 的句子
+                not_allowed_match = re.search(r'不\s*得', sentence, re.IGNORECASE)
+                if not_allowed_match:
+                    post_match_text = sentence[not_allowed_match.end():].strip()
+                    if post_match_text.startswith("分"):
+                        i += 1  # 跳过这个句子
+                        continue
+
+                # 检查是否存在后续关键词
+                follow_up_present = any(fp.search(sentence) for fp in follow_up_patterns)
+                if follow_up_present:
+                    # 如果存在后续关键词,则从当前位置开始截取
+                    start_index = i
+                    end_index = start_index
+                    found_next_section = False
+                    for j in range(start_index + 1, len(split_sentences)):
+                        if re.match(r'\d+[..]\d+([..]\d+)?', split_sentences[j].strip()):
+                            end_index = j
+                            found_next_section = True
+                            break
+                    if found_next_section:
+                        full_text = ' '.join(split_sentences[start_index:end_index]).strip()
+                    else:
+                        full_text = ' '.join(split_sentences[start_index:]).strip()
+
+                    # 清洗文本,去除前缀编号等
+                    pattern = r'^\s*([((]\d+[))]|[A-Za-z][..]\s*|[A-Za-z]?\d+\s*([..]\s*\d+)*(\s|[..]|、|.)?|[一二三四五六七八九十]+、)'
+                    full_text = re.sub(pattern, '', full_text).replace(' ', '').strip()
+                    sentences2.append(full_text)  # 存储有后续关键词的情况
+                    i = end_index if found_next_section else len(split_sentences)
+                else:
+                    # 没有后续关键词的情况
+                    pattern = r'^\s*([((]\d+[))]|[A-Za-z][..]\s*|[A-Za-z]?\d+\s*([..]\s*\d+)*(\s|[..]|、|.)?|[一二三四五六七八九十]+、)'
+                    cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ', '').strip()
+                    if len(cleaned_sentence) > 8:
+                        sentences1.append(cleaned_sentence)  # 存储没有后续关键词的情况
+                    i += 1
+            else:
+                i += 1
+    return sentences1, sentences2  # 返回两个列表
\ No newline at end of file
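The placeholder round trip in steps 1 and 3 above can be exercised in isolation; a small sketch with an illustrative sentence (the <BRACKET_n> token survives the splitting regex because it contains no CJK punctuation and no digit followed by a separator):

    import re

    bracket_pattern = re.compile(r'[((][^(()))]+[))]')
    bracket_contents = []

    def replace_bracket_content(match):
        bracket_contents.append(match.group(0))          # save the bracketed span
        return f"<BRACKET_{len(bracket_contents) - 1}>"  # numbered placeholder

    item = "投标报价超过最高限价的(见第2.2款),其投标无效。"
    masked = bracket_pattern.sub(replace_bracket_content, item)
    # masked == '投标报价超过最高限价的<BRACKET_0>,其投标无效。'
    restored = re.sub(r"<BRACKET_(\d+)>", lambda m: bracket_contents[int(m.group(1))], masked)
    assert restored == item

Masking first keeps "(见第2.2款)" from being split at the "2.2" inside it, which is exactly the failure the placeholders guard against.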
diff --git a/flask_app/main/基础信息整合快速版.py b/flask_app/main/基础信息整合快速版.py
index 3dda497..0bd187e 100644
--- a/flask_app/main/基础信息整合快速版.py
+++ b/flask_app/main/基础信息整合快速版.py
@@ -154,8 +154,8 @@ def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders
     返回:
     - dict: 综合后的基础信息。
     """
-    baseinfo_prompt_file_path=r'D:\flask_project\flask_app\static\提示词\基本信息工程标qianwen-long.txt'
-    # baseinfo_prompt_file_path = 'flask_app/static/提示词/基本信息工程标qianwen-long.txt'
+    # baseinfo_prompt_file_path=r'D:\flask_project\flask_app\static\提示词\基本信息工程标qianwen-long.txt'
+    baseinfo_prompt_file_path = 'flask_app/static/提示词/基本信息工程标qianwen-long.txt'
     file_id1 = upload_file(merged_baseinfo_path)
     questions = read_questions_from_file(baseinfo_prompt_file_path)
     more_query = "请你根据招标文件信息,回答以下问题:是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 是否需要递交投标保证金?是否需要提交履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件','是否允许分包','是否递交投标保证金','是否提交履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在矛盾信息,请回答'未知'。"
@@ -165,8 +165,8 @@ def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders
     chosen_numbers, merged = merge_json_to_list(baseinfo_list1.pop(),tobidders_notice)
     baseinfo_list1_copy = copy.deepcopy(baseinfo_list1)
     baseinfo_list1.append(merged)
-    judge_file_path=r'D:\flask_project\flask_app\static\提示词\是否相关问题qianwen-long.txt'
-    # judge_file_path = 'flask_app/static/提示词/是否相关问题qianwen-long.txt'
+    # judge_file_path=r'D:\flask_project\flask_app\static\提示词\是否相关问题qianwen-long.txt'
+    judge_file_path = 'flask_app/static/提示词/是否相关问题qianwen-long.txt'
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
         # 提交两个任务
diff --git a/flask_app/main/工程标解析main.py b/flask_app/main/工程标解析main.py
index ac545f5..28d7c41 100644
--- a/flask_app/main/工程标解析main.py
+++ b/flask_app/main/工程标解析main.py
@@ -139,10 +139,10 @@ def fetch_evaluation_standards(invalid_path, evaluation_method):
 
 
 # 无效、废标项解析
-def fetch_invalid_requirements(invalid_docpath, output_folder, tobidders_notice_table, clause_path, qualification):
+def fetch_invalid_requirements(invalid_docpath, output_folder, qualification):
     logger.info("starting 无效标与废标...")
     start_time = time.time()
-    find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, tobidders_notice_table, clause_path, qualification)
+    find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, qualification)
     end_time = time.time()
     logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒")
     return find_invalid_res
@@ -194,8 +194,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_
                     processed_data['merged_baseinfo_path']),
                 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['invalid_path'],processed_data['evaluation_method']),
                 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
-                                                        output_folder, processed_data['tobidders_notice_table'],
-                                                        processed_data['clause_path'], processed_data['qualification']),
+                                                        output_folder, processed_data['qualification']),
                 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']),
                 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'])
             }
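The executor wiring in engineering_bid_main follows a plain submit-then-collect pattern; a minimal self-contained sketch of the same shape (fetch_demo is a stand-in, not the real fetchers):

    from concurrent.futures import ThreadPoolExecutor

    def fetch_demo(name):
        return {name: "ok"}  # stands in for fetch_invalid_requirements etc.

    with ThreadPoolExecutor() as executor:
        futures = {key: executor.submit(fetch_demo, key)
                   for key in ('invalid_requirements', 'evaluation_standards')}
        results = {key: future.result() for key, future in futures.items()}
    print(results)

Dropping tobidders_notice_table and clause_path from fetch_invalid_requirements means the invalid/void-bid pass now reads one docx directly (tables included) instead of a pre-truncated table JSON, which is what the new read_tables_from_docx + extract_table_with_keywords path provides.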
diff --git a/flask_app/main/无效标和废标和禁止投标整合.py b/flask_app/main/无效标和废标和禁止投标整合.py
index e344a1b..0edc0fe 100644
--- a/flask_app/main/无效标和废标和禁止投标整合.py
+++ b/flask_app/main/无效标和废标和禁止投标整合.py
@@ -5,283 +5,14 @@ import time
 import re
 
 from flask_app.general.format_change import pdf2docx
+from flask_app.general.无效标和废标公共代码 import extract_text_with_keywords, preprocess_text_list, clean_dict_datas, \
+    read_tables_from_docx, extract_table_with_keywords
 from flask_app.general.通义千问long import upload_file, qianwen_long,qianwen_long_text
 from concurrent.futures import ThreadPoolExecutor
 from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
 
-
-#处理跨页的段落
-def preprocess_paragraphs(paragraphs):
-    processed = []  # 初始化处理后的段落列表
-    index = 0
-    while index < len(paragraphs):
-        current_text = paragraphs[index].text.strip()  # 去除当前段落的前后空白
-
-        # 检查当前段落是否为空
-        if current_text == '':
-            # 确保有前一个和后一个段落
-            if 0 < index < len(paragraphs) - 1:
-                prev_text = paragraphs[index - 1].text.strip()  # 获取前一个段落的文本
-                next_text = paragraphs[index + 1].text.strip()  # 获取后一个段落的文本
-
-                # 检查前一个段落的文本是否不以标点符号结尾
-                if not prev_text.endswith((',', ',', '。', '!', '?')):
-                    # 定义列表项的模式
-                    list_item_pattern = r'^\s*([($]\d+[)$]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)'
-                    # 检查前一个段落是否以列表模式开头,且后一个段落不以列表模式开头
-                    # is_prev_list = re.match(list_item_pattern, prev_text)
-                    is_next_list = re.match(list_item_pattern, next_text)
-                    if not is_next_list and len(prev_text) > 30:
-                        # 合并前一个和后一个段落的文本
-                        merged_text = prev_text + next_text  # 为了可读性添加空格
-                        if processed:
-                            processed[-1] = merged_text  # 更新处理后的最后一个段落
-                        else:
-                            processed.append(merged_text)  # 如果列表为空,直接添加合并后的文本
-
-                        # 跳过下一个段落,因为它已经被合并
-                        index += 2
-                        continue
-        else:
-            # 非空段落,添加到处理后的列表中
-            processed.append(current_text)
-        index += 1
-
-    return processed
-
-#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
-#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
-
-def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
-    from collections import OrderedDict
-    from docx import Document
-    import re
-
-    if isinstance(keywords, str):
-        keywords = [keywords]
-
-    doc = Document(doc_path)
-    extracted_paragraphs = OrderedDict()
-    continue_collecting = False
-    current_section_pattern = None
-    active_key = None
-
-    def match_keywords(text, patterns):
-        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
-
-    def extract_from_text(text, current_index):
-        nonlocal continue_collecting, current_section_pattern, active_key
-        if text == "":
-            return current_index
-
-        if continue_collecting:
-            if current_section_pattern and re.match(current_section_pattern, text):
-                continue_collecting = False
-                active_key = None
-            else:
-                if active_key is not None:
-                    extracted_paragraphs[active_key].append(text)
-            return current_index
-
-        if match_keywords(text, keywords):
-            active_key = text
-            extracted_paragraphs[active_key] = [text]
-            if match_keywords(text, follow_up_keywords):
-                continue_collecting = True
-                section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text)  # 修改后的正则,支持 '数字 、' 和 '数字.'
- if section_number: - current_section_number = section_number.group(1) - level_count = current_section_number.count('.') - - # Pattern to match current level, e.g., 3.4.5 - pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+' - # Generate patterns for next section at same level and parent level - parts = current_section_number.split('.') - matched_patterns = [pattern] # start with the full pattern - - # Next section at same level - parts[-1] = str(int(parts[-1]) + 1) - next_pattern = r'^' + r'\s*\.\s*'.join(parts) - matched_patterns.append(next_pattern) - - # Parent section (if applicable) - if len(parts) > 1: - parent_section_parts = parts[:-1] - parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) - parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) - matched_patterns.append(parent_pattern) - - # Combine the patterns - combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' - current_section_pattern = re.compile(combined_pattern) - - else: - found_next_number = False - current_section_pattern = None - - while current_index < len(doc.paragraphs) - 1: - current_index += 1 - next_text = doc.paragraphs[current_index].text.strip() - if not found_next_number: - next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) - if next_section_number: - found_next_number = True - if next_section_number.group(1): - section_parts = next_section_number.group(1).split('.') - dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' - elif next_section_number.group(2): - dynamic_pattern = r'^[\(\(]\d+[\)\)]' - current_section_pattern = re.compile(dynamic_pattern) - if current_section_pattern and re.match(current_section_pattern, next_text): - extracted_paragraphs[active_key].append(next_text) - else: - continue_collecting = False - active_key=None - break - - return current_index - - processed_paragraphs = preprocess_paragraphs(doc.paragraphs) - index = 0 - while index < len(processed_paragraphs): - index = extract_from_text(processed_paragraphs[index].strip(), index) - index += 1 - - return extracted_paragraphs - -def preprocess_text_list(text_list): - new_text_list = [] - # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 - split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') - for text in text_list: - # 使用正则表达式检查并拆分元素 - parts = split_pattern.split(text) - new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 - - return new_text_list - -def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 - all_texts1 = [] - all_texts2=[] - # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 - split_pattern = r'(?<=[。!?\!\?])' - - for key, text_list in extracted_contents.items(): - if len(text_list) == 1: - for data in text_list: - # 检查是否包含任何需要排除的字符串 - if any(exclude in data for exclude in excludes): - continue # 如果包含任何排除字符串,跳过这个数据 - # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - data = re.sub(pattern, '', data).strip() - keyword_match = re.search(keywords, data) - if keyword_match: - # 从关键词位置开始查找结束标点符号 - start_pos = keyword_match.start() - # 截取从关键词开始到后面的内容 - substring = data[start_pos:] - # 按定义的结束标点分割 - sentences = re.split(split_pattern, substring, 1) - if len(sentences) > 0 and sentences[0]: - # 只取第一句,保留标点 - cleaned_text = data[:start_pos] + sentences[0] #eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。 - # 
经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。 - else: - cleaned_text = data # 如果没有标点,使用整个字符串 - else: - # 如果没有找到关键词,保留原文本 - cleaned_text = data - # 删除空格 - cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '') - # 如果长度大于8,则添加到结果列表 - if len(cleaned_text_no_spaces) > 8: - all_texts1.append(cleaned_text_no_spaces) - - else: - # print(text_list) - new_text_list=preprocess_text_list(text_list) - # print(new_text_list) - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - # data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 - data = re.sub(pattern, '', new_text_list[0]).replace(' ','').strip() - # 将修改后的第一个元素和剩余的元素连接起来 - new_text_list[0] = data # 更新列表中的第一个元素 - joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 - # 删除空格 - joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '') - all_texts2.append(joined_text_no_spaces) # 将每个列表的内容添加到 all_texts 中 - - return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 -def find_sentences_with_keywords(data, keywords, follow_up_keywords): - """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" - sentences1 = [] # 保存没有后续关键词的情况 - sentences2 = [] # 保存有后续关键词的情况 - - if isinstance(data, dict): - for value in data.values(): - result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) - sentences1.extend(result1) - sentences2.extend(result2) - elif isinstance(data, list): - for item in data: - result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) - sentences1.extend(result1) - sentences2.extend(result2) - elif isinstance(data, str): - # 分割句子,保证句子完整性(按标点符号和序号分割) - # split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割 - split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data) - i = 0 - while i < len(split_sentences): - sentence = split_sentences[i].strip() - if re.search(keywords, sentence, re.IGNORECASE): - follow_up_present = any( - re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) - if follow_up_present: - # 如果存在后续关键词,则从当前位置开始截取 - start_index = i - end_index = start_index - found_next_section = False - for j in range(start_index + 1, len(split_sentences)): - if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): - end_index = j - found_next_section = True - break - - if found_next_section: - full_text = ' '.join(split_sentences[start_index:end_index]).strip() - else: - full_text = ' '.join(split_sentences[start_index:]).strip() - # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - # data=re.sub(pattern,'',full_text) - data = re.sub(pattern, '', full_text).replace(' ','').strip() - sentences2.append(data) # 存储有后续关键词的情况 - i = end_index if found_next_section else len(split_sentences) - else: - # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - # data = re.sub(pattern, '', sentence).replace('\n','').strip() - cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ','').strip() - if len(cleaned_sentence) > 8: - sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况 - i += 1 - else: - i += 1 - - return sentences1, sentences2 # 返回两个列表 - -def extract_sentences_from_json(json_path, keywords,follow_up_keywords): - if not json_path: - return [],[] - with open(json_path, 'r', 
encoding='utf-8') as file: - data = json.load(file) - """从JSON数据中提取包含关键词的句子。""" - return find_sentences_with_keywords(data, keywords,follow_up_keywords) - #处理无效投标 def extract_values_if_contains(data, includes): """ @@ -324,14 +55,14 @@ def extract_values_if_contains(data, includes): #以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", #TODO:truncate_json_path为空的时候,单独提取表格数据 -def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): +def handle_query(file_path, user_query, output_file, result_key, keywords): try: - excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] + excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:","我方"] follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 提取正文(除表格) - # print(extracted_contents) all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 - all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) # 提取表格数据(json_data) + table_data_list = read_tables_from_docx(file_path) + all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords) qianwen_txt = all_texts1 + all_tables1 selected_contents = set() # 使用 set 去重 @@ -369,20 +100,19 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc print(f"handle_query 在处理 {result_key} 时发生异常: {e}") return {result_key: ""} -def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification): - tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx - truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir) # 投标人须知前附表docx->json +def combine_find_invalid(file_path, output_folder, qualification): + os.makedirs(output_folder, exist_ok=True) queries = [ ( - r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', + r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|无\s*效\s*标|视\s*为\s*无\s*效|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", - os.path.join(output_dir, "temp1.txt"), + os.path.join(output_folder, "temp1.txt"), "否决和无效投标情形" ), ( r'废\s*标', "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", - os.path.join(output_dir, "temp2.txt"), + os.path.join(output_folder, "temp2.txt"), "废标项" ) ] @@ -391,18 +121,10 @@ def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, cl with ThreadPoolExecutor() as executor: futures = [] for keywords, user_query, output_file, result_key in queries: - future = executor.submit( - handle_query, - invalid_docpath, - user_query, - output_file, - result_key, - keywords, - truncate_jsonpath - ) - futures.append((future, result_key)) + future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords) + futures.append((future, result_key)) # 保持顺序 time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 - # 按照提交的顺序收集结果 + for future, result_key in futures: try: result = future.result() @@ -414,7 +136,7 @@ def combine_find_invalid(invalid_docpath, output_dir, 
tobidders_notice_table, cl # 禁止投标(find_forbidden)部分 try: # print("starting不得存在的情形...") - forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification) + forbidden_res = find_forbidden(qualification) except Exception as e: print(f"find_forbidden 处理时出错: {e}") forbidden_res = {'不得存在的其他情形': ""} @@ -428,13 +150,13 @@ def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, cl if __name__ == '__main__': start_time = time.time() - tobidders_notice_table="" - clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" - qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" - output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" + # tobidders_notice_table="" + # clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + qualification=r"D:\flask_project\flask_app\static\output\output1\fb297c04-b7ef-4b34-9df8-d97b6af69a03\ztbfile_qualification.pdf" + output_folder = r"D:\flask_project\flask_app\static\output\output1\fb297c04-b7ef-4b34-9df8-d97b6af69a03\tmp" # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' - invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' - results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification) + file_path = r'D:\flask_project\flask_app\static\output\output1\fb297c04-b7ef-4b34-9df8-d97b6af69a03\ztbfile.docx' + results = combine_find_invalid(file_path, output_folder,qualification) end_time = time.time() print("Elapsed time:", str(end_time - start_time)) print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/old_version/无效标和废标和禁止投标整合.py b/flask_app/old_version/无效标和废标和禁止投标整合.py new file mode 100644 index 0000000..e344a1b --- /dev/null +++ b/flask_app/old_version/无效标和废标和禁止投标整合.py @@ -0,0 +1,440 @@ +# -*- coding: utf-8 -*- +import json +import os.path +import time +import re + +from flask_app.general.format_change import pdf2docx +from flask_app.general.通义千问long import upload_file, qianwen_long,qianwen_long_text +from concurrent.futures import ThreadPoolExecutor + +from flask_app.main.table_content_extraction import extract_tables_main +from flask_app.main.禁止投标情形 import find_forbidden, process_string_list + + +#处理跨页的段落 +def preprocess_paragraphs(paragraphs): + processed = [] # 初始化处理后的段落列表 + index = 0 + while index < len(paragraphs): + current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 + + # 检查当前段落是否为空 + if current_text == '': + # 确保有前一个和后一个段落 + if 0 < index < len(paragraphs) - 1: + prev_text = paragraphs[index - 1].text.strip() # 获取前一个段落的文本 + next_text = paragraphs[index + 1].text.strip() # 获取后一个段落的文本 + + # 检查前一个段落的文本是否不以标点符号结尾 + if not prev_text.endswith((',', ',', '。', '!', '?')): + # 定义列表项的模式 + list_item_pattern = r'^\s*([($]\d+[)$]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)' + # 检查前一个段落是否以列表模式开头,且后一个段落不以列表模式开头 + # is_prev_list = re.match(list_item_pattern, prev_text) + is_next_list = re.match(list_item_pattern, next_text) + if not is_next_list and len(prev_text) > 30: + # 合并前一个和后一个段落的文本 + merged_text = prev_text + next_text # 为了可读性添加空格 + if processed: + processed[-1] = merged_text # 更新处理后的最后一个段落 + else: + processed.append(merged_text) # 如果列表为空,直接添加合并后的文本 + + # 跳过下一个段落,因为它已经被合并 + index 
+= 2 + continue + else: + # 非空段落,添加到处理后的列表中 + processed.append(current_text) + index += 1 + + return processed + +#如果当前段落有序号,则向下匹配直接遇到相同的序号样式 +#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 + +def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): + from collections import OrderedDict + from docx import Document + import re + + if isinstance(keywords, str): + keywords = [keywords] + + doc = Document(doc_path) + extracted_paragraphs = OrderedDict() + continue_collecting = False + current_section_pattern = None + active_key = None + + def match_keywords(text, patterns): + return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) + + def extract_from_text(text, current_index): + nonlocal continue_collecting, current_section_pattern, active_key + if text == "": + return current_index + + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + active_key = None + else: + if active_key is not None: + extracted_paragraphs[active_key].append(text) + return current_index + + if match_keywords(text, keywords): + active_key = text + extracted_paragraphs[active_key] = [text] + if match_keywords(text, follow_up_keywords): + continue_collecting = True + section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text) # 修改后的正则,支持 '数字 、' 和 '数字.' + if section_number: + current_section_number = section_number.group(1) + level_count = current_section_number.count('.') + + # Pattern to match current level, e.g., 3.4.5 + pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+' + # Generate patterns for next section at same level and parent level + parts = current_section_number.split('.') + matched_patterns = [pattern] # start with the full pattern + + # Next section at same level + parts[-1] = str(int(parts[-1]) + 1) + next_pattern = r'^' + r'\s*\.\s*'.join(parts) + matched_patterns.append(next_pattern) + + # Parent section (if applicable) + if len(parts) > 1: + parent_section_parts = parts[:-1] + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) + parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) + matched_patterns.append(parent_pattern) + + # Combine the patterns + combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' + current_section_pattern = re.compile(combined_pattern) + + else: + found_next_number = False + current_section_pattern = None + + while current_index < len(doc.paragraphs) - 1: + current_index += 1 + next_text = doc.paragraphs[current_index].text.strip() + if not found_next_number: + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) + if next_section_number: + found_next_number = True + if next_section_number.group(1): + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + elif next_section_number.group(2): + dynamic_pattern = r'^[\(\(]\d+[\)\)]' + current_section_pattern = re.compile(dynamic_pattern) + if current_section_pattern and re.match(current_section_pattern, next_text): + extracted_paragraphs[active_key].append(next_text) + else: + continue_collecting = False + active_key=None + break + + return current_index + + processed_paragraphs = preprocess_paragraphs(doc.paragraphs) + index = 0 + while index < len(processed_paragraphs): + index = extract_from_text(processed_paragraphs[index].strip(), index) + index += 1 + + return extracted_paragraphs + +def preprocess_text_list(text_list): + new_text_list = [] + # 
正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 + split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') + for text in text_list: + # 使用正则表达式检查并拆分元素 + parts = split_pattern.split(text) + new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 + + return new_text_list + +def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 + all_texts1 = [] + all_texts2=[] + # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 + split_pattern = r'(?<=[。!?\!\?])' + + for key, text_list in extracted_contents.items(): + if len(text_list) == 1: + for data in text_list: + # 检查是否包含任何需要排除的字符串 + if any(exclude in data for exclude in excludes): + continue # 如果包含任何排除字符串,跳过这个数据 + # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + data = re.sub(pattern, '', data).strip() + keyword_match = re.search(keywords, data) + if keyword_match: + # 从关键词位置开始查找结束标点符号 + start_pos = keyword_match.start() + # 截取从关键词开始到后面的内容 + substring = data[start_pos:] + # 按定义的结束标点分割 + sentences = re.split(split_pattern, substring, 1) + if len(sentences) > 0 and sentences[0]: + # 只取第一句,保留标点 + cleaned_text = data[:start_pos] + sentences[0] #eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。 + # 经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。 + else: + cleaned_text = data # 如果没有标点,使用整个字符串 + else: + # 如果没有找到关键词,保留原文本 + cleaned_text = data + # 删除空格 + cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '') + # 如果长度大于8,则添加到结果列表 + if len(cleaned_text_no_spaces) > 8: + all_texts1.append(cleaned_text_no_spaces) + + else: + # print(text_list) + new_text_list=preprocess_text_list(text_list) + # print(new_text_list) + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 + data = re.sub(pattern, '', new_text_list[0]).replace(' ','').strip() + # 将修改后的第一个元素和剩余的元素连接起来 + new_text_list[0] = data # 更新列表中的第一个元素 + joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 + # 删除空格 + joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '') + all_texts2.append(joined_text_no_spaces) # 将每个列表的内容添加到 all_texts 中 + + return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 +def find_sentences_with_keywords(data, keywords, follow_up_keywords): + """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" + sentences1 = [] # 保存没有后续关键词的情况 + sentences2 = [] # 保存有后续关键词的情况 + + if isinstance(data, dict): + for value in data.values(): + result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, list): + for item in data: + result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, str): + # 分割句子,保证句子完整性(按标点符号和序号分割) + # split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割 + split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data) + i = 0 + while i < len(split_sentences): + sentence = split_sentences[i].strip() + if re.search(keywords, sentence, re.IGNORECASE): + follow_up_present = any( + re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) + if follow_up_present: + # 如果存在后续关键词,则从当前位置开始截取 + start_index = i + end_index = start_index + found_next_section = False + for j in 
range(start_index + 1, len(split_sentences)): + if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): + end_index = j + found_next_section = True + break + + if found_next_section: + full_text = ' '.join(split_sentences[start_index:end_index]).strip() + else: + full_text = ' '.join(split_sentences[start_index:]).strip() + # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data=re.sub(pattern,'',full_text) + data = re.sub(pattern, '', full_text).replace(' ','').strip() + sentences2.append(data) # 存储有后续关键词的情况 + i = end_index if found_next_section else len(split_sentences) + else: + # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' + # data = re.sub(pattern, '', sentence).replace('\n','').strip() + cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ','').strip() + if len(cleaned_sentence) > 8: + sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况 + i += 1 + else: + i += 1 + + return sentences1, sentences2 # 返回两个列表 + +def extract_sentences_from_json(json_path, keywords,follow_up_keywords): + if not json_path: + return [],[] + with open(json_path, 'r', encoding='utf-8') as file: + data = json.load(file) + """从JSON数据中提取包含关键词的句子。""" + return find_sentences_with_keywords(data, keywords,follow_up_keywords) + +#处理无效投标 +def extract_values_if_contains(data, includes): + """ + 递归检查字典中的值是否包含列表 'includes' 中的内容。 + 如果包含,将这些值添加到一个列表中并返回。 + + 参数: + data (dict): 字典或从 JSON 解析得到的数据。 + includes (list): 包含要检查的关键词的列表。 + + 返回: + list: 包含满足条件的值的列表。 + """ + included_values = [] # 初始化结果列表 + + # 定义递归函数来处理嵌套字典 + def recursive_search(current_data): + if isinstance(current_data, dict): + for key, value in current_data.items(): + if isinstance(value, dict): + # 如果值是字典,递归搜索 + recursive_search(value) + elif isinstance(value, str): + # 如果值是字符串,检查是否包含任何 includes 中的关键词 + if any(include in value for include in includes): + included_values.append(value) + elif isinstance(current_data, list): + for item in current_data: + # 如果是列表,递归每个元素 + recursive_search(item) + + # 开始递归搜索 + recursive_search(data) + + return included_values + + +#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。 +#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 +#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", + +#TODO:truncate_json_path为空的时候,单独提取表格数据 +def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): + try: + excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] + follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] + extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 提取正文(除表格) + # print(extracted_contents) + all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 + all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) # 提取表格数据(json_data) + qianwen_txt = all_texts1 + all_tables1 + selected_contents = set() # 使用 set 去重 + + if qianwen_txt: + with open(output_file, 'w', encoding='utf-8') as file: + counter = 1 + for content in qianwen_txt: + 
file.write("..............." + '\n') + file.write(f"{counter}. {content}\n") + counter += 1 + + file_id = upload_file(output_file) + # qianwen_ans = qianwen_long(file_id, user_query) + qianwen_ans = qianwen_long_text(file_id, user_query) + num_list = process_string_list(qianwen_ans) + print(result_key + "选中的序号:" + str(num_list)) + + for index in num_list: + if 1 <= index <= len(qianwen_txt): + content = qianwen_txt[index - 1] + selected_contents.add(content) + + # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 + selected_contents.update(all_texts2) + selected_contents.update(all_tables2) + + # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 + if selected_contents: + res = {result_key: list(selected_contents)} + else: + res = {result_key: ""} + + return res + except Exception as e: + print(f"handle_query 在处理 {result_key} 时发生异常: {e}") + return {result_key: ""} + +def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification): + tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx + truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir) # 投标人须知前附表docx->json + queries = [ + ( + r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp1.txt"), + "否决和无效投标情形" + ), + ( + r'废\s*标', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp2.txt"), + "废标项" + ) + ] + results = [] + # 使用线程池来并行处理查询 + with ThreadPoolExecutor() as executor: + futures = [] + for keywords, user_query, output_file, result_key in queries: + future = executor.submit( + handle_query, + invalid_docpath, + user_query, + output_file, + result_key, + keywords, + truncate_jsonpath + ) + futures.append((future, result_key)) + time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 + # 按照提交的顺序收集结果 + for future, result_key in futures: + try: + result = future.result() + except Exception as e: + print(f"线程处理 {result_key} 时出错: {e}") + result = {result_key: ""} + results.append(result) + + # 禁止投标(find_forbidden)部分 + try: + # print("starting不得存在的情形...") + forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification) + except Exception as e: + print(f"find_forbidden 处理时出错: {e}") + forbidden_res = {'不得存在的其他情形': ""} + results.append(forbidden_res) + + combined_dict = {} + for d in results: + combined_dict.update(d) + # print("无效标与废标done...") + return {"无效标与废标项": combined_dict} + +if __name__ == '__main__': + start_time = time.time() + tobidders_notice_table="" + clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" + # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' + invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' + results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification) + end_time = time.time() + print("Elapsed time:", str(end_time - 
start_time)) + print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/货物标/无效标和废标和禁止投标整合main.py b/flask_app/货物标/无效标和废标和禁止投标整合main.py index ea8f986..d17d616 100644 --- a/flask_app/货物标/无效标和废标和禁止投标整合main.py +++ b/flask_app/货物标/无效标和废标和禁止投标整合main.py @@ -3,259 +3,14 @@ import json import os.path import time import re + +from flask_app.general.无效标和废标公共代码 import read_tables_from_docx, extract_text_with_keywords, \ + preprocess_text_list, clean_dict_datas, extract_table_with_keywords from flask_app.general.通义千问long import upload_file, qianwen_long,qianwen_long_text from flask_app.general.通用功能函数 import process_string_list from docx import Document from concurrent.futures import ThreadPoolExecutor, as_completed from collections import OrderedDict -#处理跨页的段落 -def preprocess_paragraphs(paragraphs): - processed = [] # 初始化处理后的段落列表 - index = 0 - #排除遇到表格、填空的情况 - def has_long_spaces(text, max_space_count=5): - return any(len(space) > max_space_count for space in re.findall(r'\s+', text)) - while index < len(paragraphs): - current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 - - # 检查当前段落是否为空 - if current_text == '': - # 确保有前一个和后一个段落 - if 0 < index < len(paragraphs) - 1: - prev_text = paragraphs[index - 1].text.strip() # 获取前一个段落的文本 - next_text = paragraphs[index + 1].text.strip() # 获取后一个段落的文本 - - # **新增部分:检查前一个段落和后一个段落都非空** 内部没有超过5个连续空格 - if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text): - # 检查前一个段落的文本是否不以标点符号结尾 - if not prev_text.endswith((',', ',', '。', '!', '?')): - # 定义列表项的模式 - list_item_pattern = r'^\s*([(\(]\d+[)\)]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)' - - # 检查后一个段落是否以列表模式开头 - is_next_list = re.match(list_item_pattern, next_text) - - # 如果后一个段落不是列表项,且前一个段落长度大于30 - if not is_next_list and len(prev_text) > 30: - # 合并前一个和后一个段落的文本 - merged_text = prev_text + ' ' + next_text # 为了可读性添加空格 - - if processed: - # 更新处理后的最后一个段落 - processed[-1] = merged_text - else: - # 如果列表为空,直接添加合并后的文本 - processed.append(merged_text) - - # 跳过下一个段落,因为它已经被合并 - index += 2 - continue - # 如果没有满足条件,不进行合并,跳过当前空段落 - else: - # 非空段落,添加到处理后的列表中 - processed.append(current_text) - - index += 1 - - return processed - - -def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): - from collections import OrderedDict - from docx import Document - import re - - if isinstance(keywords, str): - keywords = [keywords] - - doc = Document(doc_path) - extracted_paragraphs = OrderedDict() - continue_collecting = False - current_section_pattern = None - active_key = None - - def match_keywords(text, patterns): - # 首先检查关键词是否匹配 - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE) - if match: - # 如果当前的模式是 '不\s*得',则额外检查是否匹配 '不得分' - if '不\s*得' in pattern: # 使用字符串匹配检查模式 - post_match_text = text[match.end():].strip() - if post_match_text.startswith("分"): - continue # 如果是"不得分",跳过这个匹配 - return True - return False - - def extract_from_text(text, current_index): - nonlocal continue_collecting, current_section_pattern, active_key - if text == "": - return current_index - - if continue_collecting: - if current_section_pattern and re.match(current_section_pattern, text): - continue_collecting = False - active_key = None - else: - if active_key is not None: - extracted_paragraphs[active_key].append(text) - return current_index - - if match_keywords(text, keywords): - active_key = text - extracted_paragraphs[active_key] = [text] - if match_keywords(text, follow_up_keywords): - continue_collecting = True - section_number = 
re.match(r'^(\d+([..]\d+)*)\s*[..]?', text) # 修改后的正则,支持 '数字 、' 和 '数字.' - - if section_number: - current_section_number = section_number.group(1) - level_count = current_section_number.count('.') - - # Pattern to match current level, e.g., 3.4.5 或者 3 - pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+' - # Generate patterns for next section at same level and parent level - parts = current_section_number.split('.') - matched_patterns = [pattern] # start with the full pattern - - # Next section at same level - parts[-1] = str(int(parts[-1]) + 1) - next_pattern = r'^' + r'\.\s*'.join(parts) - matched_patterns.append(next_pattern) - - # Parent section (if applicable) - if len(parts) > 1: - parent_section_parts = parts[:-1] - parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) - parent_pattern = r'^' + r'\.\s*'.join(parent_section_parts) - matched_patterns.append(parent_pattern) - - # 添加对 '数字 、' 格式的支持 - digit_comma_pattern = r'^\d+\s*、' - matched_patterns.append(digit_comma_pattern) - - # Combine the patterns - combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' - current_section_pattern = re.compile(combined_pattern) - else: - found_next_number = False - current_section_pattern = None - - while current_index < len(doc.paragraphs) - 1: - current_index += 1 - next_text = doc.paragraphs[current_index].text.strip() - if not found_next_number: - # 修改后的正则,支持 '数字 、' 格式 - next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)', - next_text) - if next_section_number: - found_next_number = True - if next_section_number.group(1): - section_parts = next_section_number.group(1).split('.') - dynamic_pattern = r'^' + r'\.'.join( - [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' - elif next_section_number.group(2): - dynamic_pattern = r'^[\(\(]\d+[\)\)]' - elif next_section_number.group(3): - dynamic_pattern = r'^\d+\s*、' - current_section_pattern = re.compile(dynamic_pattern) - if current_section_pattern and re.match(current_section_pattern, next_text): - extracted_paragraphs[active_key].append(next_text) - else: - continue_collecting = False - active_key = None - break - - return current_index - - processed_paragraphs = preprocess_paragraphs(doc.paragraphs) - index = 0 - while index < len(processed_paragraphs): - index = extract_from_text(processed_paragraphs[index].strip(), index) - index += 1 - - return extracted_paragraphs - - - -""" -eg: -text_list = ["这是第一句。 1. 接下来是第二句! (3) 最后一句。"] -new_text_list = ["这是第一句。", "1. 接下来是第二句!", "(3) 最后一句。"] -""" - - -def preprocess_text_list(text_list): - new_text_list = [] - # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 - split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') - for text in text_list: - # 使用正则表达式检查并拆分元素 - parts = split_pattern.split(text) - new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 - - return new_text_list - - -def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达式提取到的东西格式化 - all_texts1 = [] - all_texts2 = [] - # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 - split_pattern = r'(?<=[。!?\!\?])' - - for key, text_list in extracted_contents.items(): - if len(text_list) == 1: - # print(text_list) - # print("------------------") - for data in text_list: - # print(data) - # 检查是否包含任何需要排除的字符串 - if any(exclude in data for exclude in excludes): - continue # 如果包含任何排除字符串,跳过这个数据 - # 去掉开头的序号,eg:1 | (1) |(2) | 1. 
| 2.(全角点)| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、 - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - data = re.sub(pattern, '', data).strip() - keyword_match = re.search(keywords, data) - if keyword_match: - # 从关键词位置开始查找结束标点符号 - start_pos = keyword_match.start() - # 截取从关键词开始到后面的内容 - substring = data[start_pos:] - # 按定义的结束标点分割 - sentences = re.split(split_pattern, substring, 1) - if len(sentences) > 0 and sentences[0]: - # 只取第一句,保留标点 - cleaned_text = data[:start_pos] + sentences[ - 0] # eg:经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。 - # 经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。 - else: - cleaned_text = data # 如果没有标点,使用整个字符串 - else: - # 如果没有找到关键词,保留原文本 - cleaned_text = data - # 删除空格 - cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace(' ', '') - # 如果长度大于8,则添加到结果列表 - if len(cleaned_text_no_spaces) > 8: - all_texts1.append(cleaned_text_no_spaces) - - else: - # print(text_list) - # print("*********") - new_text_list = preprocess_text_list(text_list) - # 用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。 - pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - - data = re.sub(pattern, '', new_text_list[0]).strip() # 去除序号 - # 将修改后的第一个元素和剩余的元素连接起来 - new_text_list[0] = data # 更新列表中的第一个元素 - joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 - # 删除空格 - joined_text_no_spaces = joined_text.replace(' ', '').replace(' ', '') - all_texts2.append(joined_text_no_spaces) # 将每个列表的内容添加到 all_texts 中 - - return all_texts1, all_texts2 # all_texts1要额外用gpt all_text2直接返回结果 - # 只读取前附表中的最后一列(省钱,但容易漏内容) def read_docx_last_column(truncate_file): @@ -284,109 +39,6 @@ def read_docx_last_column(truncate_file): return last_column_values - -# 完整读取文件中所有表格(适合pdf转docx价格便宜的情况,优先推荐,内容完整) -def read_tables_from_docx(file_path): - # 尝试打开文档 - try: - doc = Document(file_path) - except Exception as e: - print(f"Error opening file: {e}") - return [] - - # 初始化列表来保存符合条件的单元格内容 - cell_contents = [] - - # 读取文档中的所有表格 - if not doc.tables: - print("No tables found in the document.") - return [] - - # 遍历文档中的每个表格 - for table_idx, table in enumerate(doc.tables): - # 遍历表格中的每一行 - for row_idx, row in enumerate(table.rows): - # 遍历每一行中的单元格 - for cell in row.cells: - cell_text = cell.text.strip() # 去除单元格内容前后空白 - if len(cell_text) > 8: # 检查文字数量是否大于5 - cell_contents.append(cell_text) - - # 返回符合条件的单元格内容 - return cell_contents - - -def extract_table_with_keywords(data, keywords, follow_up_keywords): - """遍历列表中的每个元素,查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" - sentences1 = [] # 保存没有后续关键词的情况 - sentences2 = [] # 保存有后续关键词的情况 - - # 检查是否包含 '无效报价' 的关键词 - check_invalid_bidding = '无\s*效\s*报\s*价' in keywords - # 遍历列表中的每个字符串元素 - for item in data: - # 只有在包含 '无效投标' 关键词时,才检查 "无效报价" - if check_invalid_bidding and re.search(r'无\s*效\s*报\s*价', item, re.IGNORECASE): - sentences1.append(item.strip()) - continue - - # 分割句子,保证句子完整性(按标点符号和序号分割)eg:(?=\d+\.\d+):匹配诸如 1.1、2.2 之类的序号,并在序号前进行分割。 - split_sentences = re.split( - r'(?<=[。!?!?\?])|' # 在中文句号、感叹号或问号后分割 - r'(?=\d+\.\d+)|' # 在类似1.1的数字序号前分割 - r'(?=\d+[\s、\.])|' # 在数字后跟空格、顿号或点号前分割 - r'(?=[((]\d+[))])|' # 在括号包围的数字前分割 - r'(?=[A-Za-z]\.\s*)|' # 在字母加点(如A.、a.)前分割 - r'(?=[A-Za-z]?\d+\s*(?:\.\s*\d+)*)|' # 在可选字母加数字或多级编号前分割 - r'(?=[一二三四五六七八九十]+、)', # 在中文数字加顿号(如一、、二、)前分割 - item - ) - i = 0 - while i < len(split_sentences): - sentence = split_sentences[i].strip() - if re.search(keywords, sentence, re.IGNORECASE): - # 处理 "不\s*得" 的特殊情况,跳过包含 "不得分" 的句子 - if re.search(r'不\s*得', sentence, re.IGNORECASE): - post_match_text = 
sentence[re.search(r'不\s*得', sentence).end():].strip() - if post_match_text.startswith("分"): - i += 1 # 跳过这个句子 - continue - follow_up_present = any( - re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords - ) - if follow_up_present: - # 如果存在后续关键词,则从当前位置开始截取 - start_index = i - end_index = start_index - found_next_section = False - for j in range(start_index + 1, len(split_sentences)): - if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): - end_index = j - found_next_section = True - break - if found_next_section: - full_text = ' '.join(split_sentences[start_index:end_index]).strip() - else: - full_text = ' '.join(split_sentences[start_index:]).strip() - # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - full_text = re.sub(pattern, '', full_text).replace(' ', '').strip() # 删去了空格 - sentences2.append(full_text) # 存储有后续关键词的情况 - i = end_index if found_next_section else len(split_sentences) - else: - # 没有后续关键词的情况 - # pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' - pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)' - cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ', '').strip() - if len(cleaned_sentence)>8: - sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况 - i += 1 - else: - i += 1 - - return sentences1, sentences2 # 返回两个列表 - - # 处理无效投标 def extract_values_if_contains(data, includes): """ @@ -475,6 +127,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords): return {result_key: ""} def combine_find_invalid(file_path, output_dir): + os.makedirs(output_dir, exist_ok=True) queries = [ ( r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|无\s*效\s*标|视\s*为\s*无\s*效|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',