From 6e7da080bc2c6fa65f9a6ec42a082ce1478df3bd Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 13 Nov 2024 09:41:37 +0800 Subject: [PATCH] 11.13 --- flask_app/general/投标人须知正文条款提取成json文件.py | 4 ++-- flask_app/general/无效标和废标公共代码.py | 9 ++++++++- flask_app/main/无效标和废标和禁止投标整合.py | 1 + flask_app/main/禁止投标情形.py | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py index fd64a94..811da43 100644 --- a/flask_app/general/投标人须知正文条款提取成json文件.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -44,8 +44,8 @@ def parse_text_by_heading(text): skip_subheadings = False last_main_number = None temp_title = None # 临时存储以点号开头但不带数字的标题 - pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') - pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') + pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') #一、 + pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') #(一) initial_heading_pattern = None special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:'] # 定义特殊章节关键词 in_special_section = False # 标志是否在特殊章节中 diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index 3c4d3f3..69624b4 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -36,12 +36,19 @@ def read_tables_from_docx(file_path): def preprocess_paragraphs(paragraphs): processed = [] # 初始化处理后的段落列表 index = 0 + # 定义两个新的正则表达式模式 + pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') + pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') #排除遇到表格、填空的情况 def has_long_spaces(text, max_space_count=5): return any(len(space) > max_space_count for space in re.findall(r'\s+', text)) while index < len(paragraphs): current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 - + # 检查当前段落是否匹配任一排除模式 + if pattern_numbered.match(current_text) or pattern_parentheses.match(current_text): + # 如果匹配,则跳过当前段落,不添加到processed列表中 + index += 1 + continue # 检查当前段落是否为空 if current_text == '': # 确保有前一个和后一个段落 diff --git a/flask_app/main/无效标和废标和禁止投标整合.py b/flask_app/main/无效标和废标和禁止投标整合.py index 0edc0fe..5763144 100644 --- a/flask_app/main/无效标和废标和禁止投标整合.py +++ b/flask_app/main/无效标和废标和禁止投标整合.py @@ -100,6 +100,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords): print(f"handle_query 在处理 {result_key} 时发生异常: {e}") return {result_key: ""} +#TODO:增加提取逻辑 else里面的内容 def combine_find_invalid(file_path, output_folder, qualification): os.makedirs(output_folder, exist_ok=True) queries = [ diff --git a/flask_app/main/禁止投标情形.py b/flask_app/main/禁止投标情形.py index 6ed3b5b..b7cdf04 100644 --- a/flask_app/main/禁止投标情形.py +++ b/flask_app/main/禁止投标情形.py @@ -136,7 +136,7 @@ def merge_pdfs(paths, output_filename): return output_path -def find_forbidden(truncate_json_path, clause_path, qualification=""): +def find_forbidden(qualification=""): try: if qualification: file_id = upload_file(qualification)