11.13

commit 6e7da080bc
parent 59b7bed441
@@ -44,8 +44,8 @@ def parse_text_by_heading(text):
     skip_subheadings = False
     last_main_number = None
     temp_title = None  # temporarily holds a heading that starts with a dot but has no number
-    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
-    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
+    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')  # e.g. "一、"
+    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')  # e.g. "(一)"
     initial_heading_pattern = None
     special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:']  # keywords that mark special sections
     in_special_section = False  # flag: whether we are inside a special section
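
For reference, a minimal standalone sketch of what the two heading patterns match; the sample heading strings below are invented for illustration:

import re

pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')        # e.g. "一、"
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')  # e.g. "(一)"

for line in ["一、总则", "(三) 评标办法", "1.1 项目概况"]:
    m = pattern_numbered.match(line) or pattern_parentheses.match(line)
    # prints the captured Chinese numeral, or None when neither pattern matches
    print(line, "->", m.group(1) if m else None)
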
@@ -36,12 +36,19 @@ def read_tables_from_docx(file_path):
 def preprocess_paragraphs(paragraphs):
     processed = []  # list of processed paragraphs
     index = 0
+    # define two new regular-expression patterns for numbered headings
+    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
+    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
     # exclude cases where we hit a table or a fill-in-the-blank line
     def has_long_spaces(text, max_space_count=5):
         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
     while index < len(paragraphs):
         current_text = paragraphs[index].text.strip()  # strip leading/trailing whitespace from the current paragraph
+        # check whether the current paragraph matches either exclusion pattern
+        if pattern_numbered.match(current_text) or pattern_parentheses.match(current_text):
+            # if it matches, skip the current paragraph and do not add it to the processed list
+            index += 1
+            continue
         # check whether the current paragraph is empty
         if current_text == '':
             # make sure there is a previous and a next paragraph
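
A minimal sketch of what the new exclusion branch does, assuming only that paragraph objects expose a .text attribute (SimpleNamespace stands in for python-docx paragraphs here); preprocess_paragraphs_sketch is a simplified stand-in, not the full function:

import re
from types import SimpleNamespace

pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')

def preprocess_paragraphs_sketch(paragraphs):
    processed = []
    index = 0
    while index < len(paragraphs):
        current_text = paragraphs[index].text.strip()
        # new branch: drop paragraphs that look like "一、..." or "(一)..." headings
        if pattern_numbered.match(current_text) or pattern_parentheses.match(current_text):
            index += 1
            continue
        processed.append(current_text)
        index += 1
    return processed

paras = [SimpleNamespace(text=t) for t in ["一、投标须知", "正文内容。", "(二) 资格要求", "其他说明。"]]
print(preprocess_paragraphs_sketch(paras))  # ['正文内容。', '其他说明。']
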
@@ -100,6 +100,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
         print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
         return {result_key: ""}
 
+# TODO: add the extraction logic for the contents of the else branch
 def combine_find_invalid(file_path, output_folder, qualification):
     os.makedirs(output_folder, exist_ok=True)
     queries = [
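
The TODO records that extraction logic for the else branch is still missing. For context, a minimal sketch of the fallback convention visible in the lines above, with a made-up result_key and a simulated failure; it only illustrates that handle_query reports the exception and returns an empty value under result_key so callers that merge per-query results keep working:

def handle_query_sketch(result_key):
    try:
        raise RuntimeError("simulated parsing failure")  # placeholder error for illustration
    except Exception as e:
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: ""}

print(handle_query_sketch("invalid_bid"))  # {'invalid_bid': ''}
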
@@ -136,7 +136,7 @@ def merge_pdfs(paths, output_filename):
     return output_path
 
 
-def find_forbidden(truncate_json_path, clause_path, qualification=""):
+def find_forbidden(qualification=""):
     try:
         if qualification:
             file_id = upload_file(qualification)
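
The signature change means callers no longer pass truncate_json_path or clause_path. A minimal sketch of the narrowed control flow, with upload_file injected as a fake and the return values invented purely for illustration:

def find_forbidden_sketch(qualification="", upload_file=lambda path: "fake-file-id"):
    try:
        if qualification:
            # only upload when a qualification file is supplied
            file_id = upload_file(qualification)
            return {"uploaded": file_id}
        return {"uploaded": None}
    except Exception as e:
        print(f"find_forbidden failed: {e}")
        return {}

print(find_forbidden_sketch())                        # {'uploaded': None}
print(find_forbidden_sketch("qualification.docx"))    # {'uploaded': 'fake-file-id'}
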