12.30 Voided-bid items (废标项)
This commit is contained in:
parent 70b6a4b999
commit 8bfbfc4263
@@ -74,9 +74,12 @@ def read_tables_from_docx(file_path):
 def preprocess_paragraphs(paragraphs):
     processed = []  # initialize the list of processed paragraphs
     index = 0
+    flag = False  # initialize the flag
+
     # define two new regular expression patterns
     pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
     pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
+
     # define the pattern for list items
     list_item_pattern = re.compile(
         r'^\s*('
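For orientation, a minimal editorial sketch (not part of the commit) of what the two Chinese-numeral heading patterns above match; per the exclusion check preserved in the commented-out old version further down, matching paragraphs shorter than 8 characters are dropped rather than kept:

import re

pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')

print(pattern_numbered.match('三、评标办法').group(1))      # 三  -> short heading, filtered out
print(pattern_parentheses.match('(二)废标条款').group(1))  # 二  -> short heading, filtered out
print(pattern_numbered.match('报价超过最高限价的'))          # None -> ordinary text, kept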
@@ -91,6 +94,14 @@ def preprocess_paragraphs(paragraphs):
         r')'
     )

+    # new regular expressions for paragraphs that start with a numeric section number
+    pattern_numeric_header = re.compile(
+        r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)'  # matches e.g. '12.1 content'
+    )
+    pattern_numeric_header_fallback = re.compile(
+        r'^(\d+\.)\s*(.+)$'  # matches e.g. '12. content'
+    )
+
     # is there a run of whitespace longer than the given length: excludes tables and fill-in blanks
     def has_long_spaces(text, max_space_count=5):
         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
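An editorial sketch of how the two new header patterns divide the work: the primary pattern only accepts multi-level numbers such as '12.3.1', while the fallback catches the single-level '12.' form:

import re

pattern_numeric_header = re.compile(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)')
pattern_numeric_header_fallback = re.compile(r'^(\d+\.)\s*(.+)$')

print(pattern_numeric_header.match('12.3.1 投标文件的组成').groups())  # ('12.3.1', '投标文件的组成')
print(pattern_numeric_header.match('12. 废标条款'))                    # None -> try the fallback
print(pattern_numeric_header_fallback.match('12. 废标条款').groups())  # ('12.', '废标条款')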
@@ -101,7 +112,10 @@ def preprocess_paragraphs(paragraphs):
     # helper: find the previous non-empty, non-marker paragraph
     def find_prev_text(current_index):
         for i in range(current_index - 1, -1, -1):
+            try:
                 text = paragraphs[i].text.strip()
+            except AttributeError:
+                continue  # skip paragraph objects that have no text attribute
             if text and not pattern_marker.search(text):
                 return text, i
         return '', -1
@@ -109,13 +123,21 @@ def preprocess_paragraphs(paragraphs):
     # helper: find the next non-empty, non-marker paragraph
     def find_next_text(current_index):
         for i in range(current_index + 1, len(paragraphs)):
+            try:
                 text = paragraphs[i].text.strip()
+            except AttributeError:
+                continue  # skip paragraph objects that have no text attribute
             if text and not pattern_marker.search(text):
                 return text, i
         return '', -1

     while index < len(paragraphs):
+        try:
             current_text = paragraphs[index].text.strip()  # strip surrounding whitespace from the current paragraph
+        except AttributeError:
+            # the paragraph object has no text attribute; skip it
+            index += 1
+            continue
+
         # check whether the current paragraph is a page marker
         if pattern_marker.search(current_text):
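The branch this hunk ends on keys off pattern_marker, which (per the commented-out old version further down) is re.compile(r'\$\$index_mark_\d+\$\$') and detects the page markers presumably inserted by insert_mark (see the __main__ block at the bottom). A quick editorial sketch:

import re

pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

print(bool(pattern_marker.search('$$index_mark_12$$')))  # True  -> page marker, handled by the merge/skip branch
print(bool(pattern_marker.search('12.7 废标条款')))       # False -> ordinary paragraph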
@@ -147,7 +169,11 @@ def preprocess_paragraphs(paragraphs):
             # from the current index, skip all consecutive blank paragraphs and markers
             skip_index = index + 1
             while skip_index < len(paragraphs):
+                try:
                     skip_text = paragraphs[skip_index].text.strip()
+                except AttributeError:
+                    skip_index += 1
+                    continue  # skip paragraph objects that have no text attribute
                 if skip_text == '' or pattern_marker.search(skip_text):
                     skip_index += 1
                 else:
@@ -161,12 +187,155 @@ def preprocess_paragraphs(paragraphs):
             index += 1
             continue

-        # if the current paragraph is not a marker and does not match the exclusion patterns, add it to the processed list
+        # check whether the paragraph starts with a numeric section number
+        match = pattern_numeric_header.match(current_text)
+        if not match:
+            match = pattern_numeric_header_fallback.match(current_text)
+
+        if match:
+            # the paragraph starts with a numeric section number; append it to processed directly
+            processed.append(current_text)
+            flag = True  # set the flag; the next paragraph will be examined
+            index += 1
+            continue
+        else:
+            if flag:
+                # the paragraph does not start with a numeric section number and flag is True
+                if not list_item_pattern.match(current_text):
+                    if processed:
+                        # **new logic begins**
+                        next_non_empty_text, next_non_empty_index = find_next_text(index)
+                        is_next_numbered = False
+                        if next_non_empty_text:
+                            is_next_numbered = bool(
+                                pattern_numeric_header.match(next_non_empty_text) or
+                                pattern_numeric_header_fallback.match(next_non_empty_text)
+                            )
+
+                        if is_next_numbered:
+                            # append the current paragraph to the previous one only if the next paragraph starts with a numeric section number
+                            processed[-1] = processed[-1] + ' ' + current_text
+                        else:
+                            # otherwise do not append; add it as a new paragraph instead
+                            processed.append(current_text)
+                        # **new logic ends**
+                else:
+                    # **new handling: paragraphs that match list_item_pattern should also be kept**
+                    processed.append(current_text)
+                # reset the flag regardless of whether anything was appended
+                flag = False
+                index += 1
+                continue
+            else:
+                # flag is False; append to processed directly
                 processed.append(current_text)
                 index += 1
+                continue

     return processed

+# old version
+# def preprocess_paragraphs(paragraphs):
+#     processed = []  # initialize the list of processed paragraphs
+#     index = 0
+#     # define two new regular expression patterns
+#     pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
+#     pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
+#     # define the pattern for list items
+#     list_item_pattern = re.compile(
+#         r'^\s*('
+#         r'[(\(]\d+[)\)]|'  # matches: (1) or (1)
+#         r'[A-Za-z]\.\s*|'  # matches: A. or b.
+#         r'[一二三四五六七八九十]+、|'  # matches: 一、 二、 三、
+#         r'第[一二三四五六七八九十百零]+[章节部分节]|'  # matches: 第x章, 第x部分, 第x节
+#         r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\)]?|'  # matches: A1.2 etc.
+#         r'\d+(?:\.\d+)+[\s\.、.)\)]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # matches numeric section numbers such as 1.1, 1.1.1
+#         r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # a number followed by a space, where the space is not followed by one of the listed keywords
+#         r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # a number directly followed by 、 or a dot
+#     )
+#
+#     # is there a run of whitespace longer than the given length: excludes tables and fill-in blanks
+#     def has_long_spaces(text, max_space_count=5):
+#         return any(len(space) > max_space_count for space in re.findall(r'\s+', text))
+#
+#     # regular expression used to detect page markers
+#     pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')
+#
+#     # helper: find the previous non-empty, non-marker paragraph
+#     def find_prev_text(current_index):
+#         for i in range(current_index - 1, -1, -1):
+#             text = paragraphs[i].text.strip()
+#             if text and not pattern_marker.search(text):
+#                 return text, i
+#         return '', -1
+#
+#     # helper: find the next non-empty, non-marker paragraph
+#     def find_next_text(current_index):
+#         for i in range(current_index + 1, len(paragraphs)):
+#             text = paragraphs[i].text.strip()
+#             if text and not pattern_marker.search(text):
+#                 return text, i
+#         return '', -1
+#
+#     while index < len(paragraphs):
+#         try:
+#             current_text = paragraphs[index].text.strip()  # strip surrounding whitespace from the current paragraph
+#         except AttributeError:
+#             # the paragraph object has no text attribute; skip it
+#             index += 1
+#             continue
+#
+#         # check whether the current paragraph is a page marker
+#         if pattern_marker.search(current_text):
+#             # dynamically find the previous non-empty paragraph
+#             prev_text, prev_index = find_prev_text(index)
+#             # dynamically find the next non-empty paragraph
+#             next_text, next_index = find_next_text(index)
+#
+#             # apply the existing merge logic
+#             if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
+#                 if not prev_text.endswith(('。', '!', '?')):  # ',', ',' commented out for now: with a comma the sentence may not have ended.
+#                     # check whether the next paragraph is a list item
+#                     if not list_item_pattern.match(next_text) and len(prev_text) > 30:
+#                         # merge the previous and next paragraphs
+#                         merged_text = prev_text + ' ' + next_text  # add a space for readability
+#                         if prev_index < len(paragraphs):
+#                             # remove the previous paragraph from processed
+#                             if processed and processed[-1] == prev_text:
+#                                 processed.pop()
+#                             # append the merged text
+#                             processed.append(merged_text)
+#
+#                             # skip the marker and all surrounding blank paragraphs, up to next_index
+#                             index = next_index + 1
+#                             continue  # move on to the next iteration
+#
+#             # if the merge conditions are not met, skip the marker and the blank paragraphs around it
+#             # compute the next index that needs processing
+#             # from the current index, skip all consecutive blank paragraphs and markers
+#             skip_index = index + 1
+#             while skip_index < len(paragraphs):
+#                 skip_text = paragraphs[skip_index].text.strip()
+#                 if skip_text == '' or pattern_marker.search(skip_text):
+#                     skip_index += 1
+#                 else:
+#                     break
+#             index = skip_index
+#             continue  # move on to the next iteration
+#
+#         # check whether the current paragraph matches either exclusion pattern
+#         if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
+#             # if it matches, skip the paragraph and do not add it to the processed list
+#             index += 1
+#             continue
+#
+#         # if the current paragraph is not a marker and does not match the exclusion patterns, add it to the processed list
+#         processed.append(current_text)
+#         index += 1
+#
+#     return processed

 # if the current paragraph has a section number, match downward until the same numbering style is encountered
 # if the current paragraph has no section number, match section numbers downward and pull out all section numbers of the same kind.
 def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
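To make the new flag logic concrete: a non-numbered paragraph that follows a numbered one is glued onto it only when the next non-empty paragraph is itself numbered, i.e. when it looks like the tail of the previous clause rather than the start of free text. A standalone editorial sketch with simplified stand-ins (plain strings instead of python-docx paragraphs; markers, exclusion patterns and list items are ignored):

import re

pattern_numeric_header = re.compile(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)')
pattern_numeric_header_fallback = re.compile(r'^(\d+\.)\s*(.+)$')

def is_numbered(text):
    return bool(pattern_numeric_header.match(text) or pattern_numeric_header_fallback.match(text))

def merge_continuations(lines):
    # simplified version of the commit's flag logic: no markers, exclusions or list items
    processed, flag = [], False
    for i, text in enumerate(lines):
        if is_numbered(text):
            processed.append(text)
            flag = True
            continue
        nxt = next((t for t in lines[i + 1:] if t.strip()), '')
        if flag and processed and is_numbered(nxt):
            processed[-1] += ' ' + text  # continuation of the previous numbered clause
        else:
            processed.append(text)
        flag = False
    return processed

print(merge_continuations([
    '32.1 投标文件未按规定密封和标记的,',
    '作废标处理。',
    '32.2 投标有效期不足的,作废标处理。',
]))
# ['32.1 投标文件未按规定密封和标记的, 作废标处理。', '32.2 投标有效期不足的,作废标处理。']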
@@ -224,7 +393,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):

         # for i in range(1, 6):  # same level; duplicates matched_patterns = [pattern], so commented out
         #     # copy the parts list to avoid modifying the original
-        #     new_parts = parts.copy()
+        #     new_parts = parts. copy()
         #     new_parts[-1] = str(int(new_parts[-1]) + i)
         #     # use a different separator
         #     next_pattern = r'^' + r'\s*[..]\s*'.join(new_parts)
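The commented-out block above probed sibling section numbers by bumping the last component of the current number and joining the parts with a dot that may be full- or half-width. A small editorial illustration of what that construction yields (parts = ['3', '2'] is a hypothetical example):

parts = ['3', '2']  # hypothetical: the current clause number 3.2, already split into components
new_parts = parts.copy()
new_parts[-1] = str(int(new_parts[-1]) + 1)          # probe the next sibling, 3.3
next_pattern = r'^' + r'\s*[..]\s*'.join(new_parts)
print(next_pattern)  # ^3\s*[..]\s*3 -> matches '3.3' as well as '3.3'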
@@ -262,6 +431,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
         while current_index < len(processed_paragraphs) - 1:
             current_index += 1
             next_text = processed_paragraphs[current_index].strip()
+            # handle blank lines
+            if not next_text:
+                continue  # skip blank lines and move on to the next iteration
             if not found_next_number:
                 # updated regex: also supports the '数字 、' (number followed by 、) format
                 next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)',
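The hunk's comment says the regex was extended so the '数字 、' style is also supported via the new third alternative (\d+\s*、). The re.match call's remaining arguments are cut off by the hunk, so this editorial check passes sample headings directly and only reports whether a match is produced at all:

import re

pattern = r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)'

for sample in ['2.1 评标办法', '(3)资格审查', '3 、废标情形', '其他说明']:
    print(sample, '->', bool(re.match(pattern, sample)))
# 2.1 评标办法 -> True
# (3)资格审查 -> True
# 3 、废标情形 -> True
# 其他说明 -> False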
@@ -289,7 +461,9 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
     processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
     index = 0
     while index < len(processed_paragraphs):
+        # print(processed_paragraphs[index].strip())
         index = extract_from_text(processed_paragraphs[index].strip(), index)
+        # print("--------------")
         index += 1

     return extracted_paragraphs
@@ -664,7 +838,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
     print("无效标与废标done...")
     return {"无效标与废标项": combined_dict}

-# TODO: tweak the prompt wording for the invalid-bid (无效标) part here
+# TODO: test whether preprocess has any issues
 if __name__ == '__main__':
     start_time = time.time()
     # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
@@ -672,12 +846,12 @@ if __name__ == '__main__':
     # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
     # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
     # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
-    pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'
+    pdf_path = r'C:\Users\Administrator\Desktop\货物标\truncate_all\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).pdf'

-    output_dir = r"D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\tmp"
-    # invalid_added=insert_mark(pdf_path)
-    # invalid_added_docx=pdf2docx(invalid_added)
-    invalid_added_docx=r'D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\invalid_added.docx'
+    output_dir = r"C:\Users\Administrator\Desktop\货物标\truncate_all"
+    # invalid_added = insert_mark(pdf_path)
+    # invalid_added_docx = pdf2docx(invalid_added)
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\货物标\truncate_all\invalid_added.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))