12.6 优化解析
This commit is contained in:
parent
c771f801db
commit
e5e63e400b
@ -76,7 +76,7 @@ def clean_page_content(text, common_header):
|
|||||||
text = re.sub(r'^第\d+页\s*', '', text)
|
text = re.sub(r'^第\d+页\s*', '', text)
|
||||||
# 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
|
# 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
|
||||||
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 投标人须知这块, 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
|
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 投标人须知这块, 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
|
||||||
text = re.sub(r'^\s*\/\s*(共\s*)?\d{1,3}\s*(页)?\s*', '', text) #删除/123 /共123 /共123页 /123页
|
text = re.sub(r'^\s*\/\s*(共\s*)?\d+\s*(页)?\s*', '', text) #删除/123 /共123 /共123页 /123页
|
||||||
text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
@ -169,10 +169,30 @@ def parse_text_by_heading(text):
|
|||||||
append_newline = True
|
append_newline = True
|
||||||
else:
|
else:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
||||||
|
|
||||||
elif dot_text_match:
|
elif dot_text_match:
|
||||||
# 处理以点号开头但不带数字的情况,存储到临时变量
|
# 处理以点号开头但不带数字的情况,自动生成下一个一级序号
|
||||||
temp_title = dot_text_match.group(1).strip()
|
temp_content = dot_text_match.group(1).strip()
|
||||||
|
if last_main_number:
|
||||||
|
# 生成下一个一级序号
|
||||||
|
try:
|
||||||
|
# 尝试将 last_main_number 转为整数并加1
|
||||||
|
next_main_number = str(int(last_main_number) + 1) + '.'
|
||||||
|
except ValueError:
|
||||||
|
# 如果 last_main_number 不是纯数字,处理为中文序号或其他逻辑
|
||||||
|
# 这里假设 last_main_number 是阿拉伯数字
|
||||||
|
next_main_number = '未识别的序号.'
|
||||||
|
else:
|
||||||
|
# 如果没有上一个主编号,默认从 '1.' 开始
|
||||||
|
next_main_number = '1.'
|
||||||
|
# 更新 current_key 和 last_main_number
|
||||||
|
if current_key is not None:
|
||||||
|
content_string = ''.join(current_content).strip()
|
||||||
|
data[current_key] = data.get(current_key, '') + content_string
|
||||||
|
current_key = next_main_number
|
||||||
|
current_content = [temp_content] # 不直接添加 '\n'
|
||||||
|
last_main_number = next_main_number.rstrip('.')
|
||||||
|
append_newline = True # 设置标志,下一次内容添加时需要 '\n'
|
||||||
|
|
||||||
continue # 跳过进一步处理该行
|
continue # 跳过进一步处理该行
|
||||||
|
|
||||||
elif pure_number_match:
|
elif pure_number_match:
|
||||||
@ -245,6 +265,7 @@ def parse_text_by_heading(text):
|
|||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
content_string = ''.join(current_content).strip()
|
content_string = ''.join(current_content).strip()
|
||||||
data[current_key] = data.get(current_key, '') + content_string
|
data[current_key] = data.get(current_key, '') + content_string
|
||||||
|
current_content=[]
|
||||||
if current_key_chinese is not None:
|
if current_key_chinese is not None:
|
||||||
data[current_key_chinese] = current_value_chinese
|
data[current_key_chinese] = current_value_chinese
|
||||||
current_key_chinese = None
|
current_key_chinese = None
|
||||||
|
@ -119,7 +119,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||||
# file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
|
# file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
|
||||||
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||||
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
||||||
|
@ -219,7 +219,6 @@ def rename_keys(data):
|
|||||||
# 对整个数据结构进行递归重命名
|
# 对整个数据结构进行递归重命名
|
||||||
return rename_keys_recursive(data)
|
return rename_keys_recursive(data)
|
||||||
|
|
||||||
|
|
||||||
def combine_and_update_results(original_data, updates):
|
def combine_and_update_results(original_data, updates):
|
||||||
"""
|
"""
|
||||||
先规范化original和updates中的字典,防止空格的情况导致匹配不上无法更新
|
先规范化original和updates中的字典,防止空格的情况导致匹配不上无法更新
|
||||||
|
@ -175,9 +175,11 @@ def process_folder(input_folder, output_folder):
|
|||||||
print(f"Error processing {file_name}: {e}")
|
print(f"Error processing {file_name}: {e}")
|
||||||
|
|
||||||
#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
|
#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
|
||||||
|
#TODO: headings such as '19、竞争性磋商响应文件的加密' — the case where the leading number '19' is missing is not handled yet
|
||||||
|
#TODO: lines such as '.不予受理的情形' — the '.' must be followed by a Chinese character or a space
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
||||||
file_path=r'C:\Users\Administrator\Desktop\货物标\output4\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||||
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
|
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user