11.6 Bug fixes
parent c0ba74a374
commit 21eeb87903
@@ -34,7 +34,7 @@ def preprocess_data(data):
 
 
 # 转换结构化的JSON数据
-#No parent found at level 1 for key '24.2'. Check the data structure.
+#生成结构化的数据
 def transform_json(data):
     result = {}
     temp = {0: result}  # 初始化根字典
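
Note on the hunk above: `temp = {0: result}` keeps, for each nesting level, the dict that should receive the next key at that level; the replaced comment ("No parent found at level 1 for key '24.2'") shows the old code could fail to find a parent for dotted keys such as '24.2'. Below is a rough, hypothetical sketch of that level-indexed parent-map technique, not the project's actual `transform_json` body:

# Hypothetical illustration of nesting dotted keys with a level-indexed
# parent map, the technique hinted at by `temp = {0: result}` above.
def nest_dotted_keys(flat):
    result = {}
    temp = {0: result}  # level -> dict that receives keys at that level
    for key, value in flat.items():
        level = key.count('.')            # '24.2' -> level 1
        parent = temp.get(level, result)  # fall back to the root if the level is unknown
        parent[key] = {'value': value}
        temp[level + 1] = parent[key]     # children of this key go one level deeper
    return result

print(nest_dotted_keys({'24': 'a', '24.2': 'b', '24.2.1': 'c'}))
# {'24': {'value': 'a', '24.2': {'value': 'b', '24.2.1': {'value': 'c'}}}}
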
@@ -183,19 +183,35 @@ def process_nested_data(data):
     # 到达最内层,处理非字典和非列表的元素(字符串)
     return post_process(data)
 
-def get_requirements_with_gpt(invalid_path, selection):
+#生成无结构的数据
+def concatenate_keys_values(section_content):
+    """
+    将章节内容的键值对拼接成一个字符串列表,每个元素为 "key value"。
+
+    Args:
+        section_content (dict): 章节内容的键值对。
+
+    Returns:
+        list of str: 拼接后的字符串列表。
+    """
+    concatenated = []
+    for key, value in section_content.items():
+        concatenated.append(f"{key} {value}")
+    return concatenated
+
+def get_requirements_with_gpt(merged_baseinfo_path, selection):
     """
     根据 selection 的值选择相应的用户查询,并调用大模型获取要求。
 
     Args:
-        invalid_path (str): 无效文件的路径,用于上传。
+        merged_baseinfo_path (str): 无效文件的路径,用于上传。
         selection (int): 选择的类型(1、2 或 3)。
 
     Returns:
         dict: 大模型返回的要求结果,或错误信息。
     """
     # 上传文件并获取 file_id
-    file_id = upload_file(invalid_path)
+    file_id = upload_file(merged_baseinfo_path)
     # 定义 selection 对应的用户查询
     user_queries = {
         # 1: """
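
The new `concatenate_keys_values` helper simply flattens a section's key/value pairs into "key value" strings. A self-contained usage sketch (clause numbers and text invented; the helper is restated here as a one-liner with the same behaviour):

# Usage sketch for the helper added above; sample data is invented.
def concatenate_keys_values(section_content):
    return [f"{key} {value}" for key, value in section_content.items()]

section_content = {"3.1": "投标文件的组成", "3.2": "投标报价"}
print(concatenate_keys_values(section_content))
# ['3.1 投标文件的组成', '3.2 投标报价']
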
@@ -95,8 +95,8 @@ def extract_text_by_page(file_path):
 
 
 if __name__ == '__main__':
-    file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
-    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
+    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
+    file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\54d7aa40-8b36-4803-b6f7-278356ff1381\\ztbfile_tobidders_notice_part2.pdf'
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
     # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
@@ -2,7 +2,7 @@ import json
 import re
 from functools import cmp_to_key
 
-from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt
+from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt,concatenate_keys_values
 
 
 #提取两个大标题之间的内容
@@ -103,19 +103,26 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type):
 
         # 提取目标部分
         extracted_data = extract_between_sections(data, target_values)  # 读取json,截取大标题之间的内容
-        transformed_data = process_with_outer_key(extracted_data)
-        final_result = process_nested_data(transformed_data)
-        if not final_result:
-            final_result = get_requirements_with_gpt(merged_baseinfo_path, type)  #万一没用正则匹配到,那就调用大模型
-        return final_result
+        if not extracted_data:
+            final_result = get_requirements_with_gpt(merged_baseinfo_path, type)  #万一没用正则匹配到,那就调用大模型
+            return final_result
+        # print(json.dumps(extracted_data,ensure_ascii=False,indent=4))
+        extracted_data_concatenated = {section: concatenate_keys_values(content)
+                                       for section, content in extracted_data.items()}
+
+        return extracted_data_concatenated
+        # transformed_data = process_with_outer_key(extracted_data)
+        # final_result = process_nested_data(transformed_data)
+        # return final_result
+
 
     except Exception as e:
         print(f"Error in extract_from_notice: {e}")
         return DEFAULT_RESULT
 
 if __name__ == "__main__":
-    clause_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\clause1.json'
-    merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\648e094b-e677-47ce-9073-09e0c82af210\ztbfile_merged_baseinfo.pdf"
+    clause_path = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\clause1.json'
+    merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_merged_baseinfo.pdf"
     # file_path = 'D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\clause1.json'
     try:
         res = extract_from_notice(merged_baseinfo_path,clause_path, 1)  # 可以改变此处的 type 参数测试不同的场景
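
The control flow of `extract_from_notice` changes with this hunk: the GPT fallback now runs only when the regex-based extraction returns nothing, and a successful extraction is returned as flattened "key value" lists per section instead of the old `process_nested_data` result. A standalone skeleton of that branch, with stubbed bodies rather than the project's real functions:

# Skeleton of the new control flow; bodies are stubbed so the sketch runs on its own.
def get_requirements_with_gpt(merged_baseinfo_path, selection):
    return {"note": "fallback result from the LLM"}        # stub

def concatenate_keys_values(section_content):               # same behaviour as the new helper
    return [f"{key} {value}" for key, value in section_content.items()]

def extract_from_notice_sketch(extracted_data, merged_baseinfo_path, selection):
    if not extracted_data:                                   # regex extraction found nothing
        return get_requirements_with_gpt(merged_baseinfo_path, selection)
    return {section: concatenate_keys_values(content)       # otherwise: flattened per-section lists
            for section, content in extracted_data.items()}

print(extract_from_notice_sketch({}, "ztbfile_merged_baseinfo.pdf", 1))
print(extract_from_notice_sketch({"开标": {"24.2": "开标地点见前附表"}}, "ztbfile_merged_baseinfo.pdf", 1))
# {'note': 'fallback result from the LLM'}
# {'开标': ['24.2 开标地点见前附表']}
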
@@ -105,12 +105,14 @@ def should_add_newline(content, keywords, max_length=20):
     content_str = ''.join(content).strip()
     return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
 
-def handle_content_append(current_content, line_content, append_newline, keywords):
+def handle_content_append(current_content, line_content, append_newline, keywords,in_special_section):
     if append_newline:
         if should_add_newline(current_content, keywords):
             current_content.append('\n')  # 添加换行符
         append_newline = False
     current_content.append(line_content)
+    if in_special_section:
+        current_content.append('\n')
     return append_newline
 
 """
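
The extra `in_special_section` parameter makes the parser append a newline after every content line while it is inside a "文件的组成 / 文件的构成" section, so each enumerated item ends up on its own line. A standalone check (sample lines invented; `should_add_newline` is only needed when `append_newline` is True, so it is not exercised here):

# Stand-alone check of the new in_special_section behaviour; the body mirrors
# the function added above, and the sample lines are invented.
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    if append_newline:
        if should_add_newline(current_content, keywords):  # not reached in this demo
            current_content.append('\n')
        append_newline = False
    current_content.append(line_content)
    if in_special_section:
        current_content.append('\n')
    return append_newline

content = ['11.文件的组成']
handle_content_append(content, '(1)营业执照', False, [], in_special_section=True)
handle_content_append(content, '(2)资质证书', False, [], in_special_section=True)
print(repr(''.join(content)))
# '11.文件的组成(1)营业执照\n(2)资质证书\n'
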
@@ -134,7 +136,11 @@ def parse_text_by_heading(text):
     skip_subheadings = False
     last_main_number = None
     temp_title = None  # 临时存储以点号开头但不带数字的标题
+    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
+    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
+    initial_heading_pattern = None
+    special_section_keywords = ['文件的组成', '文件的构成']  # 定义特殊章节关键词
+    in_special_section = False  # 标志是否在特殊章节中
     lines = text.split('\n')
 
     def check_year_pattern(line):
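
The two pre-compiled patterns split the heading styles that the old combined `pattern_title` regex handled in one expression: `pattern_numbered` matches Chinese-numeral headings such as "三、", while `pattern_parentheses` matches the parenthesized form "(二)". A quick check with invented sample lines:

import re

pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')

print(pattern_numbered.match('三、评标办法').group(1))       # 三
print(pattern_parentheses.match('(二) 资格审查').group(1))   # 二
print(pattern_numbered.match('(二) 资格审查'))               # None -> handled by pattern_parentheses
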
@@ -150,11 +156,22 @@ def parse_text_by_heading(text):
             current_content.append(line_stripped)
             continue
 
+        # **首先检查是否进入特殊章节**
+        if any(keyword in line_stripped for keyword in special_section_keywords):
+            in_special_section = True
+
         # 匹配带数字的标题,例如 '12.1 内容'
         match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
         if not match:
             match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
 
+        # 检查是否退出特殊章节
+        if in_special_section:
+            # 如果匹配到了主要标题,且不包含特殊关键词,则退出特殊章节
+            if match and not any(keyword in line_stripped for keyword in special_section_keywords):
+                in_special_section = False
+
+        # 以下是原有的匹配逻辑
         # 匹配以点号开头并带有数字的情况,例如 '.12.1 内容'
         dot_match = re.match(r'^[..](\d+(?:[..]\d+)*)\s*(.+)$', line_stripped)
 
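
The flag works as a small state machine: a line containing one of the keywords switches it on, and the next numbered heading that does not contain a keyword switches it off (the TODO at the end of this commit notes that triggering on any line containing '文件的组成', rather than only on numbered headings like '11.文件的组成', is still too loose). A reduced, standalone sketch of that enter/exit logic with invented lines:

import re

# Reduced sketch of the special-section state machine added above;
# the sample lines are invented.
special_section_keywords = ['文件的组成', '文件的构成']
in_special_section = False

sample_lines = [
    '11.文件的组成',   # keyword -> enter the special section
    '(1)营业执照',    # no numbered heading -> stay inside
    '12.投标报价',     # numbered heading without keyword -> exit
]
for line_stripped in sample_lines:
    if any(keyword in line_stripped for keyword in special_section_keywords):
        in_special_section = True
    match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
    if not match:
        match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
    if in_special_section and match and not any(
            keyword in line_stripped for keyword in special_section_keywords):
        in_special_section = False
    print(line_stripped, in_special_section)
# 11.文件的组成 True
# (1)营业执照 True
# 12.投标报价 False
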
@@ -197,7 +214,7 @@ def parse_text_by_heading(text):
                 append_newline = len(new_key.rstrip('.').split('.')) <= 2
                 last_main_number = new_key.split('.')[0]
             else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
 
         elif dot_match:
             # 处理以点号开头并带有数字的情况
@@ -212,14 +229,15 @@ def parse_text_by_heading(text):
                 current_content = [line_content]
                 append_newline = True
             else:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
 
         elif dot_text_match:
             # 处理以点号开头但不带数字的情况,存储到临时变量
             temp_title = dot_text_match.group(1).strip()
             continue  # 跳过进一步处理该行
 
-        elif pure_number_match:  # 处理不带点号的纯数字开头的情况,例如 '27xxxxx'
+        elif pure_number_match:
+            # 处理不带点号的纯数字开头的情况
             new_key_candidate, line_content = pure_number_match.groups()
             new_key_candidate += '.'  # 添加点号
             line_content = line_content.lstrip('..、,')
@@ -247,69 +265,86 @@ def parse_text_by_heading(text):
                 current_content = [line_content]
                 append_newline = True
                 last_main_number = new_key_candidate.rstrip('.')
-                # if current_key is None or (current_key != new_key and ( #不给序号排序
-                #         len(current_content) == 0 or current_content[-1][-1] != '第')):
-                #     if current_key is not None:
-                #         content_string = ''.join(current_content).strip()
-                #         data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
-                #     current_key = new_key
-                #     current_content = [line_content]
-                #     append_newline = len(new_key.rstrip('.').split('.')) <= 2
-                #     last_main_number = new_key.split('.')[0]
             else:
                 # 将当前行视为当前标题的内容
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
 
         else:
-            if not skip_subheadings:  # 合并中文标题、字母标题、阿拉伯数字标题的匹配逻辑
-                pattern_title = re.compile(
-                    r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
-                chinese_match = pattern_title.match(line_stripped)
+            if not skip_subheadings and not in_special_section:  # 增加 in_special_section 判断
+                numbered_match = pattern_numbered.match(line_stripped)
+                parenthesis_match = pattern_parentheses.match(line_stripped)
                 letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
                 arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
 
-                # 优化处理逻辑,减少冗余
-                if chinese_match or letter_match or arabic_match:
-                    # 保存之前的 key 的内容(无论是中文标题还是阿拉伯数字标题)
-                    if current_key is not None:
-                        content_string = ''.join(current_content).strip()
-                        data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
-                    if current_key_chinese is not None:
-                        data[current_key_chinese] = current_value_chinese
-                        current_key_chinese = None
+                # 判断当前行是否匹配了任何标题模式
+                if numbered_match or parenthesis_match or letter_match or arabic_match:
+                    # 如果初始标题模式尚未设置,则记录当前匹配的标题模式
+                    if initial_heading_pattern is None:
+                        if numbered_match:
+                            initial_heading_pattern = 'numbered'
+                        elif parenthesis_match:
+                            initial_heading_pattern = 'parentheses'
+                        elif letter_match:
+                            initial_heading_pattern = 'letter'
+                        elif arabic_match:
+                            initial_heading_pattern = 'arabic'
 
-                    # 处理中文标题
-                    if chinese_match:
-                        current_key_chinese = chinese_match.group(1) or chinese_match.group(2)
-                        current_value_chinese = line_stripped[chinese_match.end():].lstrip('..、,').replace(' ', '')
-                        if current_key_chinese in data:
-                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
-                            current_key_chinese = None
-                    # 处理字母标题
+                    # 确定当前匹配的标题模式
+                    if numbered_match:
+                        current_heading_pattern = 'numbered'
+                    elif parenthesis_match:
+                        current_heading_pattern = 'parentheses'
                     elif letter_match:
-                        letter_key, letter_value = letter_match.groups()
-                        letter_to_chinese = {
-                            'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
-                            'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
-                            'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
-                        }
-                        current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
-                        current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
-                        if current_key_chinese in data:
-                            handle_content_append(current_content, '\n' + line_stripped, append_newline, keywords)
-                            current_key_chinese = None
-                    # 处理阿拉伯数字标题
+                        current_heading_pattern = 'letter'
                     elif arabic_match:
-                        arabic_key, arabic_value = arabic_match.groups()
-                        current_key = arabic_key.replace('、', '.')
-                        current_content = [arabic_value]
-                        append_newline = True
-                        last_main_number = current_key.rstrip('.')
-                        continue
+                        current_heading_pattern = 'arabic'
 
-            # 如果没有匹配标题,继续处理正文内容
-            if line_stripped:
-                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
+                    # 如果当前标题模式与初始标题模式一致,创建新的键值对
+                    if current_heading_pattern == initial_heading_pattern:
+                        # 保存之前的 key 的内容
+                        if current_key is not None:
+                            content_string = ''.join(current_content).strip()
+                            data[current_key] = data.get(current_key, '') + content_string.replace(' ', '')
+                        if current_key_chinese is not None:
+                            data[current_key_chinese] = current_value_chinese
+                            current_key_chinese = None
+
+                        # 处理匹配到的标题
+                        if current_heading_pattern == 'numbered':
+                            current_key_chinese = numbered_match.group(1)
+                            current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '')
+                        elif current_heading_pattern == 'parentheses':
+                            current_key_chinese = parenthesis_match.group(1)
+                            current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('..、,').replace(' ', '')
+                        elif current_heading_pattern == 'letter':
+                            # 字母标题处理
+                            letter_key, letter_value = letter_match.groups()
+                            letter_to_chinese = {
+                                'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
+                                'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
+                                'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
+                            }
+                            current_key_chinese = letter_to_chinese.get(letter_key, letter_key)
+                            current_value_chinese = letter_value.lstrip('..、,').replace(' ', '')
+                        elif current_heading_pattern == 'arabic':
+                            arabic_key, arabic_value = arabic_match.groups()
+                            current_key = arabic_key.replace('、', '.')
+                            current_content = [arabic_value]
+                            append_newline = True
+                            last_main_number = current_key.rstrip('.')
+                            continue
+                    else:
+                        # 当前标题模式与初始模式不一致,将该行视为内容
+                        if line_stripped:
+                            append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+                else:
+                    # 未匹配到任何标题模式,将该行视为内容
+                    if line_stripped:
+                        append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
+            else:
+                # 在特殊章节中,所有内容都作为当前标题的内容
+                if line_stripped:
+                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section)
 
     # 最后保存最后一个 key 对应的内容
     if current_key is not None:
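
The net effect of the `initial_heading_pattern` / `current_heading_pattern` bookkeeping is that the parser locks onto whichever sub-heading style appears first ("一、", "(一)", "A." or "1、") and treats headings written in any other style as plain content. A reduced, self-contained sketch of that rule, not the full parser, with invented sample text:

import re

# Reduced sketch of the "lock onto the first heading style seen" rule.
PATTERNS = {
    'numbered': re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*'),
    'parentheses': re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*'),
    'letter': re.compile(r'^([A-Z])\.\s*(.*)$'),
    'arabic': re.compile(r'^(\d+、)\s*(.+)$'),
}

def classify(line):
    for name, pattern in PATTERNS.items():
        if pattern.match(line):
            return name
    return None

initial_heading_pattern = None
for line in ['一、总则', '(一)适用范围', '二、定义']:
    current_heading_pattern = classify(line)
    if current_heading_pattern and initial_heading_pattern is None:
        initial_heading_pattern = current_heading_pattern  # first style seen wins
    if current_heading_pattern == initial_heading_pattern:
        print('new key :', line)    # would start a new entry in the parsed dict
    else:
        print('content :', line)    # other heading styles are kept as content
# new key : 一、总则
# content : (一)适用范围
# new key : 二、定义
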
@@ -448,11 +483,12 @@ def process_folder(input_folder, output_folder):
 
 #TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
 #TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理
+#TODO:11.6 目前 '文件的组成' 是匹配任意行,可以考虑只匹配'11.文件的组成' 前面有序号的行 a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf还有问题
 if __name__ == "__main__":
     # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
+    file_path=r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf'
     # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
-    output_folder = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\tmp'
+    output_folder = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\tmp'
     try:
         output_path = convert_clause_to_json(file_path,output_folder,1)
         print(f"Final JSON result saved to: {output_path}")
|
@ -243,6 +243,9 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
|||||||
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
|
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
|
||||||
#TODO: 目前跳转可能有个问题,资格审查那边:既有原来的内容又有跳转后的内容;符合本采购文件第一章第二款要求,并提供合格有效的证明材料<br>1、满足《中华人民共和国政府采购法》第二十二条规定,即:<br>(1)具有独立承担
|
#TODO: 目前跳转可能有个问题,资格审查那边:既有原来的内容又有跳转后的内容;符合本采购文件第一章第二款要求,并提供合格有效的证明材料<br>1、满足《中华人民共和国政府采购法》第二十二条规定,即:<br>(1)具有独立承担
|
||||||
|
|
||||||
|
#TODO:资格审查默认加上第一章的内容, 资格审查章节不做跳转逻辑.
|
||||||
|
#TODO:开评定标那块,不做层次遍历 ing
|
||||||
|
#TODO:技术要求提取更全面一点 fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628
|
||||||
#good_list 金额 截取上下文
|
#good_list 金额 截取上下文
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 配置日志器
|
# 配置日志器
|
||||||
|