1.10
This commit is contained in:
parent
06a3c2bbaf
commit
4617595c82
@ -161,7 +161,7 @@ def get_invalid_file(file_path, output_folder, common_header,begin_page):
|
||||
|
||||
# 定义排除模式
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制',
|
||||
r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制',
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
@ -294,29 +294,16 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
||||
break
|
||||
return start_page, end_page
|
||||
|
||||
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
|
||||
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header):
|
||||
"""
|
||||
从PDF文档中提取起始页、中间页和结束页。
|
||||
|
||||
如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取。
|
||||
|
||||
参数:
|
||||
pdf_document (PDFDocument): 要处理的PDF文档对象。
|
||||
begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
|
||||
begin_page (int): 开始搜索的页码。
|
||||
common_header (str): 每页需要清理的公共头部文本。
|
||||
exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。
|
||||
|
||||
返回:
|
||||
tuple: (start_page, mid_page, end_page) 如果成功,否则 (None, None, None)
|
||||
"""
|
||||
|
||||
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
|
||||
def run_extraction(local_begin_pattern, local_end_pattern=None):
|
||||
"""
|
||||
使用提供的 begin 和 end 模式运行提取过程。
|
||||
|
||||
如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。
|
||||
|
||||
参数:
|
||||
local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
|
||||
local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。
|
||||
@ -427,18 +414,28 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
|
||||
return start_page, mid_page, end_page
|
||||
|
||||
def perform_extraction_and_save(start_page, mid_page, end_page):
|
||||
path1 = save_extracted_pages(pdf_path,output_folder,start_page,mid_page,"tobidders_notice_part1",common_header)
|
||||
if mid_page != end_page:
|
||||
path2 = save_extracted_pages(pdf_path,output_folder,mid_page,end_page,"tobidders_notice_part2",common_header)
|
||||
else:
|
||||
path2 = path1
|
||||
return path1, path2
|
||||
|
||||
# 第一次提取尝试,使用初始的 begin_pattern
|
||||
start_page, mid_page, end_page = run_extraction(begin_pattern)
|
||||
# 如果第一次提取成功,保存并返回路径
|
||||
if start_page is not None and mid_page is not None and end_page is not None:
|
||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||
|
||||
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
|
||||
if not (start_page and mid_page and end_page):
|
||||
print(f"第二次尝试 tobidders_notice!")
|
||||
print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}")
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
|
||||
if start_page and end_page and mid_page:
|
||||
return start_page, mid_page, end_page
|
||||
else:
|
||||
# 定义新的 begin_pattern 和 end_pattern
|
||||
|
||||
if start_page is not None and mid_page is not None and end_page is not None:
|
||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||
|
||||
# 如果第二次提取失败,定义新的 begin_pattern 和 end_pattern 进行第三次尝试
|
||||
new_begin_pattern = regex.compile(
|
||||
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
|
||||
r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
|
||||
@ -448,11 +445,15 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
print("第三次尝试 tobidders_notice! ")
|
||||
# 第二次提取尝试,使用新的模式
|
||||
print("第二次提取tobidders_notice失败,尝试第三次提取!")
|
||||
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
|
||||
|
||||
return start_page, mid_page, end_page
|
||||
if start_page is not None and mid_page is not None and end_page is not None:
|
||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||
else:
|
||||
# 所有提取尝试均失败
|
||||
print(f"三次提取tobidders_notice均失败!! {pdf_path}")
|
||||
return "", ""
|
||||
|
||||
|
||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
|
||||
|
@ -118,7 +118,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||
file_path=r"C:\Users\Administrator\Downloads\2024-福建-2024年度防汛抢险物资储备.pdf"
|
||||
file_path=r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf'
|
||||
# file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||
|
@ -34,6 +34,7 @@ def judge_zbfile() -> Any:
|
||||
try:
|
||||
start_time = time.time()
|
||||
downloaded_filename = os.path.join(output_folder, "ztbfile")
|
||||
logger.info(f"接收到的url:{file_url}")
|
||||
downloaded_filepath, file_type = download_file(file_url, downloaded_filename)
|
||||
|
||||
if not downloaded_filepath or file_type == 4:
|
||||
|
@ -44,7 +44,7 @@ def judge_zbfile_exec(file_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
start_time = time.time()
|
||||
pdf_path = r"C:/Users/Administrator/Downloads/094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件 - 副本.pdf"
|
||||
pdf_path = r"C:\Users\Administrator\Downloads\商务标null1736476867734.docx"
|
||||
res = judge_zbfile_exec(pdf_path)
|
||||
if res:
|
||||
print("yes")
|
||||
|
@ -283,8 +283,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
# 2.废标项这边,考虑大模型+正则并用
|
||||
# 废标项,增加对表格的提取+排除重复项,按顺序处理
|
||||
# 考虑将工程标和货物标的 投标人须知那块逻辑结合
|
||||
# C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472 这里的投标文件要求有点问题
|
||||
# D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3排查一下
|
||||
# 解决禅道 测试的bug
|
||||
# 截取:对应 按照 根据
|
||||
# 评分点
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 配置日志器
|
||||
|
@ -13,7 +13,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
||||
is_secondary_match):
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
|
||||
r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
|
||||
def run_extraction():
|
||||
start_page = None
|
||||
mid_page = None
|
||||
@ -134,7 +134,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
end_page = None
|
||||
flag = True
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
|
||||
r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
|
||||
# 遍历文档的每一页,查找开始和结束短语的位置
|
||||
for i in range(len(pdf_document.pages)):
|
||||
page = pdf_document.pages[i]
|
||||
|
@ -12,24 +12,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
common_header = extract_common_header(pdf_path)
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
exclusion_pattern = None
|
||||
|
||||
if output_suffix == "tobidders_notice":
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
|
||||
start_page, mid_page, end_page = extract_pages_tobidders_notice(
|
||||
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
|
||||
)
|
||||
if not start_page or not mid_page or not end_page:
|
||||
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
|
||||
return "", ""
|
||||
path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header)
|
||||
if mid_page != end_page:
|
||||
path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header)
|
||||
else:
|
||||
path2=path1
|
||||
return path1, path2
|
||||
|
||||
else:
|
||||
# 原有的处理逻辑保持不变
|
||||
if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
|
||||
exclusion_pattern = regex.compile(
|
||||
@ -163,7 +145,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
|
||||
try:
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成|文件的编制|文件编制')
|
||||
r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
patterns = None
|
||||
start_page = None
|
||||
@ -298,7 +280,9 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
||||
if output_suffix == "default":
|
||||
output_suffix = local_output_suffix
|
||||
|
||||
# 将原先的 process_files 逻辑合并到此处
|
||||
if selection==4:
|
||||
result = extract_pages_tobidders_notice(pdf_path,output_folder,begin_pattern,begin_page,common_header)
|
||||
else:
|
||||
result = extract_pages(
|
||||
pdf_path,
|
||||
output_folder,
|
||||
@ -363,10 +347,10 @@ if __name__ == "__main__":
|
||||
logger = get_global_logger("123")
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Downloads\招标文件 (4).pdf"
|
||||
pdf_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf"
|
||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||
output_folder = r"D:\flask_project\flask_app\static\output\output1\test"
|
||||
output_folder = r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp"
|
||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||
|
@ -297,7 +297,7 @@ def generate_prompt(judge_res, full_text=None):
|
||||
**输出格式**:
|
||||
1.JSON格式,外层键名为需要采购的货物、设备或系统名称(一级键)。
|
||||
2.层次关系用嵌套键值对表示:
|
||||
-采购活动可能将目标划分为多个系统或货物。若文档通过大标题或表格层次或序号标明这种归属关系,请在JSON中以嵌套形式表示:
|
||||
-采购活动可能将目标划分为多个系统或货物。若文档通过大标题或表格结构或序号标明这种归属关系,请在JSON中以嵌套形式表示:
|
||||
-系统作为一级键,包含的货物作为二级键。
|
||||
-例子:"""
|
||||
1. 激光扫描系统:xxx
|
||||
|
Loading…
x
Reference in New Issue
Block a user