截取pdf工程/货物尝试合并
This commit is contained in:
parent
8aa7875a52
commit
51a8796353
@@ -394,9 +394,9 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
|
|||||||
final_result = postprocess_formatted2(extracted_data, target_values)
|
final_result = postprocess_formatted2(extracted_data, target_values)
|
||||||
return final_result
|
return final_result
|
||||||
|
|
||||||
# 如果 clause_path 为空,或者所有筛选方法均失败,调用回退函数
|
# # 如果 clause_path 为空,或者所有筛选方法均失败,调用回退函数
|
||||||
final_result = get_requirements_with_gpt(merged_baseinfo_path, type)
|
# final_result = get_requirements_with_gpt(merged_baseinfo_path, type)
|
||||||
return final_result
|
# return final_result
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error occurred while processing clause_path '{clause_path}': {e}")
|
print(f"Error occurred while processing clause_path '{clause_path}': {e}")
|
||||||
@@ -404,10 +404,10 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
|
||||||
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\merged_baseinfo_path_more.pdf"
|
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf"
|
||||||
clause_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\clause1.json"
|
clause_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\clause1.json"
|
||||||
try:
|
try:
|
||||||
res = extract_from_notice(merged_baseinfo_path,clause_path, 2) # 可以改变此处的 type 参数测试不同的场景
|
res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景
|
||||||
res2 = json.dumps(res, ensure_ascii=False, indent=4)
|
res2 = json.dumps(res, ensure_ascii=False, indent=4)
|
||||||
print(res2)
|
print(res2)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
|
@@ -1,5 +1,4 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
|
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
|
||||||
@@ -49,9 +48,6 @@ def handle_content_append(current_content, line_content, append_newline, keyword
|
|||||||
current_content.append('\n')
|
current_content.append('\n')
|
||||||
return append_newline
|
return append_newline
|
||||||
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
def parse_text_by_heading(text):
|
def parse_text_by_heading(text):
|
||||||
keywords = ['包含', '以下']
|
keywords = ['包含', '以下']
|
||||||
data = {}
|
data = {}
|
||||||
@@ -367,9 +363,6 @@ def parse_text_by_heading(text):
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_path, start_word, end_pattern):
|
def extract_text_from_pdf(file_path, start_word, end_pattern):
|
||||||
# 从PDF文件中提取文本
|
# 从PDF文件中提取文本
|
||||||
common_header = extract_common_header(file_path)
|
common_header = extract_common_header(file_path)
|
||||||
|
@@ -284,9 +284,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
|||||||
|
|
||||||
#TODO:
|
#TODO:
|
||||||
# 2.废标项这边,考虑大模型+正则并用
|
# 2.废标项这边,考虑大模型+正则并用
|
||||||
# 废标项,增加对表格的提取+排除重复项
|
# 废标项,增加对表格的提取+排除重复项,按顺序处理
|
||||||
# 考虑将工程标和货物标的 投标人须知那块结合
|
# 考虑将工程标和货物标的 投标人须知那块逻辑结合
|
||||||
|
# C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472 这里的投标文件要求有点问题
|
||||||
|
# 解决禅道 测试的bug
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 配置日志器
|
# 配置日志器
|
||||||
|
@@ -363,11 +363,11 @@ if __name__ == "__main__":
|
|||||||
logger = get_global_logger("123")
|
logger = get_global_logger("123")
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||||
pdf_path=r"D:\flask_project\flask_app\static\output\output1\test\招标文件-第二章-第六章-172404【电能表标准设备172404-1305001-0002】.pdf"
|
pdf_path=r"C:\Users\Administrator\Downloads\招标文件 (4).pdf"
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||||
output_folder = r"D:\flask_project\flask_app\static\output\output1\test"
|
output_folder = r"D:\flask_project\flask_app\static\output\output1\test"
|
||||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||||
selection = 6 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||||
print(generated_files)
|
print(generated_files)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user