12.24 禅道bug修改
This commit is contained in:
parent
94cb44466b
commit
c71f8d6080
@ -6,6 +6,8 @@ from docx import Document
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.units import cm
|
||||
|
||||
|
||||
# Copy the contents of input_pdf_path into invalid_added.pdf. The copy step may raise an error; drawing the new page almost never does.
|
||||
def insert_mark(input_pdf_path):
|
||||
try:
|
||||
# 构建输出文件路径,与输入文件同目录,名称为 invalid_added.pdf
|
||||
@ -59,7 +61,7 @@ def insert_mark(input_pdf_path):
|
||||
|
||||
except Exception as e:
|
||||
print(f"发生错误: {e}")
|
||||
return ""
|
||||
return input_pdf_path
|
||||
|
||||
|
||||
def delete_mark(docx_path):
|
||||
@ -88,7 +90,8 @@ def delete_mark(docx_path):
|
||||
return new_file_path
|
||||
|
||||
if __name__ == '__main__':
|
||||
input=r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\ztbfile_tobidders_notice_part2.pdf'
|
||||
# add_blank_pages_v2(input)
|
||||
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx'
|
||||
delete_mark(doc_path)
|
||||
input=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf'
|
||||
# input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
|
||||
output=insert_mark(input)
|
||||
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx'
|
||||
# delete_mark(doc_path)
|
@ -236,11 +236,12 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
||||
1. 请首先定位评分细则的表格,不要回答有关资格审查的内容,也不要从评标办法正文中提取回答
|
||||
2. 你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体
|
||||
3. '评分'的键值不能是一个范围数字,如'0-5分',应该是一个具体数字,如'5分',或者是一个定性的指标如'合格制'
|
||||
4. 如果该招标活动有多个包,则最外层键名为对应的包名,否则最外层键名为各大评分项
|
||||
5. 若表格中商务和技术评分混合一起,请根据你对招投标业务的熟悉,对实际表格内容的评分因素进行准确分类,归类至商务评分或技术评分。
|
||||
6. 若表中的评分大项不是这三个,请你根据语义分别映射到'技术评分'、'商务评分'、'投标报价评分',而不必严格按照表格中的名称。
|
||||
7. 若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如"技术评分":"本项目无技术评分项"
|
||||
4. 若表格中商务和技术评分混合一起,请根据你对招投标业务的熟悉,对实际表格内容的评分因素进行准确分类,归类至商务评分或技术评分。
|
||||
5. 若表中的评分大项不是这三个,请你根据语义分别映射到'技术评分'、'商务评分'、'投标报价评分',而不必严格按照表格中的名称。
|
||||
6. 若大项的'xx评分'要求未在文中说明,则键名'xx评分'的键值设为'本项目无xx评分项',例如"技术评分":"本项目无技术评分项"
|
||||
|
||||
特殊情况:
|
||||
最外层键名为各大评分项;但是如果该招标、采购活动有多个分包,则最外层键名为对应的包名,如'一包',内部格式不变。
|
||||
禁止内容:
|
||||
1. 确保所有输出内容均基于提供的实际招标文件内容(除了最外层的三个评分大项名称),不使用任何预设的示例作为回答。
|
||||
2. 不得擅自添加不属于评审因素的键名以及 `'备注'` 之外的其他键名。
|
||||
|
@ -116,13 +116,13 @@ if __name__ == "__main__":
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
|
||||
pdf_path=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(实高电子显示屏).pdf'
|
||||
pdf_path=r'C:\Users\Administrator\Downloads\_2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_加水印3333.pdf'
|
||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
|
||||
# selections = [1, 4] # 仅处理 selection 4、1
|
||||
selections = [1, 3, 5]
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
|
||||
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
|
||||
# selections = [1, 3, 5]
|
||||
# files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
|
||||
print(files)
|
||||
# print(files[-1])
|
||||
# print(files[-2])
|
||||
|
@ -235,6 +235,7 @@ def extract_sections(data, target_values):
|
||||
"""
|
||||
result = {}
|
||||
merged_sections = []
|
||||
processed_keys = set()
|
||||
|
||||
# 对键进行排序以保持顺序
|
||||
sorted_keys = sorted(
|
||||
@ -257,16 +258,17 @@ def extract_sections(data, target_values):
|
||||
return len(parts)
|
||||
|
||||
for key in sorted_keys:
|
||||
if key in processed_keys:
|
||||
continue # 跳过已经处理过的键
|
||||
value = data[key]
|
||||
if value in target_values:
|
||||
section_key_prefix = key # e.g., "3."
|
||||
section_name = value # e.g., "投标文件"
|
||||
section_level = get_level(section_key_prefix)
|
||||
# 使用子字符串匹配
|
||||
if any(target in value for target in target_values):
|
||||
section_key_prefix = key if key.endswith('.') else key + '.'
|
||||
section_name = value
|
||||
section_level = get_level(key)
|
||||
subitems = []
|
||||
|
||||
for sub_key in sorted_keys:
|
||||
# 检查子键是否属于当前章节且不是章节本身
|
||||
if sub_key.startswith(section_key_prefix) and sub_key != section_key_prefix:
|
||||
if sub_key.startswith(section_key_prefix) and sub_key != key:
|
||||
sub_value = data[sub_key]
|
||||
sub_level = get_level(sub_key)
|
||||
# 根据层级添加缩进,二级无缩进,三级开始每多一级加4个空格
|
||||
@ -275,12 +277,13 @@ def extract_sections(data, target_values):
|
||||
else:
|
||||
indent = ' ' * 4 * (sub_level - section_level - 1) # 三级及以上层级增加缩进
|
||||
subitems.append(f"{indent}{sub_key} {sub_value}")
|
||||
|
||||
processed_keys.add(sub_key) # 标记子键为已处理
|
||||
# 检查是否需要合并 "定标" 和 "中标" 章节
|
||||
if section_name in ["定标", "中标"]:
|
||||
if any(target in section_name for target in ["定标", "中标"]):
|
||||
merged_sections.extend(subitems)
|
||||
else:
|
||||
result[section_name] = subitems
|
||||
processed_keys.add(key) # 标记主键为已处理
|
||||
|
||||
# 如果存在需要合并的章节,将其合并为 "定标与中标"
|
||||
if merged_sections:
|
||||
|
@ -358,7 +358,6 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
|
||||
# 从PDF文件中提取文本
|
||||
common_header = extract_common_header(file_path)
|
||||
pdf_document = PdfReader(file_path)
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') #仅匹配第一页和最后一页,不需要exclusion_pattern
|
||||
all_pages_text = []
|
||||
start_index = None
|
||||
# 处理所有页面
|
||||
|
@ -672,12 +672,12 @@ if __name__ == '__main__':
|
||||
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
|
||||
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
|
||||
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
|
||||
pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'
|
||||
pdf_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf'
|
||||
|
||||
output_dir = r"D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\tmp"
|
||||
output_dir = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\tmp"
|
||||
# invalid_added=insert_mark(pdf_path)
|
||||
# invalid_added_docx=pdf2docx(invalid_added)
|
||||
invalid_added_docx=r'D:\flask_project\flask_app\static\output\output1\000aac0d-4aa4-4bc3-a9f9-76ff82ec2470\invalid_added.docx'
|
||||
invalid_added_docx=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.docx'
|
||||
results = combine_find_invalid(invalid_added_docx, output_dir)
|
||||
end_time = time.time()
|
||||
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
||||
|
@ -118,7 +118,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||
file_path=r"D:\flask_project\flask_app\static\output\output1\1571dc96-4bb3-4ab2-a45a-993ed82678e8\ztbfile.pdf"
|
||||
file_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf"
|
||||
# file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||
|
@ -271,7 +271,6 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
#TODO:把所有未知都删掉。
|
||||
#TODO:考虑把解析失败的调用豆包,全文上传。
|
||||
|
||||
#TODO:重置一下投标文件格式提取那部分的代码
|
||||
#TODO:小解析考虑提速:1:直接pdf转文本,再切分。后期考虑。
|
||||
|
||||
#TODO: ec7d5328-9c57-450f-baf4-2e5a6f90ed1d
|
||||
|
@ -1,20 +1,20 @@
|
||||
import re
|
||||
|
||||
# 修改后的正则表达式
|
||||
pattern = re.compile(r'^\d+\s*[..]\s*\d+\s*[..]\s*\d+(?![..])')
|
||||
cleaned_text = """第三章 评标办法 (综合评分法)
|
||||
一、评标原则
|
||||
1.评标将本着公平、公正、科学、择优的原则进行。
|
||||
2.依法评标、严格保密。
|
||||
...
|
||||
8.投标文件 含有招标人不能接受的附加条件的;
|
||||
"""
|
||||
|
||||
# 测试字符串
|
||||
test_strings = [
|
||||
'5.1.3 投标文件未按时上传系统的。',
|
||||
'5.1.3.1 投标文件未按时上传系统的。',
|
||||
'5.1 投标文件未按时上传系统的。',
|
||||
'5.1.3 投标成功。',
|
||||
]
|
||||
end_pattern = '^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|评标办法前附表|附录(?:一)?[::]|附件(?:一)?[::]|附表(?:一)?[::]'
|
||||
matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
|
||||
|
||||
# 测试匹配
|
||||
for test_string in test_strings:
|
||||
match = pattern.search(test_string)
|
||||
if match:
|
||||
print(f"匹配成功: {match.group()} -> {test_string}")
|
||||
if matches:
|
||||
end_index = matches[-1].start()
|
||||
cleaned_text = cleaned_text[:end_index]
|
||||
print("匹配成功,截断后文本:")
|
||||
print(cleaned_text)
|
||||
else:
|
||||
print(f"未匹配: {test_string}")
|
||||
print("未匹配到内容。")
|
||||
|
@ -113,7 +113,6 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
|
||||
if not extracted_data:
|
||||
# 如果大章节筛选失败,尝试使用另一种筛选方法
|
||||
extracted_data = extract_json(data, target_values)
|
||||
|
||||
if not extracted_data:
|
||||
# 如果所有筛选方法均失败,调用回退函数
|
||||
final_result = get_requirements_with_gpt(merged_baseinfo_path, type)
|
||||
@ -146,8 +145,8 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
|
||||
|
||||
if __name__ == "__main__":
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
|
||||
merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\05339b83-50bf-4405-905c-38625928840e\merged_baseinfo_path_more.pdf"
|
||||
clause_path=r"D:\flask_project\flask_app\static\output\output1\05339b83-50bf-4405-905c-38625928840e\clause1.json"
|
||||
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\merged_baseinfo_path_more.pdf"
|
||||
clause_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\clause1.json"
|
||||
try:
|
||||
res = extract_from_notice(merged_baseinfo_path,clause_path, 2) # 可以改变此处的 type 参数测试不同的场景
|
||||
res2 = json.dumps(res, ensure_ascii=False, indent=4)
|
||||
|
@ -13,6 +13,7 @@ def convert_clause_to_json(file_path, output_folder, type=1):
|
||||
)
|
||||
end_pattern = (
|
||||
# r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
|
||||
r'评标办法前附表|'
|
||||
r'附录(?:一)?[::]|'
|
||||
@ -94,8 +95,8 @@ def convert_clause_to_json(file_path, output_folder, type=1):
|
||||
|
||||
if __name__ == "__main__":
|
||||
# file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zbtest20_tobidders_notice.pdf'
|
||||
file_path=r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all\招标02_tobidders_notice.pdf'
|
||||
output_folder = r'C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp'
|
||||
file_path=r'C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\ztbfile_tobidders_notice.pdf'
|
||||
output_folder = r'C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp'
|
||||
try:
|
||||
output_path = convert_clause_to_json(file_path,output_folder)
|
||||
print(f"Final JSON result saved to: {output_path}")
|
||||
|
@ -179,11 +179,11 @@ def process_folder(input_folder, output_folder):
|
||||
#TODO:2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf
|
||||
#TODO: case ".不予受理的情形": the '.' must be followed by a Chinese character or a space
|
||||
if __name__ == "__main__":
|
||||
file_path = r'C:\Users\Administrator\Desktop\fsdownload\d1ad6d85-fb69-4d01-ab8f-f5721fbb4400\ztbfile_tobidders_notice_part2.pdf'
|
||||
file_path = r'C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp\_2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_加水印3333_tobidders_notice_part2.pdf'
|
||||
# file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
|
||||
# file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
|
||||
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
|
||||
output_folder = r'C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp'
|
||||
try:
|
||||
output_path = convert_clause_to_json(file_path,output_folder,1)
|
||||
print(f"Final JSON result saved to: {output_path}")
|
||||
|
Loading…
x
Reference in New Issue
Block a user