12.6 优化解析
This commit is contained in:
parent
e0ea02544d
commit
d2090e7fd6
@ -72,12 +72,12 @@ def clean_page_content(text, common_header):
|
||||
if header_line.strip(): # 只处理非空行
|
||||
# 替换首次出现的完整行
|
||||
text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
||||
|
||||
# 删除文本开头的“第x页”格式的页码
|
||||
text = re.sub(r'^第\d+页\s*', '', text)
|
||||
# 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
|
||||
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
|
||||
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
|
||||
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
|
||||
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 投标人须知这块, 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
|
||||
text = re.sub(r'^\s*\/\s*(共\s*)?\d{1,3}\s*(页)?\s*', '', text) #删除/123 /共123 /共123页 /123页
|
||||
text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||
return text
|
||||
|
||||
|
||||
|
@ -259,22 +259,23 @@ if __name__ == '__main__':
|
||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
||||
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
||||
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
|
||||
local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
|
||||
downloaded_file=pdf2docx(local_path_in)
|
||||
# local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
|
||||
# downloaded_file=pdf2docx(local_path_in)
|
||||
# # downloaded_file=pdf2docx(local_path_in)
|
||||
# # downloaded_file=docx2pdf(local_path_in)
|
||||
print(downloaded_file)
|
||||
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf?Expires=1733410874&OSSAccessKeyId=TMP.3KdesMxbSGgK41BgYKRtckUxdQ2LUh2YMYiH1FnuupFhvRuxMbiuBAsXK9oGvHxaLvsDLQjp3db28cUK5YJSEYTGPnUevo&Signature=EjLj3KeLtj337lS1DEJO56471Tg%3D"
|
||||
# local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
|
||||
# downloaded = download_file(test_url, local_file_name)
|
||||
# if not downloaded:
|
||||
# print("下载文件失败或不支持的文件类型")
|
||||
# downloaded_filepath, file_type = downloaded
|
||||
# print(downloaded_filepath)
|
||||
# print(file_type)
|
||||
# # 检查文件类型
|
||||
# if file_type == 4:
|
||||
# print("error")
|
||||
# print(downloaded_file)
|
||||
|
||||
test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%20-%20%E5%89%AF%E6%9C%AC.PDF?Expires=1733478585&OSSAccessKeyId=TMP.3KhfwZc3kpT9TUmsb46yBDdnRq8bbENcEWBbZP8nLMgmSjVkjg9edpTPUQUsH8VXtvvg839Xbm8N5paYxPKvxCGqx3Vx4m&Signature=RYOo7tMEyahaMA3cSsf2kkf8co8%3D"
|
||||
local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
|
||||
downloaded = download_file(test_url, local_file_name)
|
||||
if not downloaded:
|
||||
print("下载文件失败或不支持的文件类型")
|
||||
downloaded_filepath, file_type = downloaded
|
||||
print(downloaded_filepath)
|
||||
print(file_type)
|
||||
# 检查文件类型
|
||||
if file_type == 4:
|
||||
print("error")
|
||||
|
||||
|
||||
|
||||
|
@ -13,7 +13,9 @@ def extract_text_by_page(file_path):
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_content(text,common_header)
|
||||
# cleaned_text=text
|
||||
print(cleaned_text)
|
||||
print("-----------------"+str(page_num))
|
||||
result += cleaned_text
|
||||
# print(f"Page {page_num + 1} Content:\n{cleaned_text}")
|
||||
else:
|
||||
@ -116,13 +118,14 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||
file_path=r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||
# file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
||||
# ress = extract_common_header(file_path)
|
||||
# print(ress)
|
||||
ress = extract_common_header(file_path)
|
||||
print(ress)
|
||||
print("-----------------")
|
||||
res=extract_text_by_page(file_path)
|
||||
# print(res)磋商文件_tobidders_notice_part2.pdf
|
||||
# save_extracted_text_to_txt(file_path,"output.txt")
|
@ -7,7 +7,7 @@ from collections import OrderedDict
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.main.判断是否分包等 import read_questions_from_judge
|
||||
from flask_app.工程标.判断是否分包等 import read_questions_from_judge
|
||||
|
||||
def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1):
|
||||
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
|
||||
|
@ -1,8 +1,8 @@
|
||||
import json
|
||||
|
||||
from flask_app.general.json_utils import clean_json_string, rename_outer_key
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
|
||||
from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.工程标.判断是否分包等 import judge_whether_main, read_questions_from_judge
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.通用功能函数 import judge_consortium_bidding
|
||||
|
@ -4,10 +4,10 @@ import json
|
||||
import time
|
||||
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
from flask_app.main.截取pdf import truncate_pdf_main
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_main
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
prompt = """
|
||||
# 角色
|
||||
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
||||
|
@ -3,17 +3,17 @@ import json
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.main.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.工程标.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||
from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
import concurrent.futures
|
||||
from flask_app.old_version.基础信息整合_old import combine_basic_info
|
||||
from flask_app.old_version.资格审查模块old_old import combine_review_standards
|
||||
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||
from flask_app.general.docx截取docx import copy_docx
|
||||
|
||||
|
@ -3,16 +3,16 @@ import json
|
||||
import logging
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.main.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.工程标.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||
from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
import concurrent.futures
|
||||
from flask_app.main.基础信息整合快速版 import combine_basic_info
|
||||
from flask_app.main.资格审查模块 import combine_review_standards
|
||||
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.工程标.基础信息整合快速版 import combine_basic_info
|
||||
from flask_app.工程标.资格审查模块 import combine_review_standards
|
||||
from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
|
||||
from flask_app.general.docx截取docx import copy_docx
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import os
|
||||
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
from flask_app.old_version.形式响应评审old import process_reviews
|
||||
from flask_app.old_version.资格评审old_old import process_qualification
|
||||
|
@ -7,16 +7,16 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from docx import Document
|
||||
|
||||
from flask_app.main.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_multiple
|
||||
from flask_app.general.merge_pdfs import merge_pdfs
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.json_utils import transform_json_values
|
||||
from flask_app.general.无效标和废标公共代码 import combine_find_invalid
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
import concurrent.futures
|
||||
from flask_app.main.基础信息整合快速版 import combine_basic_info
|
||||
from flask_app.main.资格审查模块 import combine_review_standards
|
||||
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.工程标.基础信息整合快速版 import combine_basic_info
|
||||
from flask_app.工程标.资格审查模块 import combine_review_standards
|
||||
from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards
|
||||
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
|
||||
|
||||
|
||||
|
@ -10,7 +10,7 @@ from flask_app.general.多线程提问 import read_questions_from_file, multi_th
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
|
||||
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
|
||||
from flask_app.main.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
|
||||
from flask_app.工程标.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
|
||||
from flask_app.general.post_processing import inner_post_processing
|
||||
from flask_app.old_version.基础信息整合_old import aggregate_basic_info_engineering
|
||||
|
||||
|
@ -274,8 +274,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
#TODO:考虑把解析失败的调用豆包,全文上传。
|
||||
|
||||
#TODO:重置一下投标文件格式提取那部分的代码
|
||||
|
||||
#TODO:小解析考虑提速:1:直接pdf转文本,再切分 2.多线程读取每页是否有图片
|
||||
#TODO:小解析考虑提速:1:直接pdf转文本,再切分。后期考虑。
|
||||
#商务标这里改为列表最里层
|
||||
#good_list 金额 截取上下文
|
||||
if __name__ == "__main__":
|
||||
|
@ -1,20 +1,60 @@
|
||||
import re
|
||||
|
||||
# 合并后的正则表达式
|
||||
begin_pattern_combined = re.compile('^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?|资格性检查|资格审查|符合性审查)', re.MULTILINE)
|
||||
begin_pattern = re.compile(
|
||||
r'(?<!见)' # 确保前面不是“见”
|
||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
||||
r'[\u4e00-\u9fff、()()]*?' # 匹配允许的字符(中文、顿号、括号)
|
||||
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
|
||||
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
|
||||
r'[\u4e00-\u9fff、()()]*\s*$' # 继续匹配允许的字符直到行尾
|
||||
r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
|
||||
re.MULTILINE
|
||||
)
|
||||
|
||||
|
||||
# 测试字符串
|
||||
test_strings = [
|
||||
"附件 4",
|
||||
"第三部分 商务要求哈哈",
|
||||
"第六章 哈哈采购",
|
||||
"第八部分 需求说明",
|
||||
"第九章 技术要求",
|
||||
"""
|
||||
.4评标委员会成员因缺席、回避、擅评标办法前附表康等原因不能继续履评标办法前附表
|
||||
责的,采购人或者采购代理机构有权向相关监督管理部门通报。
|
||||
17. 投标人资格审查和投标文件符合性审查
|
||||
17.1投标人资格审查指依据法律、法规和招标文件的规定,对投标文件中的资格、资信证
|
||||
明等进行审查,以确定投标人是否具备投标资格;投标文件符合性审查指依据招标文件的
|
||||
规定,从投标文件的有效性、完整性和对招标文件的响应程度进行审查,以确定是否对招
|
||||
标文件的实质性要求作出响应。
|
||||
17.2投标人未通过资格审查的不得进入投标文件符合性审查 ; 投标人未通过符合性审查的,
|
||||
不得进入投标文件的综合比较与评价。
|
||||
17.3品牌及型号必须为清单中有效期内产品并提供证明文件, 否则其投标将作为无效投标
|
||||
被拒绝。
|
||||
17.3.1如本项目使用最低评标价法, 提供相同品牌产品的不同投标人以其中通过资格审查、
|
||||
符合性审查且报价最低的参加评标;报价相同的,由采购人或者采购人委托评标委员会按
|
||||
照招标文件中评标办法规定的方式确定 一个参加评标的投标人;其他投标无效。
|
||||
17.3.2如本项目使用综合评分法,提供相同品牌产品且通过资格审查、符合性审查的不同
|
||||
投标人,按一家投标人计算,评审后得分最高的同品牌投标人获得中标人推荐资格;评审
|
||||
得分相同的,由采购人或者采购人委托评标 委员会按照招标文件中评标办法规定的方式确
|
||||
定一个投标人获得中标人推荐资格;
|
||||
17.4如一个分包内包含多种产品的, 采购人或采购代理机构将在投标人须知前附表中载明
|
||||
核心产品,多家投标人提供的所有核心产品品牌均相同的, 按第 18.3.2 条及相关法律法
|
||||
规处理。
|
||||
17.5投标人所投产品如被列入财政部与国家主管部门颁发的节能产品目录或环境标志产
|
||||
品目录,应提供相关证明,在评标时予以优先采购,具体优先采购办见第五章评标方法
|
||||
和标准。如采购人所采购产品为政府强制采购的节能产品,投标人所投产品的品牌及型号
|
||||
必须为清单中有效期内产品并提供证明文件,否则其投标将作为无效投标被拒绝。
|
||||
17.6投标人不良信用记录以采购人或采购代理机构查询结果为准。
|
||||
17.7资格审查和符合性审查标准详见第五章评标方法和标准。
|
||||
18. 投标文件的澄清和修正
|
||||
18.1对于投标文件中含义不明确、 同类问题表述不一致或者有明显文字和计算错误的内容,
|
||||
评标委员会应当以书面形式要求投标人作出必要的澄 清、说明或者补正。
|
||||
18.2投标人的澄清、说明或者补正应当采用书面形式,并加盖公章,或者由法定代表人或
|
||||
其授权的代表签字。投标人的澄清、说明或者补正不得超出投标文件的范围或者改变投标
|
||||
文件的实质性内容。澄清文件将作为投标文件内容的一部分。
|
||||
"""
|
||||
]
|
||||
|
||||
for test_string in test_strings:
|
||||
match = begin_pattern_combined.search(test_string)
|
||||
match = re.search(begin_pattern, test_string)
|
||||
if match:
|
||||
print(f"匹配成功:{test_string}")
|
||||
print("Matched Content:", match.group()) # 输出匹配的内容
|
||||
else:
|
||||
print(f"匹配失败:{test_string}")
|
||||
print("No match found.")
|
||||
|
@ -3,7 +3,7 @@ import json
|
||||
import os.path
|
||||
import re
|
||||
from flask_app.general.json_utils import extract_content_from_json, clean_json_string # 可以选择性地导入特定的函数
|
||||
from flask_app.main.提取打勾符号 import read_pdf_and_judge_main
|
||||
from flask_app.工程标.提取打勾符号 import read_pdf_and_judge_main
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.通义千问long import qianwen_long,upload_file
|
||||
#调用qianwen-ask之后,组织提示词问百炼。
|
@ -4,8 +4,8 @@ import time
|
||||
import concurrent.futures
|
||||
from flask_app.general.json_utils import clean_json_string, rename_outer_key
|
||||
from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
|
||||
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.main.判断是否分包等 import read_questions_from_judge, merge_json_to_list
|
||||
from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.工程标.判断是否分包等 import read_questions_from_judge, merge_json_to_list
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file,qianwen_long
|
||||
|
@ -5,9 +5,9 @@ import json
|
||||
import time
|
||||
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.merge_pdfs import merge_pdfs
|
||||
prompt = """
|
@ -75,7 +75,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
|
||||
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
|
||||
is_secondary_match):
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
|
||||
def run_extraction():
|
||||
start_page = None
|
||||
@ -195,7 +195,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
start_page = None
|
||||
end_page = None
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
# 遍历文档的每一页,查找开始和结束短语的位置
|
||||
for i in range(len(pdf_document.pages)):
|
||||
page = pdf_document.pages[i]
|
||||
@ -367,7 +367,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
),
|
||||
(
|
||||
re.compile(
|
||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()()]*\s*$|\s*评标(办法|方法)前附表\s*$',
|
||||
r'(?<!见)' # 确保前面不是“见”
|
||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
||||
r'[\u4e00-\u9fff、()()]*?' # 匹配允许的字符(中文、顿号、括号)
|
||||
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
|
||||
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
|
||||
r'[\u4e00-\u9fff、()()]*\s*$' # 继续匹配允许的字符直到行尾
|
||||
r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
|
||||
re.MULTILINE
|
||||
),
|
||||
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
@ -5,8 +5,8 @@ import time
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
from flask_app.general.json_utils import extract_content_from_json, clean_json_string
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.main.形式响应评审 import process_reviews
|
||||
from flask_app.main.资格评审 import process_qualification
|
||||
from flask_app.工程标.形式响应评审 import process_reviews
|
||||
from flask_app.工程标.资格评审 import process_qualification
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.货物标.资格审查main import combine_qualification_review
|
@ -1,6 +1,6 @@
|
||||
import json
|
||||
|
||||
from flask_app.main.资格评审 import process_qualification
|
||||
from flask_app.工程标.资格评审 import process_qualification
|
||||
|
||||
|
||||
def test_process_qualification_type1():
|
@ -7,7 +7,7 @@ from flask_app.general.json_utils import clean_json_string, rename_outer_key
|
||||
from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.main.判断是否分包等 import merge_json_to_list, read_questions_from_judge
|
||||
from flask_app.工程标.判断是否分包等 import merge_json_to_list, read_questions_from_judge
|
||||
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
|
||||
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
|
||||
|
||||
|
@ -105,7 +105,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
total_pages = len(pdf_document.pages) - 1 # 获取总页数
|
||||
|
||||
if output_suffix == "tobidders_notice":
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
|
||||
start_page, mid_page, end_page = extract_pages_tobidders_notice(
|
||||
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
|
||||
)
|
||||
@ -122,8 +122,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
|
||||
else:
|
||||
# 原有的处理逻辑保持不变
|
||||
if output_suffix == "qualification1" or output_suffix=="procurement":
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
if output_suffix == "qualification1" or output_suffix=="procurement" or output_suffix=="evaluation_method":
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
|
||||
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix)
|
||||
# 针对 selection = 6 的特殊处理
|
||||
if output_suffix == "format":
|
||||
@ -167,10 +167,11 @@ def get_patterns_for_evaluation_method():
|
||||
# re.MULTILINE
|
||||
# )
|
||||
begin_pattern = re.compile(
|
||||
r'(?<!见)'
|
||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
||||
r'(?:[\u4e00-\u9fff、()()]*?)' # 匹配允许的字符(中文、顿号、括号)
|
||||
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” 注意这里的'.*'是允许这些关键词出现在任意位置,但主体匹配部分仍然受到字符集的限制。
|
||||
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
|
||||
r'(?=.*(?:办法|方法|内容))' # 确保包含“办法”或“方法”
|
||||
r'[\u4e00-\u9fff、()()]*\s*$', # 继续匹配允许的字符直到行尾
|
||||
re.MULTILINE
|
||||
)
|
||||
@ -374,60 +375,6 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
|
||||
return start_page, mid_page, end_page
|
||||
|
||||
|
||||
#投标人须知分为两个章节
|
||||
# def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page):
|
||||
# begin_pattern = re.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
|
||||
# )
|
||||
# end_pattern = re.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分
|
||||
# )
|
||||
# exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
|
||||
#
|
||||
# pdf_document = PdfReader(pdf_path)
|
||||
# exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
#
|
||||
# # 提取第一部分
|
||||
# start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
||||
# if start_page1 is None or end_page1 is None:
|
||||
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||
# return "", ""
|
||||
#
|
||||
# # 保存第一部分的路径
|
||||
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
|
||||
# "tobidders_notice_part1")
|
||||
#
|
||||
# # 提取第二部分
|
||||
# start_page2 = end_page1
|
||||
#
|
||||
# # 检查end_page1页面的内容
|
||||
# text = pdf_document.pages[end_page1].extract_text() or ""
|
||||
# cleaned_text = clean_page_content(text, common_header)
|
||||
# match = end_pattern.search(cleaned_text)
|
||||
#
|
||||
# if match:
|
||||
# # 获取匹配到的中文部分
|
||||
# chapter_title = match.group(1)
|
||||
# # 检查是否包含排除关键词
|
||||
# if any(word in chapter_title for word in exclusion_words):
|
||||
# # 如果包含排除关键词,直接返回相同的路径
|
||||
# return path1, path1
|
||||
#
|
||||
# # 如果不包含排除关键词,继续提取第二部分
|
||||
# _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
||||
# exclusion_pattern)
|
||||
#
|
||||
# if end_page2 is None:
|
||||
# print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
|
||||
# return path1, path1
|
||||
#
|
||||
# # 保存第二部分的路径
|
||||
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
|
||||
# "tobidders_notice_part2")
|
||||
#
|
||||
# return path1, path2
|
||||
|
||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
|
||||
output_suffix="tobidders_notice"
|
||||
begin_pattern = re.compile(
|
||||
@ -438,7 +385,7 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
|
||||
)
|
||||
exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
|
||||
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
|
||||
|
||||
# 提取第一部分
|
||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
||||
@ -548,7 +495,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||
|
||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
|
||||
try:
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
patterns = None
|
||||
start_page = None
|
||||
@ -688,8 +635,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
||||
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||
local_output_suffix = "notice"
|
||||
elif selection == 2:
|
||||
begin_pattern = re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(磋商|谈判|评标|评定|评审|办法|方法).*(磋商|谈判|评标|评定|评审|办法|方法)')
|
||||
begin_pattern = re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
local_output_suffix = "evaluation_method"
|
||||
elif selection == 3:
|
||||
@ -827,17 +773,18 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
|
||||
|
||||
#ztbfile.pdf少资格评审 包头少符合性评审
|
||||
if __name__ == "__main__":
|
||||
logger=get_global_logger("123")
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||
input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||
input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2020-安徽-安徽省生态环境厅电梯采购.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||
output_folder=r"C:\Users\Administrator\Desktop\new招标文件\output5"
|
||||
files = truncate_pdf_multiple(input_path, output_folder)
|
||||
output_folder=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2"
|
||||
# files = truncate_pdf_multiple(input_path, output_folder,logger)
|
||||
# selections = [3,5]
|
||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||
# print(files)
|
||||
selection = 5# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
selection = 2# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
print(generated_files)
|
@ -177,9 +177,9 @@ def process_folder(input_folder, output_folder):
|
||||
#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
|
||||
if __name__ == "__main__":
|
||||
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
||||
file_path=r'C:\Users\Administrator\Desktop\货物标\output4\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf'
|
||||
file_path=r'C:\Users\Administrator\Desktop\货物标\output4\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output3\tmp'
|
||||
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
|
||||
try:
|
||||
output_path = convert_clause_to_json(file_path,output_folder,1)
|
||||
print(f"Final JSON result saved to: {output_path}")
|
||||
|
Loading…
x
Reference in New Issue
Block a user