9.18
This commit is contained in:
parent
048ba886aa
commit
770e004e63
@ -39,7 +39,7 @@ def download_file(url, local_filename):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 测试下载的URL
|
||||
test_url ="https://temp-pdf2docx.oss-cn-wuhan-lr.aliyuncs.com/docx/zbfile.docx?Expires=1725019436&OSSAccessKeyId=TMP.3KjfvBwPjtUPCu4BTNdkuN6BEvSbm1ibnrnTQX4ZdpSjCLX99a2Pq9bV52aA8JysVrbCZwhyuVjeMdJgdgxkqgPhwQfQoV&Signature=kXhJZZouEb82jQlhCwCpbm5%2Furs%3D"
|
||||
test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1726661197&OSSAccessKeyId=TMP.3KdMWkHL1nqHypaY6LmhnzqJTRkTzuYLNNPhW9KZruLGLaM3YL7F45NfuF3JT4CszSe2FD7ZH6WZFUTumokmJsSEW6pPh6&Signature=7BIGVSb9YGYLKFBXeTQZm7QnTI8%3D"
|
||||
local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\downloaded_file'
|
||||
file_path = download_file(test_url, local_file_name)
|
||||
if file_path:
|
||||
|
@ -64,7 +64,7 @@ def zbparse():
|
||||
return file_url
|
||||
try:
|
||||
logger.info("starting parsing url:" + file_url)
|
||||
final_json_path, output_folder = download_and_process_file(file_url)
|
||||
final_json_path, output_folder= download_and_process_file(file_url)
|
||||
if not final_json_path:
|
||||
return jsonify({'error': 'File processing failed'}), 500
|
||||
response = generate_response(final_json_path) # 先获取响应内容
|
||||
@ -138,8 +138,7 @@ def validate_request():
|
||||
def download_and_process_file(file_url):
|
||||
logger = g.logger
|
||||
unique_id = g.unique_id
|
||||
# output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径
|
||||
output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}"
|
||||
output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径
|
||||
filename = "ztbfile"
|
||||
downloaded_filename = os.path.join(output_folder, filename)
|
||||
|
||||
@ -148,7 +147,7 @@ def download_and_process_file(file_url):
|
||||
|
||||
if downloaded_filepath is None or file_type == 3:
|
||||
logger.error("Unsupported file type or failed to download file")
|
||||
return None, output_folder, logger
|
||||
return None, output_folder
|
||||
|
||||
logger.info("Local file path: " + downloaded_filepath)
|
||||
processed_file_path = main_processing(output_folder, downloaded_filepath, file_type, unique_id)
|
||||
|
@ -59,8 +59,8 @@ def combine_evaluation_standards(truncate2):
|
||||
user_query_2 = (
|
||||
"根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
|
||||
evaluation_res = qianwen_long(file_id, user_query_2)
|
||||
target_values1 = ['技术标', '设计', '实施', '方案']
|
||||
target_values2=['投标报价','商务标','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
|
||||
target_values1 = ['技术标','技术部分','设计', '实施', '方案']
|
||||
target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
|
||||
update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2)
|
||||
evaluation_combined_res = json.dumps(update_json,ensure_ascii=False,indent=4)
|
||||
return evaluation_combined_res
|
||||
|
@ -130,11 +130,13 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
|
||||
result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans)
|
||||
|
||||
def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
|
||||
if not queries:
|
||||
return None
|
||||
print("多线程提问:starting multi_threading...")
|
||||
result_queue = queue.Queue()
|
||||
|
||||
# 使用 ThreadPoolExecutor 管理线程
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
|
||||
# 逐个提交任务,每提交一个任务后休眠1秒
|
||||
future_to_query = {}
|
||||
for index, query in enumerate(queries):
|
||||
|
@ -19,11 +19,18 @@ def clean_page_content(text, common_header):
|
||||
def extract_common_header(pdf_path):
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
headers = []
|
||||
start_page = 4 # 从第5页开始读取,索引为4
|
||||
num_pages_to_read = 3 # 连续读取3页
|
||||
total_pages = len(pdf_document.pages)
|
||||
|
||||
# 确保从第5页开始,且总页数足够
|
||||
for i in range(start_page, min(start_page + num_pages_to_read, len(pdf_document.pages))):
|
||||
# 确定要读取的页数和起始页
|
||||
if total_pages == 2:
|
||||
pages_to_read = 2
|
||||
start_page = 0
|
||||
else:
|
||||
pages_to_read = 3
|
||||
middle_page = total_pages // 2
|
||||
start_page = max(0, middle_page - 1)
|
||||
|
||||
for i in range(start_page, min(start_page + pages_to_read, total_pages)):
|
||||
page = pdf_document.pages[i]
|
||||
text = page.extract_text() or ""
|
||||
if text:
|
||||
@ -256,9 +263,9 @@ def truncate_pdf_multiple(input_path, output_folder):
|
||||
|
||||
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||
if __name__ == "__main__":
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件"
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
||||
# truncate_pdf_multiple(input_path,output_folder)
|
||||
selection = 2 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
||||
selection = 3 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
# # print("生成的文件:", generated_files)
|
||||
|
@ -1,30 +1,28 @@
|
||||
import json
|
||||
import docx
|
||||
import fitz
|
||||
import re
|
||||
import os
|
||||
|
||||
from PyPDF2 import PdfReader
|
||||
from flask_app.main.截取pdf import clean_page_content,extract_common_header
|
||||
|
||||
def extract_text_from_docx(file_path):
|
||||
doc = docx.Document(file_path)
|
||||
return '\n'.join([para.text for para in doc.paragraphs])
|
||||
|
||||
def clean_page_numbers(text):
|
||||
# 删除每页开头的页码,假设页码后跟至少一个空格
|
||||
cleaned_text = re.sub(r'^\s*\d+\s+', '', text)
|
||||
# 删除每页末尾的页码,假设页码前有至少一个空格
|
||||
cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
|
||||
# 删除形如 /123 或 123/ 的页码
|
||||
cleaned_text = re.sub(r'\s*/\s*\d+\s*|\s*\d+\s*/\s*', '', cleaned_text)
|
||||
return cleaned_text
|
||||
|
||||
def extract_text_from_pdf(file_path):
|
||||
doc = fitz.open(file_path)
|
||||
# 从PDF文件中提取文本
|
||||
common_header = extract_common_header(file_path)
|
||||
pdf_document = PdfReader(file_path)
|
||||
text = ""
|
||||
for page in doc:
|
||||
page_text = page.get_text()
|
||||
page_text = clean_page_numbers(page_text)
|
||||
text += page_text
|
||||
# 遍历每一页
|
||||
for page in pdf_document.pages:
|
||||
# 提取当前页面的文本
|
||||
page_text = page.extract_text() if page.extract_text() else ""
|
||||
# 清洗页面文本
|
||||
page_text = clean_page_content(page_text, common_header)
|
||||
# 将清洗后的文本添加到总文本中
|
||||
text += page_text+"\n"
|
||||
return text
|
||||
|
||||
def extract_section(text, start_pattern, end_phrases):
|
||||
@ -33,6 +31,7 @@ def extract_section(text, start_pattern, end_phrases):
|
||||
return "" # 如果没有找到匹配的开始模式,返回空字符串
|
||||
start_index = start_match.start()
|
||||
end_index = len(text)
|
||||
print(text[start_index:])
|
||||
for phrase in end_phrases:
|
||||
# Use multiline mode with `re.MULTILINE`
|
||||
match = re.search(phrase, text[start_index:], re.MULTILINE) #Hello, world!\nWelcome to OpenAI. 在多行字符串多,要 re.MULTILINE以匹配每一行的开头,否则只会匹配字符串的开头。
|
||||
@ -146,13 +145,13 @@ def convert_clause_to_json(input_path,output_folder,type=1):
|
||||
return output_path
|
||||
|
||||
if __name__ == "__main__":
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest16_tobidders_notice.pdf'
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf'
|
||||
start_word = "投标人须知正文"
|
||||
end_phrases = [
|
||||
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
||||
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
||||
]
|
||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
|
||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹'
|
||||
try:
|
||||
output_path = convert_clause_to_json(file_path,output_folder)
|
||||
print(f"Final JSON result saved to: {output_path}")
|
||||
|
@ -1,47 +1,237 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
def postprocess(data):
|
||||
"""递归地转换字典中的值为列表,如果所有键对应的值都是'/', '{}' 或 '未知'"""
|
||||
def convert_dict(value):
|
||||
# 如果所有值是'/', '{}' 或 '未知'
|
||||
if all(v in ['/', '未知', {}] for v in value.values()):
|
||||
return list(value.keys())
|
||||
else:
|
||||
# 如果不满足条件,则递归处理嵌套的字典
|
||||
return {k: convert_dict(v) if isinstance(v, dict) else v for k, v in value.items()}
|
||||
import json
|
||||
|
||||
# 递归处理顶层数据
|
||||
return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()}
|
||||
|
||||
def combine_technical_and_business(data, target_values1, target_values2):
|
||||
extracted_data = {} # 根级别存储所有数据
|
||||
technical_found = False
|
||||
business_found = False
|
||||
|
||||
def extract_nested(data, parent_key='', is_technical=False, is_business=False):
|
||||
nonlocal technical_found, business_found
|
||||
if isinstance(data, dict):
|
||||
for key, value in data.items():
|
||||
current_key = f"{parent_key}.{key}" if parent_key else key
|
||||
|
||||
# 检查是否为技术标的内容
|
||||
if any(target in key for target in target_values1): #模糊匹配
|
||||
if not is_technical:
|
||||
# 直接存储在根级别
|
||||
extracted_data[key] = value
|
||||
technical_found = True
|
||||
# 标记为技术标内容并停止进一步处理这个分支
|
||||
continue
|
||||
|
||||
# 检查是否为商务标的内容
|
||||
elif any(target in key for target in target_values2):
|
||||
if not is_business:
|
||||
# 存储在'商务标'分类下
|
||||
if '商务标' not in extracted_data:
|
||||
extracted_data['商务标'] = {}
|
||||
extracted_data['商务标'][key] = value
|
||||
business_found = True
|
||||
# 标记为商务标内容并停止进一步处理这个分支
|
||||
continue
|
||||
|
||||
# 如果当前值是字典或列表,且不在技术或商务分类下,继续递归搜索
|
||||
if isinstance(value, dict) or isinstance(value, list):
|
||||
extract_nested(value, current_key, is_technical, is_business)
|
||||
|
||||
elif isinstance(data, list):
|
||||
for index, item in enumerate(data):
|
||||
extract_nested(item, f"{parent_key}[{index}]", is_technical, is_business)
|
||||
|
||||
# 开始从顶级递归搜索
|
||||
extract_nested(data)
|
||||
|
||||
# 处理未找到匹配的情况
|
||||
if not technical_found:
|
||||
extracted_data['技术标'] = ''
|
||||
if not business_found:
|
||||
extracted_data['商务标'] = ''
|
||||
|
||||
return extracted_data
|
||||
|
||||
|
||||
# 示例数据
|
||||
data = {
|
||||
"第一包": {
|
||||
"办公电子设备": {
|
||||
"技术参数或采购要求": "1、服务器、台式计算机、便携式计算机、信息安全设备、喷墨打印机、激光打印机、针式打印机、液晶显示器、扫描仪、基础软件、信息安全软件、复印机、投影仪、多功能一体机、触控一体机、碎纸机、复印纸; 2、相关要求:供应商必须在市内有固定的经营场所,并提供相关经营场所的房产证明材料(自有场所要求提供房产证(不动产证)等证明文件、租赁场所要求提供房产租赁合同等证明文件)。供应商注册地不在湖北省内的,应在项目地设立分支机构,并提供相关证明材料(以工商营业执照和财政监管部门核实或备案证明材料为准)。所投货物应满足国家的强制性标准,执行有关政府采购政策,符合国家相关产业政策,合法销售、原厂原装、全新产品。在质保期内严格执行“三包”服务规定,提供设备标准现场保修、技术支持服务及备品、备件支持服务。安装的任何零配件,必须是其投标货物制造商原产或是经其认可的产品。3、报价要求:投标人的投标报价是基于市场实际成交价的基础上给予的优惠价格。每项货物内容只允许有一个报价。投标文件中的报价为投标人对采购单位提供该项货物的最高限价,且低于投标人市场成交的最低价。严格依据招标文件要求和我方投标文件的承诺,按中标价格优惠承诺向采购单位提供服务。不实行任何形式的区域限制,不在框架协议内容之外,提出任何附加条款。全面履行投标承诺,圆满完成采购单位的相关定点服务工作,确保质量,提供快捷、方便、满意的服务。",
|
||||
"数量": "采购清单中未具体指定数量"
|
||||
"一包评分标准": {
|
||||
"报价部分": {
|
||||
"价格评议": {
|
||||
"分值": 30,
|
||||
"评分标准": "对确定资格性和符合性审查合格的投标文件进行价格评议,报价分采用低价优先法计算,即满足招标文件要求且投标价格最低的投标报价为评标基准价,其报价分30分。其他投标人的价格分按照下列公式计算:投标报价得分=(评标基准价/投标报价)×30%×100(得分保留小数点后2位)"
|
||||
}
|
||||
},
|
||||
"商务部分": {
|
||||
"企业实力": [
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "投标人具有有效期内的CCRC信息安全集成服务资质贰级以上资质证书,得2分,不具备得0分。(原件备查)"
|
||||
},
|
||||
"软件及耗材": {
|
||||
"服务器": "/",
|
||||
"台式计算机": "/",
|
||||
"便携式计算机": "/",
|
||||
"信息安全设备": "/",
|
||||
"喷墨打印机": "/",
|
||||
"激光打印机": "/",
|
||||
"针式打印机": "/",
|
||||
"液晶显示器": "/",
|
||||
"扫描仪": "/",
|
||||
"基础软件": "/",
|
||||
"信息安全软件": "/",
|
||||
"复印机": "/",
|
||||
"投影仪": "/",
|
||||
"多功能一体机": "/",
|
||||
"触控一体机": "/",
|
||||
"碎纸机": "/",
|
||||
"复印纸": "/"
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "投标人有健全的信息技术运维服务能力,并通过ITSS信息技术服务运维标准符合性认证证书,贰级以上得2分,不具备得0分。(原件备查)"
|
||||
},
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "投标人具有ISO 14001环境管理体系认证以及OHSAS 18001职业健康安全管理体系认证,具有ISO 20000 IT服务管理体系认证以及ISO 27001信息安全管理体 系认证,每满足一项得2分,否则不得分。(原件备查)"
|
||||
},
|
||||
{
|
||||
"分值": 3,
|
||||
"评分标准": "投标人同时具有省级及以上高新技术企业证书、AAA企业信用等级证书、上 年度纳税信用评价为A级,完全满足得3分,每缺一项扣1分,直至得0分;(原件备查)"
|
||||
},
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "所投主要产品制造商应具备较强的合同执行诚信度和执行能力,要求获得政府权威机构颁发的国家级或省级守合同重信用公示企业,国家级得2分,省级得1分,不提供不得分。(原件备查)"
|
||||
},
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "所投主要产品制造商通过CMMI软件能力成熟度模型集成认证,认证等级不低于5级得2分,4级得1分,其他不得分。(原件备查)"
|
||||
},
|
||||
{
|
||||
"分值": 3,
|
||||
"评分标准": "投标人近三年以来具有类似项目经验(提供中标通知书及合同原件)。每提供一个得1分,最多得3分。"
|
||||
},
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "投标人具备GB/T27922五星级服务认证证书的得2分,不满足得0分。(原件备查)"
|
||||
}
|
||||
],
|
||||
"产品制造商实力": [
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "为保证系统的稳定性,所投产品制造商应具有完善的工业信息安全应急体系、良好的安全应急能力,能被工业信息安全产业发展联盟认定为“工业信息安全应急服务支撑单位”,满足得2分,否则不得分。(原件备查)"
|
||||
},
|
||||
{
|
||||
"分值": 2,
|
||||
"评分标准": "所投主要产品制造商通过CMMI软件能力成熟度模型集成认证,认证等级不低于5级得2分,4级得1分,其他不得分。(原件备查)"
|
||||
}
|
||||
]
|
||||
},
|
||||
"技术部分": {
|
||||
"技术要求": {
|
||||
"分值": 30,
|
||||
"评分标准": "投标人所投产品完全满足招标文件要求的30分,'★'指标不符合招标文件技术参数的视为无效投标,针对'★'指标,投标人须对应招标文件第三章商务技术要求产品参数要求逐条提供包括但不限于第三方产品检测报告、产品技术白皮书、产品官网或公开的技术资料等佐证资料,并加盖产品制造商公章;'▲'指标不满足招标文件第三章商务技术要求产品参数要求的每项扣2分,其他非指标性参数不满足招标文件第三章商务技术要求产品参数要求的每项扣1分,扣完为止,未提供佐证资料的将作产品不响应处理,'▲'指标产品参数要求必须提供第三方权威机构检测报告佐证并加盖产品制造商公章,非指标参数提供产品彩页佐证并加盖产品制造商公章即可。",
|
||||
"备注": "针对'★'指标,投标人须对应招标文件第三章商务技术要求产品参数要求逐条提供包括但不限于第三方产品检测报告、产品技术白皮书、产品官网或公开的技术资料等佐证资料,并加盖产品制造商公章;'▲'指标产品参数要求必须提供第三方权威机构检测报告佐证并加盖产品制造商公章,非指标参数提供产品彩页佐证并加盖产品制造商公章即可。"
|
||||
},
|
||||
"质量保证": {
|
||||
"分值": 2,
|
||||
"评分标准": "投标人所投的电子警察系统、信号控制系统,如果不是自己生产的,需提供制造商出具的授权及满足招标质保要求的售后服务承诺函,提供得2分;(开标时提供授权书及售后服务承诺函原件予以证明,否则不得分。)"
|
||||
},
|
||||
"实施方案": {
|
||||
"分值": 5,
|
||||
"评分标准": "投标人对于本项目的实施保障方案中包含:①施工组织方案、②施工协调经验、③进度计划、④质量保障、⑤工期保障五个方案的可行性、合理性、实用性等方面进行综合性评分,每个实施方案清晰、完整、合理、可行的得1分。"
|
||||
},
|
||||
"实施团队": {
|
||||
"分值": 9,
|
||||
"评分标准": "1、投标人拟派项目经理具有机电工程专业壹级注册建造师资格及安全生产考核合格证书(B证),并同时具备相关专业高级工程师职称的,得2分;其他不得分;2、项目技术负责人具备相关专业高级工程师职称及高级信息系统项目管理师证书的,得2分;其他不得分;3、项目管理团队中的施工员、质量员、安全员(C证)、材料员、资料员须具有中级或以上职称;完全满足得5分,每缺一个扣1分,直至得0分;(以上人员不可兼任,提供证书复印件、身份证、劳动合同及社保证明文件加盖公章)"
|
||||
},
|
||||
"运维服务": {
|
||||
"分值": 2,
|
||||
"评分标准": "投标人应对维护期给出合理、完善的运维及售后服务方案。包括售后服务团队管理、运维服务内容(日常运作、服务咨询、巡检保养、主动监测、故障修复、特殊保障和升级优化)、备件方案、运维服务流程、运维管理,服务方案清晰、完整、合理、可行2分;服务方案较完整1分;服务方案不完整,不明确得0分。"
|
||||
},
|
||||
"培训方案": {
|
||||
"分值": 2,
|
||||
"评分标准": "投标人应针对本项目制定全面、可操作性强的技术培训方案有具体的培训时间、人员安排的2分,其它得0分。"
|
||||
}
|
||||
}
|
||||
},
|
||||
"二包评分标准": {
|
||||
"报价部分": {
|
||||
"价格评议": {
|
||||
"分值": 30,
|
||||
"评分标准": "对确定资格性和符合性审查合格的投标文件进行价格评议,报价分采用低价优先法计算,即满足招标文件要求且投标价格最低的投标报价为评标基准价,其报价分30分。其他投标人的价格分按照下列公式计算:投标报价得分=(评标基准价/投标报价)×30%×100(得分保留小数点后2位)"
|
||||
}
|
||||
},
|
||||
"商务部分": {
|
||||
"企业实力": [
|
||||
{
|
||||
"分值": 3,
|
||||
"评分标准": "投标人具有三级交通运输企业安全生产标准化建设等级证明的得1分;具有二级交通运输企业安全生产标准化建设等级证明的得2分;具有一级交通运输企业安全生产标准化建设等级证明的得3分。(提供相关证明材料加盖公章)"
|
||||
},
|
||||
{
|
||||
"分值": 3,
|
||||
"评分标准": "投标人具有ISO 14001环境管理体系认证以及OHSAS 18001职业健康安全管理体系认证,具有ISO 09001质量管理体系认证,全部满足得3分,每满足一项得1分,否则不得分。(原件备查)"
|
||||
},
|
||||
{
|
||||
"分值": 4,
|
||||
"评分标准": "投标人具有AAA企业信用等级证书且在有效期内的、上年度纳税信用评价为A级,完全满足得4分,每缺一项扣2分,直至得0分;(提供相关证明材料)"
|
||||
},
|
||||
{
|
||||
"分值": 8,
|
||||
"评分标准": "投标人近三年以来具有类似项目经验(提供中标通知书及合同原件)。每提供一个得2分,最多得8分。"
|
||||
},
|
||||
{
|
||||
"分值": 9,
|
||||
"评分标准": "1、投标人拟派项目经理具有高级工程师职称的,得2分;其他不得分;2、项目技术负责人具备相关专业高级工程师职称,得2分;其他不得分;3、项目管理团队中的施工员、质量员、安全员(C证)、材料员、资料员须具有中级或以上职称;完全满足得5分,每缺一个扣1分,直至得0分;(以上人员不可兼任,提供证书复印件、身份证、劳动合同及社保证明文件加盖公章)"
|
||||
}
|
||||
]
|
||||
},
|
||||
"技术部分": {
|
||||
"技术要求": {
|
||||
"分值": 20,
|
||||
"评分标准": "投标人所投产品完全满足招标文件要求的20分,'▲'指标低于招标文件第三章商务技术要求产品参数要求的每项扣3分,扣完为止。"
|
||||
},
|
||||
"质量保证措施": {
|
||||
"分值": 8,
|
||||
"评分标准": "投标人提供的用于本项目的质量保障措施,符合采购要求,并包含以下几点:(1)质量管理目标;(2)质量管理规章制度;(3)质量管理方案;(4)质量承诺。每一项2分,完全满足得8分,缺一项扣2分,直至得0分。"
|
||||
},
|
||||
"售后服务承诺": {
|
||||
"分值": 12,
|
||||
"评分标准": "1、投标人承诺对招标人提供优先供货服务,以满足招标人的紧急需求;(2)投标人承诺接到故障报修,24小时内提供解决方案并开始组织实施;(3)投标人承诺现场施工降低污染,文明施工;每一项5分。完全满足得12分,缺一项扣4分,直至得0分。"
|
||||
},
|
||||
"培训方案": {
|
||||
"分值": 3,
|
||||
"评分标准": "投标人应针对本项目制定全面、可操作性强的技术培训方案有具体的培训时间、人员安排的3分,其它得0分。"
|
||||
}
|
||||
}
|
||||
},
|
||||
"三包评分标准": {
|
||||
"报价部分": {
|
||||
"分值": 40,
|
||||
"评分标准": "对确定资格性和符合性审查合格的投标文件进行价格评议,报价分采用低价优先法计算,即满足招标文件要求且投标价格最低的投标报价为评标基准价,其报价分40分。其他投标人的价格分按照下列公式计算:投标报价得分=(评标基准价/投标报价)×40%×100(得分保留小数点后2位)"
|
||||
},
|
||||
"技术指标": {
|
||||
"分值": 40,
|
||||
"评分标准": "1)供应商的技术投标书的响应性、符合性、完整性及编制质量,在技术参数*号项、普通项全部满足的得满分40分;2)技术参数中*号项一项不满足的扣2分,扣完为止;3)技术参数中普通项一项不满足扣1分,扣完为止;4)复制招标文件技术要求作为实际响应数据或投标响应数据无对应支持文件的,其技术响应将按负偏离处理。"
|
||||
},
|
||||
"服务保障": {
|
||||
"分值": 10,
|
||||
"评分标准": "投标人应对维护期给出合理、完善的运维及售后服务方案。包括售后服务团队管理、运维服务内容(日常运作、服务咨询、巡检保养、主动监测、故障修复、特殊保障和升级优化)、备件方案、运维服务流程、运维管理,服务方案清晰、完整、合理、可行2分;服务方案较完整1分;服务方案不完整,不明确得0分。"
|
||||
},
|
||||
"业绩证明": {
|
||||
"分值": 5,
|
||||
"评分标准": "投标人近三年以来具有类似项目经验(提供中标通知书及合同原件)。每提供一个得2.5分,最多得5分。"
|
||||
},
|
||||
"投标文件的规范性": {
|
||||
"分值": 2,
|
||||
"评分标准": "投标文件的制作质量、文档编排、规范性、方便查阅性等进行综合比较:优得2分,良得1分。"
|
||||
},
|
||||
"售后服务": {
|
||||
"分值": 3,
|
||||
"评分标准": "产品故障响应:省内24小时、省外48小时内到达现场处理并修复,且有完善售后服务网络和巡检方案的得3分,不完善售后服务酌情扣分。"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# 转换字典
|
||||
converted_data = postprocess(data)
|
||||
print(converted_data)
|
||||
def get_evaluation_standards(truncate_file):
|
||||
|
||||
include = ['一包', '二包', '三包', '四包', '五包']
|
||||
target_values1 = ['技术', '设计', '实施', '方案']
|
||||
target_values2 = ['投标报价', '商务标', '商务部分', '报价部分', '业绩', '信誉', '分值', '计算公式', '信用', '人员',
|
||||
'资格', '奖项', '认证', '荣誉']
|
||||
updated_jsons = {}
|
||||
|
||||
for key in data.keys():
|
||||
if any(item in key for item in include):
|
||||
inner_dict = data[key]
|
||||
# 将处理后的结果存储到updated_jsons中,每个包名为键
|
||||
updated_jsons[key] = combine_technical_and_business(inner_dict, target_values1, target_values2)
|
||||
|
||||
# 将updated_jsons转换为JSON格式
|
||||
evaluation_combined_res = json.dumps(updated_jsons, ensure_ascii=False, indent=4)
|
||||
return evaluation_combined_res
|
||||
|
||||
res=get_evaluation_standards("1")
|
||||
print(res)
|
||||
|
90
flask_app/货物标/商务服务其他要求提取.py
Normal file
90
flask_app/货物标/商务服务其他要求提取.py
Normal file
@ -0,0 +1,90 @@
|
||||
import json
|
||||
import re
|
||||
from PyPDF2 import PdfReader
|
||||
from flask_app.main.json_utils import clean_json_string, combine_json_results
|
||||
from flask_app.main.多线程提问 import multi_threading
|
||||
from flask_app.main.通义千问long import qianwen_long, upload_file
|
||||
from flask_app.货物标.货物标截取pdf import extract_common_header, clean_page_content
|
||||
|
||||
|
||||
def find_exists(truncate_file, required_keys):
    """Return the entries of ``required_keys`` that occur in the procurement section.

    The cleaned text of every page of ``truncate_file`` is concatenated, the
    section that follows the first chapter heading matched by the start
    pattern is cut out (up to the next chapter heading, or to the end of the
    text when there is none), all whitespace is removed from it, and each
    required key is then looked up as a plain substring.

    Args:
        truncate_file: Path of the PDF to scan.
        required_keys: Requirement names to look for.

    Returns:
        The keys that were found, in the order given; ``[]`` when no start
        heading is present.
    """
    common_header = extract_common_header(truncate_file)
    pdf_document = PdfReader(truncate_file)

    section_start = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
    section_end = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)

    # One newline-terminated chunk per page, headers/footers already cleaned.
    full_text = "".join(
        clean_page_content(page.extract_text() or "", common_header) + '\n'
        for page in pdf_document.pages
    )

    opening = section_start.search(full_text)
    if opening is None:
        return []

    # Everything after the start heading, trimmed at the next chapter heading.
    tail = full_text[opening.end():]
    closing = section_end.search(tail)
    section = tail if closing is None else tail[:closing.start()]

    # Drop every whitespace character (newlines included) so that keys broken
    # across lines by the PDF extraction can still be matched.
    section = re.sub(r'\s+', '', section)
    print(section)

    optional_punct = r"[,。?!、;:,.?!]*"
    found = []
    for req in required_keys:
        if req not in section:
            continue
        if req == "服务要求":
            # "服务要求" is ignored when the section contains it directly
            # after "技术" (optionally separated by punctuation only).
            technical_prefixed = r'技术\s*' + optional_punct + re.escape(req)
            if re.search(technical_prefixed, section):
                continue
        found.append(req)

    return found
|
||||
|
||||
|
||||
def generate_queries(truncate_file, required_keys):
    """Build one LLM prompt per requirement key that is present in the PDF.

    Args:
        truncate_file: Path of the PDF; passed to ``find_exists`` to decide
            which keys are present.
        required_keys: Candidate requirement names.

    Returns:
        A list of prompt strings, one per detected key. Each prompt tells the
        model which key to answer, lists the other detected keys as topics to
        leave out, and asks for '未知' where nothing is found. Every prompt is
        also printed.
    """
    present_keys = find_exists(truncate_file, required_keys)
    user_query_template = "这是一份货物标中采购要求部分的内容,请告诉我\"{}\"是什么,请以json格式返回结果,外层键名是\"{}\",内层键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答具体设备的技术要求,"

    prompts = []
    for current in present_keys:
        pieces = [user_query_template.format(current, current)]

        # Name the sibling requirements so the model does not repeat them.
        siblings = [name for name in present_keys if name != current]
        if siblings:
            pieces.append("也不需要回答\"{}\"中的内容,".format("\"和\"".join(siblings)))

        pieces.append("若相关要求不存在,在键值中填'未知'。")

        prompt = "".join(pieces)
        prompts.append(prompt)
        print(prompt)
    return prompts
|
||||
|
||||
|
||||
def get_business_requirements(truncate_file):
    """Extract the business, service and other requirements of a PDF as JSON text.

    The prompts are generated first; the file is uploaded and the model is
    queried only when at least one prompt exists, so a document without any of
    the requirement headings no longer costs an upload.

    Args:
        truncate_file: Path of the procurement-requirements PDF.

    Returns:
        A JSON string (non-ASCII kept, 4-space indent) of the combined
        answers. "商务要求", "服务要求" and "其他要求" are always present and
        default to "" when the model returned nothing for them.
    """
    required_keys = ["商务要求", "服务要求", "其他要求"]
    queries = generate_queries(truncate_file, required_keys)

    results = None
    if queries:
        # Upload lazily: the file id is only ever consumed by the queries.
        file_id = upload_file(truncate_file)
        results = multi_threading(queries, "", file_id, 2)

    # Each result is unpacked as a pair; only its second element is kept.
    business_requirements = [res for _, res in results] if results else []

    # Merge the answers, then fill in any requirement that is missing.
    final_res = combine_json_results(business_requirements)
    for key in required_keys:
        final_res.setdefault(key, "")

    return json.dumps(final_res, ensure_ascii=False, indent=4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual run against a sample file; the path is specific to one developer
    # machine and has to be adjusted before use.
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    requirements_json = get_business_requirements(truncate_file)
    print(requirements_json)
|
@ -3,7 +3,7 @@ import json
|
||||
import os
|
||||
|
||||
from flask_app.main.多线程提问 import multi_threading
|
||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.main.通义千问long import qianwen_long, upload_file
|
||||
from flask_app.main.json_utils import clean_json_string, combine_json_results
|
||||
|
||||
|
||||
@ -65,15 +65,15 @@ def postprocess(data):
|
||||
|
||||
# 递归处理顶层数据
|
||||
return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()}
|
||||
def get_technical_requirements(truncate_file):
|
||||
def get_technical_requirements(file_id):
|
||||
user_query1 = "这是一份货物标中采购要求部分的内容,请告诉我需要采购的系统(货物),如果有采购清单,请直接根据清单上的货物名称给出结果,若没有采购清单,你要从文中摘取需要采购的系统(货物),采购需求中可能包含层次关系,如大系统中包含若干子系统,你需要保留这种层次关系,给出系统(货物)名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
|
||||
file_id = upload_file(truncate_file)
|
||||
res = qianwen_long(file_id, user_query1)
|
||||
print(res)
|
||||
cleaned_res = clean_json_string(res)
|
||||
keys_list ,no_keys_added= generate_key_paths(cleaned_res['采购需求']) # 提取需要采购的货物清单
|
||||
if '采购需求' in cleaned_res:
|
||||
cleaned_res['技术要求'] = cleaned_res.pop('采购需求')
|
||||
if no_keys_added:
|
||||
final_res = postprocess(cleaned_res['采购需求'])
|
||||
final_res = postprocess(cleaned_res)
|
||||
else:
|
||||
user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的技术参数(或采购要求)和数量,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
|
||||
queries = []
|
||||
@ -93,8 +93,8 @@ def get_technical_requirements(truncate_file):
|
||||
technical_requirements_combined_res = combine_json_results(technical_requirements)
|
||||
"""根据所有键是否已添加处理技术要求"""
|
||||
# 更新原始采购需求字典
|
||||
combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res)
|
||||
final_res = postprocess(cleaned_res['采购需求'])
|
||||
combine_and_update_results(cleaned_res['技术要求'], technical_requirements_combined_res)
|
||||
final_res = postprocess(cleaned_res)
|
||||
print("更新后的采购需求处理完成.")
|
||||
# 输出最终的 JSON 字符串
|
||||
json_string = json.dumps(final_res, ensure_ascii=False, indent=4)
|
||||
@ -125,8 +125,9 @@ def test_all_files_in_folder(input_folder, output_folder):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招标文件(2)_procurement.pdf"
|
||||
res=get_technical_requirements(truncate_file)
|
||||
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||
file_id = upload_file(truncate_file)
|
||||
res=get_technical_requirements(file_id)
|
||||
print(res)
|
||||
# input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
|
||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
|
@ -2,8 +2,8 @@ import json
|
||||
|
||||
from flask_app.货物标.货物标截取pdf import truncate_pdf_main
|
||||
from flask_app.main.format_change import docx2pdf, pdf2docx
|
||||
from flask_app.货物标.技术服务商务要求提取 import get_technical_requirements
|
||||
|
||||
from flask_app.货物标.技术要求提取 import get_technical_requirements
|
||||
from flask_app.main.通义千问long import upload_file
|
||||
#获取采购清单
|
||||
def fetch_purchasing_list(file_path,output_folder,file_type):
|
||||
if file_type==1:
|
||||
@ -16,7 +16,8 @@ def fetch_purchasing_list(file_path,output_folder,file_type):
|
||||
print("未传入指定格式的文件!")
|
||||
return None
|
||||
truncate_file=truncate_pdf_main(pdf_path,output_folder,1)
|
||||
tech_reqs=get_technical_requirements(truncate_file)
|
||||
file_id = upload_file(truncate_file)
|
||||
tech_reqs=get_technical_requirements(file_id)
|
||||
|
||||
if __name__ == "__main__":
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
|
||||
|
@ -9,6 +9,11 @@
|
||||
这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求和数量,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。
|
||||
#这是一份货物标中采购要求部分的内容,请你给出"网络硬盘录像机"的具体型号参数要求,请以json格式返回结果,外层键名为"网络硬盘录像机",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。
|
||||
|
||||
|
||||
这是一份货物标中采购要求部分的内容,请告诉我商务要求和其他要求是什么,请以json格式返回结果,外层键名分别是"商务要求"和"其他要求",内层键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答具体设备的技术要求,若相关要求不存在,在键值中填"未知"。
|
||||
|
||||
"根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
|
||||
|
||||
{
|
||||
"采购需求": {
|
||||
"硬盘录像机 A": {},
|
||||
|
131
flask_app/货物标/评分标准提取.py
Normal file
131
flask_app/货物标/评分标准提取.py
Normal file
@ -0,0 +1,131 @@
|
||||
import json
|
||||
|
||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.main.json_utils import clean_json_string
|
||||
|
||||
def combine_technical_and_business(data, target_values1, target_values2):
    """Regroup a nested scoring structure into technical and business parts.

    ``data`` is walked depth-first. For every dict key:

    * if the key contains any string of ``target_values1`` (substring match),
      the key and its value are stored at the root of the result;
    * otherwise, if it contains any string of ``target_values2``, they are
      stored under the ``'商务标'`` entry of the result;
    * otherwise the value is searched further when it is a dict or a list.

    A matched value is stored as is and is not searched any further. The
    technical check has priority when a key matches both lists.

    Args:
        data: Nested dicts/lists (typically parsed JSON).
        target_values1: Substrings that mark a technical key.
        target_values2: Substrings that mark a business key.

    Returns:
        A new dict. ``'技术标'`` is set to ``''`` when no technical key was
        found and ``'商务标'`` is set to ``''`` when no business key was found.
    """
    extracted_data = {}  # everything is collected at the root level
    technical_found = False
    business_found = False

    # The earlier version threaded "is_technical"/"is_business" flags and a
    # dotted key path through the recursion, but the flags were never set and
    # the path was never read, so they are dropped here.
    def extract_nested(node):
        nonlocal technical_found, business_found
        if isinstance(node, dict):
            for key, value in node.items():
                if any(target in key for target in target_values1):  # fuzzy match
                    extracted_data[key] = value
                    technical_found = True
                elif any(target in key for target in target_values2):
                    if '商务标' not in extracted_data:
                        extracted_data['商务标'] = {}
                    extracted_data['商务标'][key] = value
                    business_found = True
                elif isinstance(value, (dict, list)):
                    # Not classified yet: keep looking inside the container.
                    extract_nested(value)
        elif isinstance(node, list):
            for item in node:
                extract_nested(item)

    extract_nested(data)

    # Guarantee both categories exist in the result.
    if not technical_found:
        extracted_data['技术标'] = ''
    if not business_found:
        extracted_data['商务标'] = ''

    return extracted_data
|
||||
|
||||
|
||||
def get_evaluation_standards(truncate_file):
    """Ask the model for the scoring standards of a PDF and regroup them per package.

    The file is uploaded, the model is asked for the technical, business and
    price scoring rules, and the cleaned answer is filtered to the top-level
    keys that name a package ('一包' … '五包'). Each package is regrouped with
    ``combine_technical_and_business``.

    Args:
        truncate_file: Path of the evaluation-method PDF.

    Returns:
        A JSON string (non-ASCII kept, 4-space indent) mapping each package
        key to its regrouped scoring dict.
    """
    user_query = "根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容"
    file_id = upload_file(truncate_file)
    raw_answer = qianwen_long(file_id, user_query)
    parsed_answer = clean_json_string(raw_answer)

    package_markers = ['一包', '二包', '三包', '四包', '五包']
    technical_markers = ['技术标', '技术部分', '设计', '实施', '方案']
    business_markers = ['投标报价', '商务标', '商务部分', '报价部分', '业绩', '信誉', '分值', '计算公式', '信用', '人员',
                        '资格', '奖项', '认证', '荣誉']

    # NOTE(review): top-level keys without a package marker are dropped, so an
    # answer that is not split into packages produces "{}" — confirm intended.
    per_package = {
        package: combine_technical_and_business(content, technical_markers, business_markers)
        for package, content in parsed_answer.items()
        if any(marker in package for marker in package_markers)
    }

    return json.dumps(per_package, ensure_ascii=False, indent=4)
|
||||
# NOTE(review): this repeats the definition of the same name that appears
# earlier in this file; being the later one, it is the binding left in place
# once the module has been executed.
def combine_technical_and_business(data, target_values1, target_values2):
    """Split a nested scoring structure into technical and business entries.

    Keys containing any ``target_values1`` string go to the root of the
    result; otherwise keys containing any ``target_values2`` string go under
    ``'商务标'``; any other dict/list value is searched further. Matched values
    are stored untouched and not searched. The walk is depth-first in
    insertion order, so the result's key order follows the order in which
    matches are met.

    Args:
        data: Nested dicts/lists to search.
        target_values1: Substrings identifying technical keys.
        target_values2: Substrings identifying business keys.

    Returns:
        The collected dict, with ``'技术标'`` and/or ``'商务标'`` set to ``''``
        when the respective category was never matched.
    """
    collected = {}
    seen = {"technical": False, "business": False}

    def contains_any(name, needles):
        return any(needle in name for needle in needles)

    def unmatched_children(node):
        # Lazily classify the entries of one container; only the containers
        # that still need searching are yielded to the driver loop below.
        if isinstance(node, dict):
            for name, payload in node.items():
                if contains_any(name, target_values1):
                    collected[name] = payload
                    seen["technical"] = True
                elif contains_any(name, target_values2):
                    collected.setdefault('商务标', {})[name] = payload
                    seen["business"] = True
                elif isinstance(payload, (dict, list)):
                    yield payload
        elif isinstance(node, list):
            yield from node

    # Explicit depth-first walk: the innermost open container is always
    # advanced first, which reproduces the order of a recursive descent.
    exhausted = object()
    open_walkers = [unmatched_children(data)]
    while open_walkers:
        child = next(open_walkers[-1], exhausted)
        if child is exhausted:
            open_walkers.pop()
        else:
            open_walkers.append(unmatched_children(child))

    # Make sure both categories are present.
    if not seen["technical"]:
        collected['技术标'] = ''
    if not seen["business"]:
        collected['商务标'] = ''

    return collected
|
||||
|
||||
if __name__ == "__main__":
    # Manual run against a sample file; the path is specific to one developer
    # machine and has to be adjusted before use.
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\招标文件(107国道)_evaluation_method.pdf"
    res = get_evaluation_standards(truncate_file)
    # NOTE(review): looking up the empty key "" looks unfinished and the
    # result is not used afterwards — confirm which key was meant.
    parsed = clean_json_string(res)
    cleaned_res = parsed.get("")
|
@ -22,13 +22,17 @@ def extract_common_header(pdf_path):
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
headers = []
|
||||
total_pages = len(pdf_document.pages)
|
||||
middle_page = total_pages // 2 # 计算中间页
|
||||
|
||||
# 确保中间页前后各一页(共3页),如果不足3页,则尽可能取足
|
||||
start_page = max(0, middle_page - 1)
|
||||
num_pages_to_read = 3
|
||||
# 确定要读取的页数和起始页
|
||||
if total_pages == 2:
|
||||
pages_to_read = 2
|
||||
start_page = 0
|
||||
else:
|
||||
pages_to_read = 3
|
||||
middle_page = total_pages // 2
|
||||
start_page = max(0, middle_page - 1)
|
||||
|
||||
for i in range(start_page, min(start_page + num_pages_to_read, total_pages)):
|
||||
for i in range(start_page, min(start_page + pages_to_read, total_pages)):
|
||||
page = pdf_document.pages[i]
|
||||
text = page.extract_text() or ""
|
||||
if text:
|
||||
@ -82,67 +86,64 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
||||
print("提供的路径既不是文件夹也不是PDF文件。")
|
||||
return generated_files
|
||||
|
||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None):
    """Locate the first and last page index of a section inside a PDF.

    Pages are scanned in order. Each page's text is passed through
    ``clean_page_content`` together with ``common_header`` before matching.

    Args:
        pdf_document: Reader object exposing ``.pages``; every page must
            provide ``extract_text()``.
        begin_pattern: Regex (compiled or string) marking the section start.
        end_pattern: Regex (compiled or string) marking the section end.
        begin_page: A start match only counts on a page index strictly
            greater than this value.
        common_header: Header text handed to ``clean_page_content``.
        exclusion_pattern: Optional regex; any page matching it is ignored
            entirely (neither start nor end is looked for on it).

    Returns:
        A ``(start_page, end_page)`` tuple of zero-based page indexes; either
        element is ``None`` when the corresponding boundary was not found.
    """
    first_hit = None
    last_hit = None

    for page_no, pdf_page in enumerate(pdf_document.pages):
        raw_text = pdf_page.extract_text() or ""
        body = clean_page_content(raw_text, common_header)

        # Excluded pages are skipped for both start and end detection.
        if exclusion_pattern and re.search(exclusion_pattern, body):
            continue

        if first_hit is None:
            if re.search(begin_pattern, body) and page_no > begin_page:
                first_hit = page_no
            # The page that opens the section can never also close it.
            continue

        # From here on page_no is always greater than first_hit.
        if re.search(end_pattern, body):
            last_hit = page_no
            break

    return first_hit, last_hit
|
||||
|
||||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Extract one section of a PDF and save it through ``save_extracted_pages``.

    The section boundaries are searched with ``extract_pages_generic``. When
    either boundary is missing, the work is delegated to the fallback
    ``extract_pages_twice``.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Folder passed on to the saving / fallback helpers.
        begin_pattern: Regex marking the section start.
        begin_page: Page index a start match must be strictly greater than.
        end_pattern: Regex marking the section end.
        output_suffix: Section tag passed on to the saving / fallback helpers.

    Returns:
        The result of ``save_extracted_pages`` or ``extract_pages_twice``, or
        ``None`` when any exception is raised while processing the file.
    """
    try:
        header = extract_common_header(pdf_path)
        reader = PdfReader(pdf_path)
        first_page, last_page = extract_pages_generic(reader, begin_pattern, end_pattern, begin_page, header)

        boundaries_found = first_page is not None and last_page is not None
        if boundaries_found:
            return save_extracted_pages(reader, first_page, last_page, pdf_path, output_folder, output_suffix)

        # Primary search failed: report it and try the fallback strategy.
        print(f"未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
        return extract_pages_twice(pdf_path, output_folder, output_suffix, header)
    except Exception as e:
        # Best-effort: a failure on one file is reported, not propagated.
        print(f"Error processing {pdf_path}: {e}")
        return None
|
||||
|
||||
def get_patterns_for_procurement():
    """Build the fallback regex pair used for the "procurement" section.

    Returns:
        A ``(begin_pattern, end_pattern)`` tuple of patterns compiled with
        ``re.MULTILINE`` so that ``^`` anchors at the start of every line.
    """
    # Shared prefix: a line starting with a Chinese-numbered chapter/part heading.
    chapter_heading = r'^第[一二三四五六七八九十百千]+(?:章|部分)'

    begin_alternatives = (
        chapter_heading + r'.*?(?:服务|项目|商务).*?要求',
        chapter_heading + r'.*?(?:采购|技术标准).*',
        r'^[一二三四五六七八九十百千]+、\s*采购清单',
    )
    begin_pattern = re.compile('|'.join(begin_alternatives), re.MULTILINE)

    # Heading followed by optional whitespace and at least one CJK character.
    end_pattern = re.compile(chapter_heading + r'\s*[\u4e00-\u9fff]+', re.MULTILINE)
    return begin_pattern, end_pattern
|
||||
|
||||
def get_patterns_for_evaluation_method():
    """Build the fallback regex pair used for the "evaluation_method" section.

    Returns:
        A ``(begin_pattern, end_pattern)`` tuple of patterns compiled with
        ``re.MULTILINE`` so that ``^`` anchors at the start of every line.
    """
    # Shared prefix: a line starting with a Chinese-numbered chapter/part heading.
    chapter_heading = r'^第[一二三四五六七八九十百千]+(?:章|部分)'

    begin_source = chapter_heading + r'.*?(磋商|谈判|评标|评定|评审)(方法|办法).*'
    end_source = chapter_heading + r'\s*[\u4e00-\u9fff]+'

    begin_pattern = re.compile(begin_source, re.MULTILINE)
    end_pattern = re.compile(end_source, re.MULTILINE)
    return begin_pattern, end_pattern
|
||||
|
||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
    """Fallback section extraction using built-in patterns for known sections.

    Looks up the regex pair for ``output_suffix``, searches the PDF with
    ``extract_pages_generic`` (ignoring pages that match the exclusion
    pattern below) and saves the located range via ``save_extracted_pages``.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Folder passed on to ``save_extracted_pages``.
        output_suffix: Section tag; ``"procurement"`` and
            ``"evaluation_method"`` are supported.
        common_header: Header text forwarded to ``extract_pages_generic``.

    Returns:
        The result of ``save_extracted_pages`` on success, or ``""`` when the
        suffix is unsupported or the section boundaries cannot be found.
    """
    # Resolve the patterns before touching the PDF: an unsupported suffix used
    # to leave begin_pattern/end_pattern unbound and crash with
    # UnboundLocalError after the file had already been opened.
    if output_suffix == "procurement":
        begin_pattern, end_pattern = get_patterns_for_procurement()
    elif output_suffix == "evaluation_method":
        begin_pattern, end_pattern = get_patterns_for_evaluation_method()
    else:
        print(f"twice: 不支持的输出类型 {output_suffix},无法处理文件 {pdf_path}!")
        return ""

    # Pages containing these phrases are skipped by the generic search.
    exclusion_pattern = re.compile(r'文件的构成|文件的组成')
    # A start match only counts on a page index greater than this value.
    fallback_begin_page = 5

    pdf_document = PdfReader(pdf_path)
    start_page, end_page = extract_pages_generic(
        pdf_document, begin_pattern, end_pattern, fallback_begin_page, common_header, exclusion_pattern
    )
    if start_page is None or end_page is None:
        print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return ""
    return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||
|
||||
|
||||
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
|
||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
|
||||
@ -168,7 +169,15 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
|
||||
)
|
||||
output_suffix = "procurement"
|
||||
|
||||
elif selection == 2:
|
||||
begin_pattern=re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(磋商|谈判|评标|评定|评审)(方法|办法).*'
|
||||
)
|
||||
begin_page = 5
|
||||
end_pattern = re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
|
||||
)
|
||||
output_suffix = "evaluation_method"
|
||||
else:
|
||||
print("无效的选择")
|
||||
return None
|
||||
@ -178,16 +187,16 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
|
||||
def truncate_pdf_multiple(input_path, output_folder):
    """Run ``truncate_pdf_main`` for selections 1 and 2 and merge the results.

    Args:
        input_path: Input path forwarded to ``truncate_pdf_main``.
        output_folder: Output folder forwarded to ``truncate_pdf_main``.

    Returns:
        A single list holding the items returned for selection 1 followed by
        those returned for selection 2.
    """
    collected = []
    for selection in (1, 2):
        # In-place concatenation keeps the per-selection ordering.
        collected += truncate_pdf_main(input_path, output_folder, selection)
    return collected
|
||||
|
||||
# TODO: the "交通智能系统" and "招标(1)(1)" sample files are still processed incorrectly.
if __name__ == "__main__":
    # Manual run against a local sample tender document.
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交通智能运行维护招标文件.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output2"
    # Alternative entry point that runs every selection:
    # truncate_pdf_multiple(input_path,output_folder)
    # NOTE(review): selection 2 maps to the "evaluation_method" section in
    # truncate_pdf_main; selection 1 appears to be "procurement" - confirm.
    selection = 2
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user