10.23 无效标优化

This commit is contained in:
zy123 2024-10-23 20:33:41 +08:00
parent 7f6ea14439
commit 5cfd6479d4
11 changed files with 199 additions and 116 deletions

View File

@ -48,29 +48,123 @@ def pdf2docx(local_path_in):
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath
def doc2docx(local_path_in):
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
receive_download_url = upload_file(local_path_in, remote_url)
print(receive_download_url)
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
local_filename = os.path.join(folder, filename) # 输出文件名
downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath
def docx2pdf(local_path_in):
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
receive_download_url = upload_file(local_path_in, remote_url)
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
local_filename = os.path.join(folder, filename) # 输出文件名
downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath
# def doc2docx(local_path_in):
# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
# receive_download_url = upload_file(local_path_in, remote_url)
# print(receive_download_url)
# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
# local_filename = os.path.join(folder, filename) # 输出文件名
# downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
# print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
# return downloaded_filepath
# def docx2pdf(local_path_in):
# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
# receive_download_url = upload_file(local_path_in, remote_url)
# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
# local_filename = os.path.join(folder, filename) # 输出文件名
# downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
# print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
# return downloaded_filepath
def docx2pdf(file_path):
    """
    Convert a local .docx or .doc file to .pdf via the remote conversion endpoint.

    The file is uploaded to the ``convert_to_pdf`` endpoint and the response
    body is written next to the input file, using the same base name with a
    ``.pdf`` extension.

    Args:
        file_path: str, path to a local file; .docx and .doc are supported.

    Returns:
        The path of the written .pdf file, or None if the HTTP request failed.

    Raises:
        FileNotFoundError: if ``file_path`` is not an existing file.
        ValueError: if the file extension is neither .docx nor .doc.
    """
    # Fail fast when the input file does not exist.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"文件未找到: {file_path}")

    # Split the file name into stem and (lower-cased, dot-less) extension.
    base_name = os.path.basename(file_path)
    name, ext = os.path.splitext(base_name)
    ext = ext.lower().lstrip('.')
    if ext not in ['docx', 'doc']:
        raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")

    # Conversion endpoint.
    endpoint = 'http://120.26.236.97:5005/convert_to_pdf'
    # The converted file is written into the same directory as the input.
    output_dir = os.path.dirname(file_path)

    # Upload the file; it must stay open for the duration of the request.
    with open(file_path, 'rb') as f:
        files = {'file': (base_name, f)}
        try:
            print(f"正在将 {base_name} 转换为 .pdf 格式...")
            response = requests.post(endpoint, files=files)
            response.raise_for_status()  # raise on 4xx/5xx responses
        except requests.RequestException as e:
            # Best-effort: report the failure and signal it with None.
            print(f"转换过程中发生错误: {e}")
            return None

    # Persist the converted bytes next to the input file.
    output_file_name = f"{name}.pdf"
    output_path = os.path.join(output_dir, output_file_name)
    with open(output_path, 'wb') as out_file:
        out_file.write(response.content)
    print(f"文件已成功转换并保存至: {output_path}")
    # Callers assign the result of this function, so hand back the saved path.
    return output_path
def doc2docx(file_path):
    """
    Convert a local .doc file to .docx via the remote conversion endpoint.

    The file is uploaded to the ``convert_to_docx`` endpoint and the response
    body is written next to the input file, using the same base name with a
    ``.docx`` extension.

    Args:
        file_path: str, path to a local file; only .doc is supported.

    Returns:
        The path of the written .docx file, or None if the HTTP request failed.

    Raises:
        FileNotFoundError: if ``file_path`` is not an existing file.
        ValueError: if the file extension is not .doc.
    """
    # Fail fast when the input file does not exist.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"文件未找到: {file_path}")

    # Split the file name into stem and (lower-cased, dot-less) extension.
    base_name = os.path.basename(file_path)
    name, ext = os.path.splitext(base_name)
    ext = ext.lower().lstrip('.')
    if ext != 'doc':
        raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")

    # Conversion endpoint.
    endpoint = 'http://120.26.236.97:5005/convert_to_docx'
    # The converted file is written into the same directory as the input.
    output_dir = os.path.dirname(file_path)

    # Upload the file; it must stay open for the duration of the request.
    with open(file_path, 'rb') as f:
        files = {'file': (base_name, f)}
        try:
            print(f"正在将 {base_name} 转换为 .docx 格式...")
            response = requests.post(endpoint, files=files)
            response.raise_for_status()  # raise on 4xx/5xx responses
        except requests.RequestException as e:
            # Best-effort: report the failure and signal it with None.
            print(f"转换过程中发生错误: {e}")
            return None

    # Persist the converted bytes next to the input file.
    output_file_name = f"{name}.docx"
    output_path = os.path.join(output_dir, output_file_name)
    with open(output_path, 'wb') as out_file:
        out_file.write(response.content)
    print(f"文件已成功转换并保存至: {output_path}")
    # Callers assign the result of this function, so hand back the saved path.
    return output_path
if __name__ == '__main__':
# 替换为你的文件路径和API URL
local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\6.2定版视频会议磋商文件.doc"
downloaded_file=doc2docx(local_path_in)
# downloaded_file=docx2pdf(local_path_in)
local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
# downloaded_file=doc2docx(local_path_in)
downloaded_file=pdf2docx(local_path_in)
print(downloaded_file)

View File

@ -9,7 +9,7 @@ from flask_app.general.json_utils import clean_json_string
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
from flask_app.货物标.货物标截取pdf import truncate_pdf_specific_goods
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
from flask_app.main.截取pdf import truncate_pdf_specific_engineering
from flask_app.general.post_processing import inner_post_processing
from flask_app.old_version.基础信息整合 import aggregate_basic_info_engineering

View File

@ -56,3 +56,17 @@ def merge_and_cleanup(output_pdf_path, suffix_to_merge):
merge_pdfs(paths, output_pdf_path)
os.remove(another_file_path)
print(f"文件 {another_file_path} 已删除。")
def find_and_merge(target_path, output_suffix):
    """
    Merge target_path with a sibling file whose name ends with output_suffix.

    The directory containing target_path is scanned; if several file names end
    with output_suffix, the last one listed is used. The merged result is
    written back to target_path through merge_pdfs. When no sibling matches,
    merge_pdfs is still called with target_path as the only input.

    Args:
        target_path: path of the PDF that is both a merge input and the output.
        output_suffix: file-name ending used to pick the sibling file.
    """
    # NOTE(review): indentation was lost in the reviewed source; this assumes
    # the merge call runs once, after the directory scan — confirm.
    folder = os.path.dirname(target_path)
    companion = ""
    for entry in os.listdir(folder):
        if entry.endswith(output_suffix):
            # Later matches overwrite earlier ones.
            companion = os.path.join(folder, entry)
    sources = [target_path, companion] if companion else [target_path]
    merge_pdfs(sources, target_path)

View File

@ -13,7 +13,7 @@ import concurrent.futures
from flask_app.main.基础信息整合快速版 import combine_basic_info
from flask_app.main.资格审查模块 import combine_review_standards
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
from flask_app.general.format_change import pdf2docx, docx2pdf
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
from flask_app.general.docx截取docx import copy_docx
def get_global_logger(unique_id):
@ -37,6 +37,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type):
elif file_type == 2: # pdf
pdf_path = downloaded_file_path
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
elif file_type == 3: #doc
pdf_path=docx2pdf(downloaded_file_path)
docx_path=doc2docx(downloaded_file_path)
else:
logger.error("Unsupported file type provided. Preprocessing halted.")
return None

View File

@ -464,7 +464,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下比如说就不进行跳转见xx表 开评定标这里也要考虑 如果评分表为空,也要处理。
#TODO:zbtest8 zbtest18有问题 后期需要完善,截取需要截两次,第一次严格第二次宽松
if __name__ == "__main__":
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf" # 可以是单个PDF文件路径或文件夹路径
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\e378e002-45ff-440d-a65a-9974b5015472\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"

View File

@ -1,76 +1,29 @@
# -*- encoding:utf-8 -*-
import re
import json
def preprocess_data(data):
# 根据需要预处理数据,这里假设原始数据已经是一个字典
return data
# 定义修改后的正则表达式
pattern = re.compile(
r'^(?!.*(?:资格|符合))(?:附录.*?[:]|附件.*?[:]|附表.*?[:]).*$|'
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
re.MULTILINE
)
def transform_json(data):
result = {}
temp = {0: result} # 初始化根字典
data = preprocess_data(data)
# 测试用例文本
text = """附表 :审查审查表(注:本表不需要投标人填写)
技术评审表
序号 评审项目 详细评资格审内容 分值 投标人A 投标人B 投标人C
1 技术方案的总体评价 对供应商技术响应方案是否全面完整清晰合理性等进行评价
对供应商技术响应方案全面完整清晰合理得3分
对供应商技术响应方案比较全面完整清晰合理得1分
对供应商技术响应方案不全面完整清晰合理得0.5
没有提供不得分
"""
# 首先, 创建一个临时字典用于检查是否存在三级标题
has_subkey = {}
for key in data.keys():
parts = key.split('.')
if len(parts) > 2 and parts[1]:
parent_key = '.'.join(parts[:2])
has_subkey[parent_key] = True
# 匹配测试
matches = pattern.findall(text)
for key, value in data.items():
match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', key)
if match:
levels = [int(l) for l in match.groups() if l is not None]
current_level = len(levels)
parent = temp.get(current_level - 1, result)
# 获取标题内容
if current_level == 1:
# 一级标题
parent[value] = {}
temp[current_level] = parent[value]
elif current_level == 2:
# 二级标题
if has_subkey.get(key, False):
parent[value] = {}
if matches:
print("匹配成功!")
for match in matches:
print("匹配内容:", match)
else:
parent[value] = value # 如果没有子键,直接赋值内容
temp[current_level] = parent[value]
elif current_level == 3:
# 三级标题
if isinstance(parent, dict):
# 确保父级是一个字典
parent[value] = value
elif isinstance(parent, list):
parent.append(value)
temp[current_level] = value
else:
# 处理无法匹配的键
print(f"无法匹配的键: {key}")
def remove_single_item_lists(node):
if isinstance(node, dict):
for key in list(node.keys()):
node[key] = remove_single_item_lists(node[key])
if isinstance(node[key], list) and len(node[key]) == 1:
node[key] = node[key][0]
return node
return remove_single_item_lists(result)
# 示例数据
data = {
"20.": "投标文件的式样和签署",
"20.1": "投标文件的式样投标人应准备一份投标文件正本、电子文件和投标须知前附表中规定数目的副本投标文件的副本可采用正本的复印件。每套投标文件须清楚地标明“正本”或“副本”。若副本与正本不符以正本为准装订如下序号投标文件名称装订备注1唱标信封/投标报价一览表加盖公章、《投标保证金汇入情况说明》加盖公章、投标保证金缴付凭证复印件加盖公章2电子文件含价格文件、商务技术文件3价格文件独立装订成册含正、副本4商务技术文件商务文件和技术文件可一起独立装订成册也可含正、副本以分开独立装订成册",
"20.2": "投标文件的装订要求:投标报价表部分必须单独装订,若将投标报价表与商务、技术部分等内容合为装订或在商务、技术部分等内容出现投标报价,则作无效投标处理。",
"20.3": "电子文件用MSWORD/EXCEL简体中文版制作内容包括由投标人自行制作的与正本文件一致的所有文件。电子文件由CD-R光盘或优盘储存放在唱标信封内。",
"20.4": "投标文件的签署:",
"20.4.1": "投标文件的正本需打印或用不褪色墨水书写,副本可以复印。授权代表须将以书面形式出具的《法定代表人授权委托书》附在投标文件中。投标人的投标文件必须按照招标文件“第六部分投标文件格式”已明示法定代表人(或其授权代表)签署和盖章处,必须进行签字(或盖私章)和盖公章,否则视为无效投标。",
"20.4.2": "投标文件中的任何重要的插字、涂改和增删,必须由法定代表人或经其正式授权的代表在旁边签署才有效。",
"20.4.3": "投标文件的“正本”及所有“副本”的封面及每一页都必须由投标人加盖公章或骑缝章或法定代表人(或其授权代表)签署,对于未按要求盖章或签署的投标文件,均作无效投标处理。投标文件的“正本”及所有“副本”的封面及每一页都必须由投标人加盖公章或骑缝章或法定代表人(或其授权代表)签署,对于未按要求盖章或签署的投标文件,均作无效投标处理。"
}
transformed = transform_json(data)
print(json.dumps(transformed, ensure_ascii=False, indent=4))
print("匹配失败!")

View File

@ -4,7 +4,7 @@ from PyPDF2 import PdfReader
from flask_app.general.json_utils import combine_json_results
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.货物标截取pdf import extract_common_header, clean_page_content
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
#正则表达式判断原文中是否有商务、服务、其他要求

View File

@ -202,7 +202,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
elif output_suffix == "qualification1":
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3")
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") #合并'资格审查'章节和'评标办法'章节
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
except Exception as e:
print(f"Error processing {pdf_path}: {e}")
@ -240,7 +240,17 @@ def get_patterns_for_qualification():
r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
return begin_pattern_new, end_pattern_new
def get_patterns_for_qualification2():
    """
    Build a (begin, end) pair of compiled multi-line regexes.

    The begin pattern matches a line that starts with 附录/附件/附表,
    optionally followed by 一 or 1, then a colon, and that mentions 资格 or
    符合性 later on the same line.

    The end pattern matches either a line starting with 附录/附件/附表
    followed (lazily) by a colon that does NOT contain 资格 or 符合, or a
    chapter/part heading of the form 第<Chinese numeral>章 / 第<Chinese numeral>部分
    followed by Chinese characters.

    Returns:
        tuple: (begin_pattern, end_pattern), both compiled with re.MULTILINE.
    """
    start_regex = r'^(?:附录(?:一|1)?[:]|附件(?:一|1)?[:]|附表(?:一|1)?[:]).*(?:资格|符合性).*$'
    stop_regex = (
        r'^(?!.*(?:资格|符合))(?:附录.*?[:]|附件.*?[:]|附表.*?[:]).*$|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    )
    return re.compile(start_regex, re.MULTILINE), re.compile(stop_regex, re.MULTILINE)
def get_patterns_for_notice():
begin_pattern = re.compile(
@ -292,7 +302,13 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, beg
flags = re.MULTILINE if use_multiline else 0
if re.search(mid_pattern, cleaned_text, flags):
mid_page = i
if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
if start_page is not None and re.search(end_pattern, cleaned_text):
if mid_page is None:
if i > start_page:
end_page = i
break
else:
if i > mid_page:
end_page = i
break
return start_page, mid_page, end_page
@ -376,7 +392,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
patterns = [get_patterns_for_evaluation_method()]
begin_page = 5
elif output_suffix == "qualification1":
patterns = [get_patterns_for_qualification()]
patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()]
begin_page = 5
elif output_suffix == "notice":
patterns = [get_patterns_for_notice()]
@ -384,6 +400,8 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
if patterns:
for pattern_pair in patterns:
# print(pattern_pair[0])
# print(pattern_pair[1])
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
common_header,
exclusion_pattern, output_suffix)
@ -635,17 +653,18 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
return truncate_files
# TODO:交通智能系统和招标(1)(1)文件有问题 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1).pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(广水市教育局封闭管理项目二次).pdf"
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1).pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
files = truncate_pdf_multiple(input_path, output_folder)
# files = truncate_pdf_multiple(input_path, output_folder)
# files=truncate_pdf_specific_goods(input_path,output_folder)
print(files)
# selection = 4# 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-公告
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)
# print(files)
selection = 4# 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-公告
generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files)

View File

@ -5,7 +5,7 @@ import os
from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import qianwen_long, upload_file
from flask_app.general.json_utils import clean_json_string, combine_json_results
from flask_app.货物标.货物标截取pdf import truncate_pdf_main
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
def generate_key_paths(data, parent_key=''):
key_paths = []

View File

@ -5,7 +5,7 @@ import os
import fitz
from PyPDF2 import PdfReader
from flask_app.货物标.货物标截取pdf import clean_page_content,extract_common_header
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
def extract_text_from_docx(file_path):
doc = docx.Document(file_path)

View File

@ -1,11 +1,11 @@
# 竞磋 竞谈 磋商 询价 邀请 单一来源
import json
from flask_app.general.format_change import docx2pdf, pdf2docx
from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx
from flask_app.general.json_utils import transform_json_values
from flask_app.货物标.基础信息解析main import combine_basic_info
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
from flask_app.货物标.货物标截取pdf import truncate_pdf_multiple
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
@ -28,7 +28,7 @@ logger = None
executor = ThreadPoolExecutor()
def preprocess_files(output_folder, file_path, file_type, unique_id):
def preprocess_files(output_folder, file_path, file_type):
logger.info("starting 文件预处理...")
logger.info("output_folder..." + output_folder)
@ -41,7 +41,7 @@ def preprocess_files(output_folder, file_path, file_type, unique_id):
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
elif file_type == 3: # doc
pdf_path = docx2pdf(file_path)
docx_path = pdf2docx(pdf_path)
docx_path = doc2docx(file_path)
else:
logger.error("Unsupported file type provided. Preprocessing halted.")
return None
@ -135,7 +135,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
logger = get_global_logger(unique_id)
# 预处理文件,获取处理后的数据
processed_data = preprocess_files(output_folder, file_path, file_type, unique_id)
processed_data = preprocess_files(output_folder, file_path, file_type)
if not processed_data:
yield json.dumps({}) # 如果处理数据失败,返回空的 JSON