10.23 无效标优化
This commit is contained in:
parent
7f6ea14439
commit
5cfd6479d4
@ -48,29 +48,123 @@ def pdf2docx(local_path_in):
|
|||||||
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
|
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
|
||||||
return downloaded_filepath
|
return downloaded_filepath
|
||||||
|
|
||||||
def doc2docx(local_path_in):
|
# def doc2docx(local_path_in):
|
||||||
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
|
# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
|
||||||
receive_download_url = upload_file(local_path_in, remote_url)
|
# receive_download_url = upload_file(local_path_in, remote_url)
|
||||||
print(receive_download_url)
|
# print(receive_download_url)
|
||||||
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
||||||
local_filename = os.path.join(folder, filename) # 输出文件名
|
# local_filename = os.path.join(folder, filename) # 输出文件名
|
||||||
downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
|
# downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
|
||||||
print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
|
# print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
|
||||||
return downloaded_filepath
|
# return downloaded_filepath
|
||||||
def docx2pdf(local_path_in):
|
# def docx2pdf(local_path_in):
|
||||||
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
|
# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
|
||||||
receive_download_url = upload_file(local_path_in, remote_url)
|
# receive_download_url = upload_file(local_path_in, remote_url)
|
||||||
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
||||||
local_filename = os.path.join(folder, filename) # 输出文件名
|
# local_filename = os.path.join(folder, filename) # 输出文件名
|
||||||
downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
|
# downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
|
||||||
print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
|
# print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
|
||||||
return downloaded_filepath
|
# return downloaded_filepath
|
||||||
|
def docx2pdf(file_path):
    """
    Convert a local .docx or .doc file to PDF through the remote conversion service.

    The file is uploaded to the ``convert_to_pdf`` endpoint and the response
    body is written next to the input file as ``<name>.pdf``.

    Args:
        file_path: str, path of the local file; the extension must be
            .docx or .doc (compared case-insensitively).

    Returns:
        The path of the written .pdf file, or None when the HTTP request
        fails (the error is printed rather than raised).

    Raises:
        FileNotFoundError: if ``file_path`` is not an existing file.
        ValueError: if the extension is neither .docx nor .doc.
    """
    # Fail fast when the input file does not exist.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"文件未找到: {file_path}")

    # Split the file name into stem and normalised (lower-case, dot-less) extension.
    base_name = os.path.basename(file_path)
    name, ext = os.path.splitext(base_name)
    ext = ext.lower().lstrip('.')

    if ext not in ['docx', 'doc']:
        raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")

    # Conversion endpoint.
    endpoint = 'http://120.26.236.97:5005/convert_to_pdf'

    # The converted file is stored in the same directory as the input.
    output_dir = os.path.dirname(file_path)
    output_path = os.path.join(output_dir, f"{name}.pdf")

    # Upload the file; it must stay open for the duration of the POST.
    with open(file_path, 'rb') as f:
        files = {'file': (base_name, f)}
        try:
            print(f"正在将 {base_name} 转换为 .pdf 格式...")
            # NOTE(review): no timeout is passed, so a stalled service blocks
            # the caller indefinitely - consider adding one.
            response = requests.post(endpoint, files=files)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
        except requests.RequestException as e:
            print(f"转换过程中发生错误: {e}")
            return None

    # Persist the converted document.
    with open(output_path, 'wb') as out_file:
        out_file.write(response.content)

    print(f"文件已成功转换并保存至: {output_path}")
    # Callers assign the result (``pdf_path = docx2pdf(...)``), so the saved
    # path has to be returned instead of falling off the end with None.
    return output_path
|
||||||
|
|
||||||
|
|
||||||
|
def doc2docx(file_path):
    """
    Convert a local .doc file to .docx through the remote conversion service.

    The file is uploaded to the ``convert_to_docx`` endpoint and the response
    body is written next to the input file as ``<name>.docx``.

    Args:
        file_path: str, path of the local file; the extension must be .doc
            (compared case-insensitively).

    Returns:
        The path of the written .docx file, or None when the HTTP request
        fails (the error is printed rather than raised).

    Raises:
        FileNotFoundError: if ``file_path`` is not an existing file.
        ValueError: if the extension is not .doc.
    """
    # Fail fast when the input file does not exist.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"文件未找到: {file_path}")

    # Split the file name into stem and normalised (lower-case, dot-less) extension.
    base_name = os.path.basename(file_path)
    name, ext = os.path.splitext(base_name)
    ext = ext.lower().lstrip('.')

    if ext != 'doc':
        raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")

    # Conversion endpoint.
    endpoint = 'http://120.26.236.97:5005/convert_to_docx'

    # The converted file is stored in the same directory as the input.
    output_dir = os.path.dirname(file_path)
    output_path = os.path.join(output_dir, f"{name}.docx")

    # Upload the file; it must stay open for the duration of the POST.
    with open(file_path, 'rb') as f:
        files = {'file': (base_name, f)}
        try:
            print(f"正在将 {base_name} 转换为 .docx 格式...")
            # NOTE(review): no timeout is passed, so a stalled service blocks
            # the caller indefinitely - consider adding one.
            response = requests.post(endpoint, files=files)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
        except requests.RequestException as e:
            print(f"转换过程中发生错误: {e}")
            return None

    # Persist the converted document.
    with open(output_path, 'wb') as out_file:
        out_file.write(response.content)

    print(f"文件已成功转换并保存至: {output_path}")
    # Callers assign the result (``docx_path = doc2docx(...)``), so the saved
    # path has to be returned instead of falling off the end with None.
    return output_path
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# 替换为你的文件路径和API URL
|
# 替换为你的文件路径和API URL
|
||||||
local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\6.2定版视频会议磋商文件.doc"
|
local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
|
||||||
downloaded_file=doc2docx(local_path_in)
|
# downloaded_file=doc2docx(local_path_in)
|
||||||
# downloaded_file=docx2pdf(local_path_in)
|
downloaded_file=pdf2docx(local_path_in)
|
||||||
print(downloaded_file)
|
print(downloaded_file)
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ from flask_app.general.json_utils import clean_json_string
|
|||||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||||
from flask_app.general.通义千问long import upload_file
|
from flask_app.general.通义千问long import upload_file
|
||||||
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
|
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
|
||||||
from flask_app.货物标.货物标截取pdf import truncate_pdf_specific_goods
|
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
|
||||||
from flask_app.main.截取pdf import truncate_pdf_specific_engineering
|
from flask_app.main.截取pdf import truncate_pdf_specific_engineering
|
||||||
from flask_app.general.post_processing import inner_post_processing
|
from flask_app.general.post_processing import inner_post_processing
|
||||||
from flask_app.old_version.基础信息整合 import aggregate_basic_info_engineering
|
from flask_app.old_version.基础信息整合 import aggregate_basic_info_engineering
|
||||||
|
@ -56,3 +56,17 @@ def merge_and_cleanup(output_pdf_path, suffix_to_merge):
|
|||||||
merge_pdfs(paths, output_pdf_path)
|
merge_pdfs(paths, output_pdf_path)
|
||||||
os.remove(another_file_path)
|
os.remove(another_file_path)
|
||||||
print(f"文件 {another_file_path} 已删除。")
|
print(f"文件 {another_file_path} 已删除。")
|
||||||
|
def find_and_merge(target_path, output_suffix):
    """
    Merge ``target_path`` with a sibling file whose name ends in ``output_suffix``.

    The directory containing ``target_path`` is scanned; when several entries
    end with ``output_suffix`` the last one yielded by ``os.listdir`` is used.
    The selected inputs are handed to ``merge_pdfs`` with ``target_path`` as
    the output (presumably overwriting it - confirm against ``merge_pdfs``).

    Args:
        target_path: path of the primary PDF, also used as the merge output.
        output_suffix: file-name suffix identifying the companion file.
    """
    folder = os.path.dirname(target_path)

    # Collect every entry carrying the suffix; only the last one is kept.
    candidates = [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if entry.endswith(output_suffix)
    ]
    companion = candidates[-1] if candidates else ""

    # Without a companion the target is passed to merge_pdfs on its own.
    inputs = [target_path, companion] if companion else [target_path]
    merge_pdfs(inputs, target_path)
|
@ -13,7 +13,7 @@ import concurrent.futures
|
|||||||
from flask_app.main.基础信息整合快速版 import combine_basic_info
|
from flask_app.main.基础信息整合快速版 import combine_basic_info
|
||||||
from flask_app.main.资格审查模块 import combine_review_standards
|
from flask_app.main.资格审查模块 import combine_review_standards
|
||||||
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
|
from flask_app.main.商务标技术标整合 import combine_evaluation_standards
|
||||||
from flask_app.general.format_change import pdf2docx, docx2pdf
|
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
|
||||||
from flask_app.general.docx截取docx import copy_docx
|
from flask_app.general.docx截取docx import copy_docx
|
||||||
|
|
||||||
def get_global_logger(unique_id):
|
def get_global_logger(unique_id):
|
||||||
@ -37,6 +37,9 @@ def preprocess_files(output_folder, downloaded_file_path, file_type):
|
|||||||
elif file_type == 2: # pdf
|
elif file_type == 2: # pdf
|
||||||
pdf_path = downloaded_file_path
|
pdf_path = downloaded_file_path
|
||||||
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
||||||
|
elif file_type == 3: #doc
|
||||||
|
pdf_path=docx2pdf(downloaded_file_path)
|
||||||
|
docx_path=doc2docx(downloaded_file_path)
|
||||||
else:
|
else:
|
||||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||||
return None
|
return None
|
||||||
|
@ -464,7 +464,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
|
|||||||
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
# TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。
|
||||||
#TODO:zbtest8 zbtest18有问题 后期需要完善,截取需要截两次,第一次严格第二次宽松
|
#TODO:zbtest8 zbtest18有问题 后期需要完善,截取需要截两次,第一次严格第二次宽松
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf" # 可以是单个PDF文件路径或文件夹路径
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\e378e002-45ff-440d-a65a-9974b5015472\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\e378e002-45ff-440d-a65a-9974b5015472\\ztbfile.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
|
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
|
||||||
|
@ -1,76 +1,29 @@
|
|||||||
# -*- encoding:utf-8 -*-
|
|
||||||
import re
|
import re
|
||||||
import json
|
|
||||||
|
|
||||||
def preprocess_data(data):
|
# 定义修改后的正则表达式
|
||||||
# 根据需要预处理数据,这里假设原始数据已经是一个字典
|
pattern = re.compile(
|
||||||
return data
|
r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]).*$|'
|
||||||
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
|
|
||||||
def transform_json(data):
|
# 测试用例文本
|
||||||
result = {}
|
text = """附表 :审查审查表(注:本表不需要投标人填写)
|
||||||
temp = {0: result} # 初始化根字典
|
技术评审表
|
||||||
data = preprocess_data(data)
|
序号 评审项目 详细评资格审内容 分值 投标人A 投标人B 投标人C
|
||||||
|
1 技术方案的总体评价 对供应商技术响应方案是否全面、完整、清晰、合理性等进行评价:
|
||||||
|
对供应商技术响应方案全面、完整、清晰、合理,得3分;
|
||||||
|
对供应商技术响应方案比较全面、完整、清晰、合理,得1分;
|
||||||
|
对供应商技术响应方案不全面、完整、清晰、合理,得0.5分;
|
||||||
|
没有提供,不得分。
|
||||||
|
"""
|
||||||
|
|
||||||
# 首先, 创建一个临时字典用于检查是否存在三级标题
|
# 匹配测试
|
||||||
has_subkey = {}
|
matches = pattern.findall(text)
|
||||||
for key in data.keys():
|
|
||||||
parts = key.split('.')
|
|
||||||
if len(parts) > 2 and parts[1]:
|
|
||||||
parent_key = '.'.join(parts[:2])
|
|
||||||
has_subkey[parent_key] = True
|
|
||||||
|
|
||||||
for key, value in data.items():
|
if matches:
|
||||||
match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', key)
|
print("匹配成功!")
|
||||||
if match:
|
for match in matches:
|
||||||
levels = [int(l) for l in match.groups() if l is not None]
|
print("匹配内容:", match)
|
||||||
current_level = len(levels)
|
else:
|
||||||
parent = temp.get(current_level - 1, result)
|
print("匹配失败!")
|
||||||
|
|
||||||
# 获取标题内容
|
|
||||||
if current_level == 1:
|
|
||||||
# 一级标题
|
|
||||||
parent[value] = {}
|
|
||||||
temp[current_level] = parent[value]
|
|
||||||
elif current_level == 2:
|
|
||||||
# 二级标题
|
|
||||||
if has_subkey.get(key, False):
|
|
||||||
parent[value] = {}
|
|
||||||
else:
|
|
||||||
parent[value] = value # 如果没有子键,直接赋值内容
|
|
||||||
temp[current_level] = parent[value]
|
|
||||||
elif current_level == 3:
|
|
||||||
# 三级标题
|
|
||||||
if isinstance(parent, dict):
|
|
||||||
# 确保父级是一个字典
|
|
||||||
parent[value] = value
|
|
||||||
elif isinstance(parent, list):
|
|
||||||
parent.append(value)
|
|
||||||
temp[current_level] = value
|
|
||||||
else:
|
|
||||||
# 处理无法匹配的键
|
|
||||||
print(f"无法匹配的键: {key}")
|
|
||||||
|
|
||||||
def remove_single_item_lists(node):
|
|
||||||
if isinstance(node, dict):
|
|
||||||
for key in list(node.keys()):
|
|
||||||
node[key] = remove_single_item_lists(node[key])
|
|
||||||
if isinstance(node[key], list) and len(node[key]) == 1:
|
|
||||||
node[key] = node[key][0]
|
|
||||||
return node
|
|
||||||
|
|
||||||
return remove_single_item_lists(result)
|
|
||||||
|
|
||||||
# 示例数据
|
|
||||||
data = {
|
|
||||||
"20.": "投标文件的式样和签署",
|
|
||||||
"20.1": "投标文件的式样:投标人应准备一份投标文件正本、电子文件和投标须知前附表中规定数目的副本,投标文件的副本可采用正本的复印件。每套投标文件须清楚地标明“正本”或“副本”。若副本与正本不符,以正本为准,装订如下:序号投标文件名称装订备注1唱标信封/投标报价一览表(加盖公章)、《投标保证金汇入情况说明》(加盖公章)、投标保证金缴付凭证复印件(加盖公章)2电子文件含价格文件、商务技术文件3价格文件独立装订成册含正、副本4商务技术文件商务文件和技术文件可一起独立装订成册,也可含正、副本以分开独立装订成册",
|
|
||||||
"20.2": "投标文件的装订要求:投标报价表部分必须单独装订,若将投标报价表与商务、技术部分等内容合为装订或在商务、技术部分等内容出现投标报价,则作无效投标处理。",
|
|
||||||
"20.3": "电子文件用MSWORD/EXCEL简体中文版制作,内容包括:由投标人自行制作的与正本文件一致的所有文件。电子文件由CD-R光盘或优盘储存,放在唱标信封内。",
|
|
||||||
"20.4": "投标文件的签署:",
|
|
||||||
"20.4.1": "投标文件的正本需打印或用不褪色墨水书写,副本可以复印。授权代表须将以书面形式出具的《法定代表人授权委托书》附在投标文件中。投标人的投标文件必须按照招标文件“第六部分投标文件格式”已明示法定代表人(或其授权代表)签署和盖章处,必须进行签字(或盖私章)和盖公章,否则视为无效投标。",
|
|
||||||
"20.4.2": "投标文件中的任何重要的插字、涂改和增删,必须由法定代表人或经其正式授权的代表在旁边签署才有效。",
|
|
||||||
"20.4.3": "投标文件的“正本”及所有“副本”的封面及每一页都必须由投标人加盖公章或骑缝章或法定代表人(或其授权代表)签署,对于未按要求盖章或签署的投标文件,均作无效投标处理。投标文件的“正本”及所有“副本”的封面及每一页都必须由投标人加盖公章或骑缝章或法定代表人(或其授权代表)签署,对于未按要求盖章或签署的投标文件,均作无效投标处理。"
|
|
||||||
}
|
|
||||||
|
|
||||||
transformed = transform_json(data)
|
|
||||||
print(json.dumps(transformed, ensure_ascii=False, indent=4))
|
|
||||||
|
@ -4,7 +4,7 @@ from PyPDF2 import PdfReader
|
|||||||
from flask_app.general.json_utils import combine_json_results
|
from flask_app.general.json_utils import combine_json_results
|
||||||
from flask_app.general.多线程提问 import multi_threading
|
from flask_app.general.多线程提问 import multi_threading
|
||||||
from flask_app.general.通义千问long import upload_file
|
from flask_app.general.通义千问long import upload_file
|
||||||
from flask_app.货物标.货物标截取pdf import extract_common_header, clean_page_content
|
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
|
||||||
|
|
||||||
|
|
||||||
#正则表达式判断原文中是否有商务、服务、其他要求
|
#正则表达式判断原文中是否有商务、服务、其他要求
|
||||||
|
@ -202,7 +202,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||||
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
|
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
|
||||||
elif output_suffix == "qualification1":
|
elif output_suffix == "qualification1":
|
||||||
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3")
|
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") #合并'资格审查'章节和'评标办法'章节
|
||||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing {pdf_path}: {e}")
|
print(f"Error processing {pdf_path}: {e}")
|
||||||
@ -240,7 +240,17 @@ def get_patterns_for_qualification():
|
|||||||
r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||||
|
|
||||||
return begin_pattern_new, end_pattern_new
|
return begin_pattern_new, end_pattern_new
|
||||||
|
def get_patterns_for_qualification2():
    """
    Build the alternative begin/end regex pair for the qualification section.

    Returns:
        tuple: ``(begin_pattern, end_pattern)``, both compiled with
        ``re.MULTILINE``.

        * begin_pattern matches a line that starts with 附录/附件/附表,
          optionally followed by 一 or 1, then a colon, and that contains
          资格 or 符合性 later on the same line.
        * end_pattern matches either a 附录/附件/附表 heading line (with a
          colon) that does NOT mention 资格 or 符合, or a chapter heading of
          the form 第<Chinese numeral>章 / 部分 followed by CJK characters.
    """
    multiline = re.MULTILINE

    # Appendix/attachment/table heading that names the qualification review.
    start_regex = re.compile(
        r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]).*(?:资格|符合性).*$',
        multiline,
    )

    # Next unrelated appendix heading, or the next chapter/part heading.
    stop_regex = re.compile(
        r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]).*$|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        multiline,
    )

    return start_regex, stop_regex
|
||||||
|
|
||||||
def get_patterns_for_notice():
|
def get_patterns_for_notice():
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
@ -292,7 +302,13 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, beg
|
|||||||
flags = re.MULTILINE if use_multiline else 0
|
flags = re.MULTILINE if use_multiline else 0
|
||||||
if re.search(mid_pattern, cleaned_text, flags):
|
if re.search(mid_pattern, cleaned_text, flags):
|
||||||
mid_page = i
|
mid_page = i
|
||||||
if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
|
if start_page is not None and re.search(end_pattern, cleaned_text):
|
||||||
|
if mid_page is None:
|
||||||
|
if i > start_page:
|
||||||
|
end_page = i
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
if i > mid_page:
|
||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
return start_page, mid_page, end_page
|
return start_page, mid_page, end_page
|
||||||
@ -376,7 +392,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
|||||||
patterns = [get_patterns_for_evaluation_method()]
|
patterns = [get_patterns_for_evaluation_method()]
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
elif output_suffix == "qualification1":
|
elif output_suffix == "qualification1":
|
||||||
patterns = [get_patterns_for_qualification()]
|
patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()]
|
||||||
begin_page = 5
|
begin_page = 5
|
||||||
elif output_suffix == "notice":
|
elif output_suffix == "notice":
|
||||||
patterns = [get_patterns_for_notice()]
|
patterns = [get_patterns_for_notice()]
|
||||||
@ -384,6 +400,8 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
|||||||
|
|
||||||
if patterns:
|
if patterns:
|
||||||
for pattern_pair in patterns:
|
for pattern_pair in patterns:
|
||||||
|
# print(pattern_pair[0])
|
||||||
|
# print(pattern_pair[1])
|
||||||
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
|
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
|
||||||
common_header,
|
common_header,
|
||||||
exclusion_pattern, output_suffix)
|
exclusion_pattern, output_suffix)
|
||||||
@ -635,17 +653,18 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
|
|||||||
return truncate_files
|
return truncate_files
|
||||||
|
|
||||||
|
|
||||||
# TODO:交通智能系统和招标(1)(1)文件有问题 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标
|
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1).pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(广水市教育局封闭管理项目二次).pdf"
|
||||||
|
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1).pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
||||||
files = truncate_pdf_multiple(input_path, output_folder)
|
# files = truncate_pdf_multiple(input_path, output_folder)
|
||||||
# files=truncate_pdf_specific_goods(input_path,output_folder)
|
# files=truncate_pdf_specific_goods(input_path,output_folder)
|
||||||
print(files)
|
# print(files)
|
||||||
# selection = 4# 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-公告
|
selection = 4# 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-公告
|
||||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
# print(generated_files)
|
print(generated_files)
|
@ -5,7 +5,7 @@ import os
|
|||||||
from flask_app.general.多线程提问 import multi_threading
|
from flask_app.general.多线程提问 import multi_threading
|
||||||
from flask_app.general.通义千问long import qianwen_long, upload_file
|
from flask_app.general.通义千问long import qianwen_long, upload_file
|
||||||
from flask_app.general.json_utils import clean_json_string, combine_json_results
|
from flask_app.general.json_utils import clean_json_string, combine_json_results
|
||||||
from flask_app.货物标.货物标截取pdf import truncate_pdf_main
|
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
|
||||||
|
|
||||||
def generate_key_paths(data, parent_key=''):
|
def generate_key_paths(data, parent_key=''):
|
||||||
key_paths = []
|
key_paths = []
|
||||||
|
@ -5,7 +5,7 @@ import os
|
|||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
from flask_app.货物标.货物标截取pdf import clean_page_content,extract_common_header
|
from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header
|
||||||
|
|
||||||
def extract_text_from_docx(file_path):
|
def extract_text_from_docx(file_path):
|
||||||
doc = docx.Document(file_path)
|
doc = docx.Document(file_path)
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
# 竞磋 竞谈 磋商 询价 邀请 单一来源
|
# 竞磋 竞谈 磋商 询价 邀请 单一来源
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from flask_app.general.format_change import docx2pdf, pdf2docx
|
from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx
|
||||||
from flask_app.general.json_utils import transform_json_values
|
from flask_app.general.json_utils import transform_json_values
|
||||||
from flask_app.货物标.基础信息解析main import combine_basic_info
|
from flask_app.货物标.基础信息解析main import combine_basic_info
|
||||||
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
|
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
|
||||||
from flask_app.货物标.货物标截取pdf import truncate_pdf_multiple
|
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
|
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
|
||||||
@ -28,7 +28,7 @@ logger = None
|
|||||||
executor = ThreadPoolExecutor()
|
executor = ThreadPoolExecutor()
|
||||||
|
|
||||||
|
|
||||||
def preprocess_files(output_folder, file_path, file_type, unique_id):
|
def preprocess_files(output_folder, file_path, file_type):
|
||||||
logger.info("starting 文件预处理...")
|
logger.info("starting 文件预处理...")
|
||||||
logger.info("output_folder..." + output_folder)
|
logger.info("output_folder..." + output_folder)
|
||||||
|
|
||||||
@ -41,7 +41,7 @@ def preprocess_files(output_folder, file_path, file_type, unique_id):
|
|||||||
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
||||||
elif file_type == 3: # doc
|
elif file_type == 3: # doc
|
||||||
pdf_path = docx2pdf(file_path)
|
pdf_path = docx2pdf(file_path)
|
||||||
docx_path = pdf2docx(pdf_path)
|
docx_path = doc2docx(file_path)
|
||||||
else:
|
else:
|
||||||
logger.error("Unsupported file type provided. Preprocessing halted.")
|
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||||
return None
|
return None
|
||||||
@ -135,7 +135,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
|||||||
logger = get_global_logger(unique_id)
|
logger = get_global_logger(unique_id)
|
||||||
|
|
||||||
# 预处理文件,获取处理后的数据
|
# 预处理文件,获取处理后的数据
|
||||||
processed_data = preprocess_files(output_folder, file_path, file_type, unique_id)
|
processed_data = preprocess_files(output_folder, file_path, file_type)
|
||||||
if not processed_data:
|
if not processed_data:
|
||||||
yield json.dumps({}) # 如果处理数据失败,返回空的 JSON
|
yield json.dumps({}) # 如果处理数据失败,返回空的 JSON
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user