10.25切分改为多线程
This commit is contained in:
parent
772aaa17f0
commit
d692f06125
@ -2,6 +2,7 @@ from PyPDF2 import PdfReader, PdfWriter
|
|||||||
import re # 导入正则表达式库
|
import re # 导入正则表达式库
|
||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
from flask_app.general.merge_pdfs import merge_pdfs
|
from flask_app.general.merge_pdfs import merge_pdfs
|
||||||
|
import concurrent.futures
|
||||||
def clean_page_content(text, common_header):
|
def clean_page_content(text, common_header):
|
||||||
# 首先删除抬头公共部分
|
# 首先删除抬头公共部分
|
||||||
if common_header: # 确保有公共抬头才进行替换
|
if common_header: # 确保有公共抬头才进行替换
|
||||||
@ -390,15 +391,42 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
|
|||||||
print(f"合并 PDF 文件时出错: {e}")
|
print(f"合并 PDF 文件时出错: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def truncate_pdf_multiple(input_path, output_folder):
|
def truncate_pdf_multiple(input_path, output_folder):
|
||||||
|
"""
|
||||||
|
处理 PDF 文件,选择 selection 1-6 的部分,并合并结果。
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_path (str): 要处理的 PDF 文件路径。
|
||||||
|
output_folder (str): 截取后的文件保存文件夹路径。
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
||||||
|
"""
|
||||||
base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名
|
base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
for selection in range(1, 7):
|
selections = range(1, 7) # 选择 1 到 6
|
||||||
files = truncate_pdf_main(input_path, output_folder, selection)
|
|
||||||
if files:
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
truncate_files.extend(files)
|
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||||
else:
|
# 提交所有任务并获取未来对象
|
||||||
truncate_files.append("") # 截取失败时添加空字符串
|
future_to_selection = {executor.submit(truncate_pdf_main, input_path, output_folder, selection): selection for
|
||||||
|
selection in selections}
|
||||||
|
|
||||||
|
# 逐个获取完成的任务
|
||||||
|
for future in concurrent.futures.as_completed(future_to_selection):
|
||||||
|
selection = future_to_selection[future]
|
||||||
|
try:
|
||||||
|
files = future.result()
|
||||||
|
if files and any(f for f in files):
|
||||||
|
# 过滤空字符串
|
||||||
|
valid_files = [f for f in files if f]
|
||||||
|
truncate_files.extend(valid_files)
|
||||||
|
else:
|
||||||
|
truncate_files.append("") # 截取失败时添加空字符串
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} generated an exception: {e}")
|
||||||
|
truncate_files.append("") # 发生异常时添加空字符串
|
||||||
|
|
||||||
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||||
@ -414,14 +442,16 @@ def truncate_pdf_multiple(input_path, output_folder):
|
|||||||
print(f"没有文件需要合并 for {input_path}")
|
print(f"没有文件需要合并 for {input_path}")
|
||||||
|
|
||||||
return truncate_files
|
return truncate_files
|
||||||
|
|
||||||
|
|
||||||
def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
|
def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
|
||||||
"""
|
"""
|
||||||
处理 PDF 文件,选择 selection 为指定部分,并合并结果。
|
处理 PDF 文件,选择指定的 selections,并合并结果。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pdf_path (str): 要处理的 PDF 文件路径。
|
pdf_path (str): 要处理的 PDF 文件路径。
|
||||||
output_folder (str): 截取后的文件保存文件夹路径。
|
output_folder (str): 截取后的文件保存文件夹路径。
|
||||||
selections (list): 需要截取的部分
|
selections (list): 需要截取的部分(例如 [4, 5])。
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
||||||
@ -430,19 +460,30 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
|
|||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
|
|
||||||
for selection in selections:
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
files = truncate_pdf_main(pdf_path, output_folder, selection)
|
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||||
if files:
|
# 提交所有任务并获取未来对象
|
||||||
if isinstance(files, list):
|
future_to_selection = {executor.submit(truncate_pdf_main, pdf_path, output_folder, selection): selection for
|
||||||
truncate_files.extend(files)
|
selection in selections}
|
||||||
elif isinstance(files, str):
|
|
||||||
truncate_files.append(files)
|
# 逐个获取完成的任务
|
||||||
else:
|
for future in concurrent.futures.as_completed(future_to_selection):
|
||||||
truncate_files.append("") # 截取失败时添加空字符串
|
selection = future_to_selection[future]
|
||||||
print(f"截取 selection {selection} 失败,已添加空字符串。")
|
try:
|
||||||
|
files = future.result()
|
||||||
|
if files:
|
||||||
|
# 过滤空字符串
|
||||||
|
valid_files = [f for f in files if f]
|
||||||
|
truncate_files.extend(valid_files)
|
||||||
|
else:
|
||||||
|
truncate_files.append("") # 截取失败时添加空字符串
|
||||||
|
print(f"截取 selection {selection} 失败,已添加空字符串。")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} generated an exception: {e}")
|
||||||
|
truncate_files.append("") # 发生异常时添加空字符串
|
||||||
|
|
||||||
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
|
||||||
merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
|
merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
|
||||||
if merged_result:
|
if merged_result:
|
||||||
truncate_files.append(merged_result)
|
truncate_files.append(merged_result)
|
||||||
@ -469,10 +510,10 @@ if __name__ == "__main__":
|
|||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
|
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
|
||||||
# files=truncate_pdf_multiple(input_path,output_folder)
|
# files=truncate_pdf_multiple(input_path,output_folder)
|
||||||
# selections = [5, 1] # 仅处理 selection 5、1 和 3
|
selections = [5, 1] # 仅处理 selection 5、1 和 3
|
||||||
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||||
# print(files)
|
print(files)
|
||||||
selection = 6 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
# selection = 6 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
print(generated_files)
|
# print(generated_files)
|
||||||
# print("生成的文件:", generated_files)
|
# print("生成的文件:", generated_files)
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# -*- encoding:utf-8 -*-
|
||||||
import json
|
import json
|
||||||
from flask_app.general.json_utils import clean_json_string
|
from flask_app.general.json_utils import clean_json_string
|
||||||
from flask_app.general.多线程提问 import multi_threading,read_questions_from_file
|
from flask_app.general.多线程提问 import multi_threading,read_questions_from_file
|
||||||
@ -10,31 +11,38 @@ with open('test.json', 'r', encoding='utf-8') as f:
|
|||||||
# 定义目标名称列表
|
# 定义目标名称列表
|
||||||
target_names = [
|
target_names = [
|
||||||
"营业执照",
|
"营业执照",
|
||||||
"开户信息",
|
# "开户信息",
|
||||||
"法定代表人身份证",
|
"法定代表人身份证",
|
||||||
"法定代表人授权人身份证",
|
# "法定代表人授权人身份证",
|
||||||
# "人员证书",
|
"人员证书",
|
||||||
# "人员社保资料",
|
"人员社保资料",
|
||||||
# "劳动合同",
|
# "劳动合同",
|
||||||
# "企业证书",
|
"企业证书",
|
||||||
# "企业业绩",
|
"企业业绩",
|
||||||
# "财务信息(财务审计报告)",
|
"财务信息(财务审计报告)",
|
||||||
# "财务信息(缴纳税收证明)",
|
"财务信息(缴纳税收证明)",
|
||||||
# "财务信息(缴纳社保证明)"
|
"财务信息(缴纳社保证明)"
|
||||||
]
|
]
|
||||||
|
|
||||||
# 定义user_query模板
|
# 定义user_query模板
|
||||||
def generate_user_query(target, chapters, keywords):
|
def generate_user_query(target, chapters, keywords):
|
||||||
template = f"""该文件为投标文件格式要求的部分,请你根据该招标文件回答:{target}应该附在该文件哪个地方?你可能需要查找以下章节(若有):{', '.join([f"'{chapter}'" for chapter in chapters])};或者查找以下关键字出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},你需要确定相关信息所在章节。
|
template = f"""这是投标文件格式要求的部分,以序号和标题作为投标方,我需要把不同的资格证明材料填充到指定区域,请你根据该文件回答:{target}应该附在该文件哪个地方?以下是可能匹配的章节名称:{', '.join([f"'{chapter}'" for chapter in chapters])};或者可能匹配的关键字:{', '.join([f"'{kw}'" for kw in keywords])},你需要根据以上规则确定相关信息所在章节或小节,章节名格式通常是如'三、联合体协议书'这样的序号+标题。现在我需要将{target}贴在该章节的最后面,但是在下一章之前,目前我需要定位到插入的位置,请你返回给我插入位置的上下文,上文是该章节末尾的内容,下文应该是下一章的章节名或开头内容,上下文应该是连续的,字数限制在30字以内,以json格式返回,键名分别是'上文','下文',上下文格式内容应完全与原文保持一致,不得擅自删减总结,输出格式示例如下:
|
||||||
我需要将{target}贴在该章节的最后面,但是在下一章(别的内容)之前,目前我需要定位到插入的位置,请你返回给我插入位置的上下文,其中下文应该是下一章的章节名或开头内容,字数限制在30字以内,以json格式返回,键名分别是'上文','下文',上下文格式内容应完全与原文保持一致,不得擅自删减总结,示例输出如下:
|
|
||||||
{{
|
{{
|
||||||
"上文":"投标人: (盖单位章)
|
"上文":"上文相关内容
|
||||||
年 月 日",
|
测试",
|
||||||
"下文":"四、投标保证金
|
"下文":"四、下文章节名
|
||||||
(招标人名称):"
|
(招标人名称):测试"
|
||||||
}}
|
}}
|
||||||
"""
|
"""
|
||||||
return template
|
template2=f"""该文件为投标文件格式要求,请你根据该招标文件回答:{target}应该附在哪个地方?你可能需要查找以下章节出现的地方:{', '.join([f"'{chapter}'" for chapter in chapters])};或者可能匹配的关键字:{', '.join([f"'{kw}'" for kw in keywords])},并确定所在章节。我需要将{target}贴在该章节的最后面,目前我需要定位到插入的位置,请你返回给我插入位置的上下文,下文应该是下一章的章节名或开头内容,字数限制在30字以内,以json格式返回,键名分别是'上文','下文',上下文格式内容应完全与原文保持一致,不得擅自删减总结,示例输出如下:
|
||||||
|
{{
|
||||||
|
"上文":"上文相关内容
|
||||||
|
测试",
|
||||||
|
"下文":"四、下文章节名
|
||||||
|
(招标人名称):测试"
|
||||||
|
}}
|
||||||
|
"""
|
||||||
|
return template2
|
||||||
|
|
||||||
# 生成user_query_list
|
# 生成user_query_list
|
||||||
user_query_list = []
|
user_query_list = []
|
||||||
|
@ -98,95 +98,147 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
|
|||||||
return "" # 返回空字符串
|
return "" # 返回空字符串
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
|
def extract_pages_generic(pdf_document, begin_pattern, begin_page, common_header, total_pages):
|
||||||
"""
|
"""
|
||||||
通用函数,根据模式提取起始页和结束页。
|
通用函数,根据开始模式提取起始页,并将结束页设为文件末尾。
|
||||||
|
|
||||||
|
参数:
|
||||||
|
- pdf_document: PdfReader 对象。
|
||||||
|
- begin_pattern: 用于匹配起始页的正则模式。
|
||||||
|
- begin_page: 起始页的页面索引下限。
|
||||||
|
- common_header: 公共页头,用于清理页面内容。
|
||||||
|
- total_pages: PDF文档的总页数(索引从0开始)。
|
||||||
|
|
||||||
|
返回:
|
||||||
|
- start_page: 匹配到的起始页的页面索引。
|
||||||
|
- end_page: 结束页的页面索引(设为文件末尾)。
|
||||||
"""
|
"""
|
||||||
start_page = None
|
start_page = None
|
||||||
end_page = None
|
end_page = total_pages # 结束页设为文件末尾
|
||||||
for i, page in enumerate(pdf_document.pages):
|
|
||||||
|
for i in range(total_pages, -1, -1):
|
||||||
|
page = pdf_document.pages[i]
|
||||||
text = page.extract_text() or ""
|
text = page.extract_text() or ""
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
# print(cleaned_text)
|
# print(cleaned_text) # 可根据需要取消注释以调试
|
||||||
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
|
|
||||||
|
if re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||||
start_page = i
|
start_page = i
|
||||||
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
|
print(f"找到起始页: 页码 {i + 1}") # 页码通常从1开始
|
||||||
end_page = i
|
break # 找到起始页后退出循环
|
||||||
break
|
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
||||||
pdf_document = PdfReader(pdf_path)
|
|
||||||
total_pages = len(pdf_document.pages) - 1 # 获取总页数
|
|
||||||
begin_page = 10
|
|
||||||
start_page = None
|
|
||||||
end_page = None
|
|
||||||
begin_patterns = [
|
|
||||||
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', re.MULTILINE),
|
|
||||||
re.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*")
|
|
||||||
]
|
|
||||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|目录|评标办法|技术标准|其他材料|合同')
|
|
||||||
end_pattern = re.compile(
|
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
|
||||||
for begin_pattern in begin_patterns:
|
|
||||||
for i, page in enumerate(pdf_document.pages):
|
|
||||||
text = page.extract_text() or ""
|
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
|
||||||
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
|
|
||||||
continue
|
|
||||||
if re.search(begin_pattern, cleaned_text) and i > begin_page:
|
|
||||||
start_page = i
|
|
||||||
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
|
|
||||||
end_page = i
|
|
||||||
break
|
|
||||||
if start_page is not None:
|
|
||||||
break
|
|
||||||
if start_page is None:
|
|
||||||
print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}")
|
|
||||||
return ""
|
|
||||||
if end_page is None:
|
|
||||||
# 如果未匹配到结束页,默认截取到文件末尾
|
|
||||||
end_page = total_pages
|
|
||||||
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")
|
|
||||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
|
||||||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
|
||||||
"""
|
"""
|
||||||
根据开始和结束模式提取PDF页面。
|
二次提取PDF页面,仅根据开始模式从后往前提取。
|
||||||
|
|
||||||
|
参数:
|
||||||
|
- pdf_path: PDF文件的路径。
|
||||||
|
- output_folder: 提取后页面的输出文件夹。
|
||||||
|
- output_suffix: 输出文件的后缀名。
|
||||||
|
- common_header: 公共页头,用于清理页面内容。
|
||||||
|
|
||||||
|
返回:
|
||||||
|
- 提取的页面保存结果,或空字符串表示失败。
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
pdf_document = PdfReader(pdf_path)
|
||||||
|
total_pages = len(pdf_document.pages) - 1 # 获取总页数(索引从0开始)
|
||||||
|
begin_page = 10 # 起始页的页面索引下限
|
||||||
|
|
||||||
|
start_page = None
|
||||||
|
end_page = total_pages # 结束页设为文件末尾
|
||||||
|
|
||||||
|
# 定义开始模式列表
|
||||||
|
begin_patterns = [
|
||||||
|
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', re.MULTILINE),
|
||||||
|
re.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*")
|
||||||
|
]
|
||||||
|
|
||||||
|
# 定义排除模式
|
||||||
|
exclusion_pattern = re.compile(r'文件的构成|文件的组成|目录|评标办法|技术标准|其他材料|合同')
|
||||||
|
|
||||||
|
# 遍历每个开始模式
|
||||||
|
for begin_pattern in begin_patterns:
|
||||||
|
# 从后往前遍历页面
|
||||||
|
for i in range(total_pages, -1, -1):
|
||||||
|
page = pdf_document.pages[i]
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
|
|
||||||
|
# 如果页面内容匹配排除模式,则跳过
|
||||||
|
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 检查是否匹配开始模式,并且页面索引大于 begin_page
|
||||||
|
if re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||||
|
start_page = i
|
||||||
|
print(f"二次提取找到起始页: 页码 {i + 1}") # 页码通常从1开始
|
||||||
|
break # 找到起始页后退出循环
|
||||||
|
|
||||||
|
if start_page is not None:
|
||||||
|
break # 如果找到起始页,退出外层循环
|
||||||
|
|
||||||
|
if start_page is None:
|
||||||
|
print(f"{output_suffix}: 未找到起始页,二次提取失败! 文件: {pdf_path}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# end_page 已经设为文件末尾
|
||||||
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error processing {pdf_path} in extract_pages_twice: {e}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, output_suffix):
|
||||||
|
"""
|
||||||
|
根据开始模式从后往前提取PDF页面。
|
||||||
|
|
||||||
|
参数:
|
||||||
|
- pdf_path: PDF文件的路径。
|
||||||
|
- output_folder: 提取后页面的输出文件夹。
|
||||||
|
- begin_pattern: 用于匹配起始页的正则模式。
|
||||||
|
- begin_page: 起始页的页面索引下限。
|
||||||
|
- output_suffix: 输出文件的后缀名。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
common_header = extract_common_header(pdf_path)
|
common_header = extract_common_header(pdf_path)
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
total_pages = len(pdf_document.pages) - 1 # 获取总页数
|
total_pages = len(pdf_document.pages) - 1 # 获取总页数(索引从0开始)
|
||||||
|
|
||||||
# 提取起始页和结束页
|
# 提取起始页,结束页直接设为文件末尾
|
||||||
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, begin_page, common_header,
|
||||||
|
total_pages)
|
||||||
|
|
||||||
if output_suffix == "format":
|
if output_suffix == "format":
|
||||||
if start_page is None:
|
if start_page is None:
|
||||||
print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path} 尝试二次提取!")
|
print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path} 尝试二次提取!")
|
||||||
return extract_pages_twice(pdf_path,output_folder,output_suffix,common_header)
|
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
|
||||||
if end_page is None:
|
# end_page 已经设为文件末尾
|
||||||
# 如果未匹配到结束页,默认截取到文件末尾
|
|
||||||
end_page = total_pages
|
|
||||||
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")
|
|
||||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
|
|
||||||
|
# 如果 output_suffix 不是 "format",同样保存提取的页面
|
||||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing {pdf_path}: {e}")
|
print(f"Error processing {pdf_path}: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def process_files(file_path, output_folder, begin_pattern, begin_page, output_suffix):
|
||||||
"""
|
"""
|
||||||
处理单个文件:转换为PDF(如果需要)并提取页面。
|
处理单个文件:转换为PDF(如果需要)并提取页面。
|
||||||
"""
|
"""
|
||||||
# pdf_path = convert_to_pdf(file_path)
|
# pdf_path = convert_to_pdf(file_path)
|
||||||
pdf_path=file_path
|
pdf_path=file_path
|
||||||
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, output_suffix)
|
||||||
return result or ""
|
return result or ""
|
||||||
|
|
||||||
|
|
||||||
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def process_input(input_path, output_folder, begin_pattern, begin_page,output_suffix):
|
||||||
"""
|
"""
|
||||||
处理输入路径,可以是文件或目录。
|
处理输入路径,可以是文件或目录。
|
||||||
"""
|
"""
|
||||||
@ -198,13 +250,13 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
|||||||
for file_name in os.listdir(input_path):
|
for file_name in os.listdir(input_path):
|
||||||
file_path = os.path.join(input_path, file_name)
|
file_path = os.path.join(input_path, file_name)
|
||||||
if is_pdf_or_doc(file_path):
|
if is_pdf_or_doc(file_path):
|
||||||
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
result = process_files(file_path, output_folder, begin_pattern, begin_page, output_suffix)
|
||||||
if isinstance(result, tuple):
|
if isinstance(result, tuple):
|
||||||
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
|
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
|
||||||
else:
|
else:
|
||||||
generated_files.append(result) # 直接添加result,可能是空字符串
|
generated_files.append(result) # 直接添加result,可能是空字符串
|
||||||
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
|
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
|
||||||
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
result = process_files(input_path, output_folder, begin_pattern, begin_page, output_suffix)
|
||||||
if isinstance(result, tuple):
|
if isinstance(result, tuple):
|
||||||
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
|
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
|
||||||
else:
|
else:
|
||||||
@ -224,16 +276,16 @@ def truncate_pdf_main(input_path, output_folder, selection,begin_page=10):
|
|||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*'
|
||||||
)
|
)
|
||||||
end_pattern = re.compile(
|
# end_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
|
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
|
||||||
)
|
# )
|
||||||
local_output_suffix = "format"
|
local_output_suffix = "format"
|
||||||
else:
|
else:
|
||||||
print("无效的选择:请选择1")
|
print("无效的选择:请选择1")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# 调用相应的处理函数
|
# 调用相应的处理函数
|
||||||
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix) or ""
|
return process_input(input_path, output_folder, begin_pattern, begin_page, local_output_suffix) or ""
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in truncate_pdf_main: {e}")
|
print(f"Error in truncate_pdf_main: {e}")
|
||||||
return "" # 返回空字符串
|
return "" # 返回空字符串
|
||||||
@ -241,8 +293,8 @@ def truncate_pdf_main(input_path, output_folder, selection,begin_page=10):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 定义输入和输出路径
|
# 定义输入和输出路径
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(2).pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest12.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest13.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
||||||
selection = 1 # 1 - 投标文件格式
|
selection = 1 # 1 - 投标文件格式
|
||||||
# 执行截取
|
# 执行截取
|
||||||
|
@ -3,7 +3,7 @@ import re # 导入正则表达式库
|
|||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
from flask_app.general.format_change import docx2pdf
|
from flask_app.general.format_change import docx2pdf
|
||||||
from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs
|
from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs
|
||||||
|
import concurrent.futures
|
||||||
def clean_page_content(text, common_header):
|
def clean_page_content(text, common_header):
|
||||||
# 首先删除抬头公共部分
|
# 首先删除抬头公共部分
|
||||||
if common_header: # 确保有公共抬头才进行替换
|
if common_header: # 确保有公共抬头才进行替换
|
||||||
@ -623,14 +623,29 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
|
|||||||
def truncate_pdf_multiple(pdf_path, output_folder):
|
def truncate_pdf_multiple(pdf_path, output_folder):
|
||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
for selection in range(1, 6):
|
|
||||||
files = truncate_pdf_main(pdf_path, output_folder, selection)
|
# 定义要处理的选择范围
|
||||||
if files:
|
selections = range(1, 6)
|
||||||
truncate_files.extend(files)
|
|
||||||
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
||||||
|
# 提交所有任务并获取未来对象
|
||||||
|
future_to_selection = {executor.submit(truncate_pdf_main, pdf_path, output_folder, selection,output_suffix="default"): selection for
|
||||||
|
selection in selections}
|
||||||
|
|
||||||
|
# 逐个获取完成的任务
|
||||||
|
for future in concurrent.futures.as_completed(future_to_selection):
|
||||||
|
selection = future_to_selection[future]
|
||||||
|
try:
|
||||||
|
files = future.result()
|
||||||
|
if files:
|
||||||
|
truncate_files.extend(files)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} generated an exception: {e}")
|
||||||
|
|
||||||
if truncate_files:
|
if truncate_files:
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||||
merge_selected_pdfs(output_folder, truncate_files, merged_output_path,base_file_name)
|
merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
|
||||||
truncate_files.append(merged_output_path)
|
truncate_files.append(merged_output_path)
|
||||||
print(f"已生成合并文件: {merged_output_path}")
|
print(f"已生成合并文件: {merged_output_path}")
|
||||||
else:
|
else:
|
||||||
@ -639,13 +654,14 @@ def truncate_pdf_multiple(pdf_path, output_folder):
|
|||||||
return truncate_files
|
return truncate_files
|
||||||
|
|
||||||
#小解析,只需要前三章内容
|
#小解析,只需要前三章内容
|
||||||
def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
|
def truncate_pdf_specific_goods(pdf_path, output_folder, selections):
|
||||||
"""
|
"""
|
||||||
处理 PDF 文件,选择 selection 为 4 和 5 的部分,并合并结果。
|
处理 PDF 文件,选择指定的 selections,并合并结果。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pdf_path (str): 要处理的 PDF 文件路径。
|
pdf_path (str): 要处理的 PDF 文件路径。
|
||||||
output_folder (str): 截取后的文件保存文件夹路径。
|
output_folder (str): 截取后的文件保存文件夹路径。
|
||||||
|
selections (iterable): 要处理的 selection 列表。
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
||||||
@ -653,13 +669,24 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
|
|||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
|
|
||||||
for selection in selections:
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
files = truncate_pdf_main(pdf_path, output_folder, selection)
|
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||||
if files:
|
# 提交所有任务并获取未来对象
|
||||||
if isinstance(files, list):
|
future_to_selection = {executor.submit(truncate_pdf_main, pdf_path, output_folder, selection,output_suffix="default"): selection for
|
||||||
truncate_files.extend(files)
|
selection in selections}
|
||||||
elif isinstance(files, str):
|
|
||||||
truncate_files.append(files)
|
# 逐个获取完成的任务
|
||||||
|
for future in concurrent.futures.as_completed(future_to_selection):
|
||||||
|
selection = future_to_selection[future]
|
||||||
|
try:
|
||||||
|
files = future.result()
|
||||||
|
if files:
|
||||||
|
if isinstance(files, list):
|
||||||
|
truncate_files.extend(files)
|
||||||
|
elif isinstance(files, str):
|
||||||
|
truncate_files.append(files)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} generated an exception: {e}")
|
||||||
|
|
||||||
if truncate_files:
|
if truncate_files:
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
|
||||||
|
@ -23,6 +23,17 @@ user_query1 = "这是一份货物标中采购要求部分的内容,请告诉
|
|||||||
|
|
||||||
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,每条信息规定了各方不得存在的情形,请回答:在这些信息中,主语是投标人或供应商或联合体投标各方或中标人的信息有哪些?不要返回主语是采购人或评标委员会的信息,请你准确筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,示例返回为[1,4,6],若情况不存在,返回[]。",
|
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,每条信息规定了各方不得存在的情形,请回答:在这些信息中,主语是投标人或供应商或联合体投标各方或中标人的信息有哪些?不要返回主语是采购人或评标委员会的信息,请你准确筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,示例返回为[1,4,6],若情况不存在,返回[]。",
|
||||||
|
|
||||||
|
|
||||||
|
--- 营业执照 ---
|
||||||
|
该文件为投标文件格式要求,请你根据该招标文件回答:营业执照应该附在哪个地方?你可能需要查找以下章节出现的地方:'具有独立承担民事责任能力的法人', '投标人基本信息表', '法人或者其他组织的营业执照等证明文件,自然人的身份证明', '投标人情况介绍', '投标人简介', '企业相关证件',并确定所在章节。
|
||||||
|
我需要将营业执照贴在该章节的最后面,目前我需要定位到插入的位置,请你返回给我插入位置的上下文,字数限制在30字以内,以json格式返回,键名分别是'上文','下文',上下文格式内容应完全与原文保持一致,不得擅自删减总结,示例输出如下:
|
||||||
|
{
|
||||||
|
"上文":"投标人: (盖单位章)
|
||||||
|
年 月 日",
|
||||||
|
"下文":"四、投标保证金
|
||||||
|
(招标人名称):"
|
||||||
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
"采购需求": {
|
"采购需求": {
|
||||||
"硬盘录像机 A": {},
|
"硬盘录像机 A": {},
|
||||||
|
@ -180,7 +180,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
|||||||
|
|
||||||
#TODO:目前的无效标这块的键值都删去空格了,所有的键名都删去空格
|
#TODO:目前的无效标这块的键值都删去空格了,所有的键名都删去空格
|
||||||
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
|
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
|
||||||
#TODO:区分output目录 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf提取有问题 #一包二包问题 107国道
|
#TODO:区分output目录 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf提取有问题 #一包二包问题 107国道 #日期格式统一
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user