11.5货物标截取优化
This commit is contained in:
parent
d0e7f060c8
commit
ccab078ac3
@ -184,9 +184,9 @@ if __name__ == '__main__':
|
||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
||||
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
||||
local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\6.2定版视频会议磋商文件.doc"
|
||||
# downloaded_file=doc2docx(local_path_in)
|
||||
downloaded_file=doc2docx(local_path_in)
|
||||
# downloaded_file=pdf2docx(local_path_in)
|
||||
downloaded_file=docx2pdf(local_path_in)
|
||||
# downloaded_file=docx2pdf(local_path_in)
|
||||
print(downloaded_file)
|
||||
|
||||
|
||||
|
@ -95,7 +95,7 @@ def extract_text_by_page(file_path):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||
file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||
|
@ -16,8 +16,6 @@ def get_global_logger(unique_id):
|
||||
|
||||
|
||||
logger = None
|
||||
|
||||
|
||||
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header):
|
||||
try:
|
||||
# 获取文件基本名称
|
||||
@ -238,7 +236,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
||||
begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])'
|
||||
end_pattern = re.compile(
|
||||
r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*$|'
|
||||
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知',
|
||||
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知',
|
||||
re.MULTILINE
|
||||
)
|
||||
start_page = None
|
||||
|
@ -110,12 +110,14 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
if output_suffix == "tobidders_notice":
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
start_page, mid_page, end_page = extract_pages_tobidders_notice(
|
||||
pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern
|
||||
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
|
||||
)
|
||||
|
||||
if start_page is None or end_page is None or mid_page is None:
|
||||
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||
return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header)
|
||||
if not start_page or not mid_page or not end_page:
|
||||
print(f"三次提取tobidders_notice均失败!!{pdf_path}")
|
||||
return "",""
|
||||
# if start_page is None or end_page is None or mid_page is None:
|
||||
# print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||
# return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
|
||||
|
||||
path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder, "tobidders_notice_part1")
|
||||
path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder, "tobidders_notice_part2")
|
||||
@ -138,7 +140,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||
if start_page is None or end_page is None:
|
||||
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
|
||||
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page)
|
||||
elif output_suffix == "qualification1":
|
||||
truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节
|
||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||
@ -175,28 +177,33 @@ def get_patterns_for_evaluation_method():
|
||||
return begin_pattern, end_pattern
|
||||
|
||||
|
||||
# def get_patterns_for_qualification():
|
||||
# # # 原始匹配逻辑
|
||||
# # begin_pattern_original = re.compile(
|
||||
# # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
|
||||
# # end_pattern_original = re.compile(
|
||||
# # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
#
|
||||
# # 新匹配逻辑
|
||||
# begin_pattern_new = re.compile(
|
||||
# r'^资格性检查', re.MULTILINE)
|
||||
# end_pattern_new = re.compile(
|
||||
# r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
#
|
||||
# return begin_pattern_new, end_pattern_new
|
||||
def get_patterns_for_qualification():
    """Return the (begin, end) regex pair used to locate the qualification-check section.

    Returns:
        tuple: ``(begin_pattern, end_pattern)``, both compiled with
        ``re.MULTILINE`` so that ``^`` anchors at the start of every line.

        * ``begin_pattern`` matches a line that starts with "资格性检查".
        * ``end_pattern`` matches a line that starts with "附件" followed by
          optional whitespace and at least one digit, or a line that starts
          with a chapter/part heading ("第…章" / "第…部分") followed by
          Chinese characters.
    """
    # NOTE: an earlier variant anchored the start on a chapter/part heading
    # containing "资格审查" and ended only on the next chapter/part heading.
    # It was superseded by the line-start "资格性检查" match below.
    begin_pattern_new = re.compile(r'^资格性检查', re.MULTILINE)
    end_pattern_new = re.compile(
        r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        re.MULTILINE,
    )
    return begin_pattern_new, end_pattern_new
|
||||
def get_patterns_for_qualification2():
|
||||
# begin_pattern = re.compile(
|
||||
# r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]|资格性检查|资格审查表).*(?:资格|符合性)?.*$',
|
||||
# re.MULTILINE
|
||||
# )
|
||||
begin_pattern = re.compile(
|
||||
r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]).*(?:资格|符合性).*$',
|
||||
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?|资格性检查|资格审查表|符合性审查表)', #目前可以匹配 附录3,因为允许了'附录'匹配
|
||||
re.MULTILINE
|
||||
)
|
||||
end_pattern = re.compile(
|
||||
r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]).*$|'
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
|
||||
r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|'
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
|
||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', #更宽松的匹配
|
||||
re.MULTILINE
|
||||
)
|
||||
return begin_pattern, end_pattern
|
||||
@ -239,7 +246,7 @@ def get_patterns_for_notice_twice():
|
||||
# break
|
||||
# return start_page, mid_page, end_page
|
||||
|
||||
def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern):
|
||||
def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
|
||||
"""
|
||||
从PDF文档中提取起始页、中间页和结束页。
|
||||
|
||||
@ -274,7 +281,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
|
||||
end_page = None
|
||||
chapter_type = None # 用于存储“章”或“部分”
|
||||
combined_mid_pattern = None # 中间页的组合模式
|
||||
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
for i, page in enumerate(pdf_document.pages):
|
||||
text = page.extract_text() or ""
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
@ -345,8 +352,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
|
||||
)
|
||||
else:
|
||||
# 如果提供了固定的 end_pattern,则使用默认的 mid_pattern
|
||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
|
||||
base_mid_pattern = r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则)\s*$' #可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
|
||||
combined_mid_pattern = re.compile(
|
||||
rf'{base_mid_pattern}',
|
||||
re.MULTILINE
|
||||
@ -355,6 +361,8 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
|
||||
|
||||
# 识别中间页
|
||||
if start_page is not None and mid_page is None and combined_mid_pattern:
|
||||
if (start_page+1==i) and re.search(local_begin_pattern,cleaned_text):
|
||||
continue
|
||||
if re.search(combined_mid_pattern, cleaned_text):
|
||||
mid_page = i
|
||||
|
||||
@ -374,6 +382,12 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
|
||||
|
||||
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
|
||||
if not (start_page and mid_page and end_page):
|
||||
print(f"第二次尝试 tobidders_notice!{pdf_path}")
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
start_page,mid_page,end_page=extract_pages_twice_tobidders_notice(pdf_document,common_header,begin_page)
|
||||
if start_page and end_page and mid_page:
|
||||
return start_page, mid_page, end_page
|
||||
else:
|
||||
# 定义新的 begin_pattern 和 end_pattern
|
||||
new_begin_pattern = re.compile(
|
||||
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
|
||||
@ -384,13 +398,67 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
|
||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$',
|
||||
re.MULTILINE
|
||||
)
|
||||
print("第三次尝试 tobidders_notice! ")
|
||||
# 第二次提取尝试,使用新的模式
|
||||
start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
|
||||
|
||||
return start_page, mid_page, end_page
|
||||
|
||||
|
||||
def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header):
|
||||
#投标人须知分为两个章节
|
||||
# def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page):
|
||||
# begin_pattern = re.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
|
||||
# )
|
||||
# end_pattern = re.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分
|
||||
# )
|
||||
# exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
|
||||
#
|
||||
# pdf_document = PdfReader(pdf_path)
|
||||
# exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
#
|
||||
# # 提取第一部分
|
||||
# start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
||||
# if start_page1 is None or end_page1 is None:
|
||||
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||
# return "", ""
|
||||
#
|
||||
# # 保存第一部分的路径
|
||||
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
|
||||
# "tobidders_notice_part1")
|
||||
#
|
||||
# # 提取第二部分
|
||||
# start_page2 = end_page1
|
||||
#
|
||||
# # 检查end_page1页面的内容
|
||||
# text = pdf_document.pages[end_page1].extract_text() or ""
|
||||
# cleaned_text = clean_page_content(text, common_header)
|
||||
# match = end_pattern.search(cleaned_text)
|
||||
#
|
||||
# if match:
|
||||
# # 获取匹配到的中文部分
|
||||
# chapter_title = match.group(1)
|
||||
# # 检查是否包含排除关键词
|
||||
# if any(word in chapter_title for word in exclusion_words):
|
||||
# # 如果包含排除关键词,直接返回相同的路径
|
||||
# return path1, path1
|
||||
#
|
||||
# # 如果不包含排除关键词,继续提取第二部分
|
||||
# _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
||||
# exclusion_pattern)
|
||||
#
|
||||
# if end_page2 is None:
|
||||
# print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
|
||||
# return path1, path1
|
||||
#
|
||||
# # 保存第二部分的路径
|
||||
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
|
||||
# "tobidders_notice_part2")
|
||||
#
|
||||
# return path1, path2
|
||||
|
||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
|
||||
begin_pattern = re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
|
||||
)
|
||||
@ -399,18 +467,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
|
||||
)
|
||||
exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
|
||||
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
|
||||
# 提取第一部分
|
||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header)
|
||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
||||
if start_page1 is None or end_page1 is None:
|
||||
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||
return "", ""
|
||||
return "", "",""
|
||||
|
||||
# 保存第一部分的路径
|
||||
path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
|
||||
"tobidders_notice_part1")
|
||||
# # 保存第一部分的路径
|
||||
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
|
||||
# "tobidders_notice_part1")
|
||||
|
||||
# 提取第二部分
|
||||
start_page2 = end_page1
|
||||
@ -426,52 +492,71 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix,
|
||||
# 检查是否包含排除关键词
|
||||
if any(word in chapter_title for word in exclusion_words):
|
||||
# 如果包含排除关键词,直接返回相同的路径
|
||||
return path1, path1
|
||||
return start_page1, end_page1,end_page1
|
||||
|
||||
# 如果不包含排除关键词,继续提取第二部分
|
||||
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
||||
exclusion_pattern)
|
||||
|
||||
if end_page2 is None:
|
||||
print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
|
||||
return path1, path1
|
||||
return start_page1, end_page1,end_page1
|
||||
|
||||
# 保存第二部分的路径
|
||||
path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
|
||||
"tobidders_notice_part2")
|
||||
# # 保存第二部分的路径
|
||||
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
|
||||
# "tobidders_notice_part2")
|
||||
|
||||
return path1, path2
|
||||
return start_page1, end_page1,end_page2
|
||||
|
||||
def extract_pages_qualification(pdf_document, begin_page, begin_pattern, end_pattern, common_header):
    """Find the first and last page of the qualification-review section.

    Pages are scanned in order starting at index ``begin_page``; pages whose
    extracted text is empty are skipped.

    Args:
        pdf_document: object exposing a sliceable ``pages`` sequence whose
            items provide ``extract_text()``.
        begin_page: index of the first page to inspect.
        begin_pattern: compiled regex that must match the cleaned page text
            for the page to become the start page.
        end_pattern: compiled regex that marks the end page.
        common_header: passed through to ``clean_page_content``.

    Returns:
        tuple: ``(start_page, end_page)`` as indices into
        ``pdf_document.pages``; either element is ``None`` when not found.
    """
    required_terms = ("资格审查", "资质审查", "符合性审查")
    banned_terms = ("声明函", "承诺函")
    first_hit = None
    last_hit = None

    for page_no, pdf_page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        raw_text = pdf_page.extract_text()
        if not raw_text:
            continue
        body = clean_page_content(raw_text, common_header)

        # Start page: must mention a review keyword, must not be a
        # declaration/commitment letter, and must match the begin pattern.
        if first_hit is None:
            mentions_review = any(term in body for term in required_terms)
            is_letter = any(term in body for term in banned_terms)
            if mentions_review and not is_letter and begin_pattern.search(body):
                print("附件")
                first_hit = page_no

        # End page: only a page strictly after the start page qualifies.
        if first_hit is not None and page_no > first_hit and end_pattern.search(body):
            last_hit = page_no
            break  # stop scanning once the end page is found

    return first_hit, last_hit
|
||||
|
||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
|
||||
try:
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
patterns = None
|
||||
begin_page = 0
|
||||
start_page = None
|
||||
end_page = None
|
||||
|
||||
if output_suffix == "procurement":
|
||||
patterns = [get_patterns_for_procurement()]
|
||||
begin_page = 5
|
||||
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
|
||||
patterns = [get_patterns_for_evaluation_method()]
|
||||
begin_page = 5
|
||||
elif output_suffix == "qualification1":
|
||||
patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()]
|
||||
begin_page = 5
|
||||
patterns = [get_patterns_for_qualification()]
|
||||
elif output_suffix == "notice":
|
||||
patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()]
|
||||
begin_page = 0
|
||||
|
||||
if patterns:
|
||||
for pattern_pair in patterns:
|
||||
# print(pattern_pair[0])
|
||||
# print(pattern_pair[1])
|
||||
if output_suffix=="qualification1":
|
||||
start_page,end_page=extract_pages_qualification(pdf_document,begin_page,pattern_pair[0],pattern_pair[1],common_header)
|
||||
else:
|
||||
start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page,
|
||||
common_header,
|
||||
exclusion_pattern, output_suffix)
|
||||
common_header,exclusion_pattern, output_suffix)
|
||||
if start_page is not None and end_page is not None:
|
||||
break
|
||||
|
||||
@ -544,7 +629,7 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
|
||||
def get_start_and_common_header(input_path):
|
||||
common_header = extract_common_header(input_path)
|
||||
last_begin_index = 0
|
||||
begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请)[\))]?\s*$',re.MULTILINE)
|
||||
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',re.MULTILINE)
|
||||
pdf_document = PdfReader(input_path)
|
||||
for i, page in enumerate(pdf_document.pages):
|
||||
if i > 10:
|
||||
@ -604,8 +689,8 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
||||
|
||||
# 根据选择设置对应的模式和结束模式
|
||||
if selection == 1:
|
||||
begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
|
||||
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
|
||||
local_output_suffix = "notice"
|
||||
elif selection == 2:
|
||||
begin_pattern = re.compile(
|
||||
@ -737,9 +822,10 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
|
||||
logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
|
||||
|
||||
return truncate_files
|
||||
#TODO:截取还有问题
|
||||
|
||||
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 唐山投标只有正文,没有附表
|
||||
|
||||
#ztbfile.pdf jiao通 广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_qualification2.pdf 唐山 包头
|
||||
if __name__ == "__main__":
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
|
||||
@ -752,6 +838,6 @@ if __name__ == "__main__":
|
||||
# selections = [1,4]
|
||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||
# print(files)
|
||||
selection = 1# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
selection = 4# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
# print(generated_files)
|
@ -2,12 +2,12 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
|
||||
import copy
|
||||
|
||||
import concurrent.futures
|
||||
# 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据
|
||||
# 对于同级的键,如果数量>1且键名都统一,那么将键名去掉,用列表保持它们的键值
|
||||
|
||||
@ -379,23 +379,53 @@ def combine_qualification_review(invalid_path, output_folder, qualification_path
|
||||
}
|
||||
}
|
||||
|
||||
def process_file(file_path):
|
||||
def process_file(file_path, invalid_path):
|
||||
file_id = upload_file(file_path)
|
||||
user_queries = [
|
||||
"该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。",
|
||||
"该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
|
||||
]
|
||||
results = multi_threading(user_queries, "", file_id, 2)
|
||||
combined_res = {}
|
||||
for _, response in results:
|
||||
if response:
|
||||
# print(response)
|
||||
cleaned_data = clean_json_string(response)
|
||||
processed = process_dict(preprocess_dict(cleaned_data))
|
||||
combined_res.update(processed)
|
||||
else:
|
||||
print(f"Warning: No response for a query.")
|
||||
|
||||
first_query = """
|
||||
该文档中是否有关于资格性审查标准的具体内容,是否有关于符合性审查标准的具体内容?请以json格式给出回答,外键分别为'资格性审查'和'符合性审查',键值仅限于'是','否',输出格式示例如下:
|
||||
{
|
||||
"资格性审查":"是",
|
||||
"符合性审查":"是"
|
||||
}
|
||||
"""
|
||||
qianwen_ans = clean_json_string(qianwen_long(file_id, first_query))
|
||||
user_queries = [
|
||||
{
|
||||
"key": "资格性审查",
|
||||
"query": "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。"
|
||||
},
|
||||
{
|
||||
"key": "符合性审查",
|
||||
"query": "该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
|
||||
}
|
||||
]
|
||||
combined_res = {}
|
||||
file_id2 = None # 延迟上传 invalid_path
|
||||
def process_single_query(query_info):
|
||||
nonlocal file_id2
|
||||
key = query_info["key"]
|
||||
query = query_info["query"]
|
||||
# 根据键值决定使用哪个 file_id
|
||||
if qianwen_ans.get(key) == "否":
|
||||
if not file_id2:
|
||||
file_id2 = upload_file(invalid_path)
|
||||
current_file_id = file_id2
|
||||
else:
|
||||
current_file_id = file_id
|
||||
|
||||
# 调用大模型获取回答
|
||||
ans = qianwen_long(current_file_id, query)
|
||||
cleaned_data = clean_json_string(ans)
|
||||
processed = process_dict(preprocess_dict(cleaned_data))
|
||||
return processed
|
||||
|
||||
# 使用线程池并行处理查询
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
futures = [executor.submit(process_single_query, q) for q in user_queries]
|
||||
for future in concurrent.futures.as_completed(futures):
|
||||
result = future.result()
|
||||
combined_res.update(result)
|
||||
return combined_res
|
||||
|
||||
try:
|
||||
|
Loading…
x
Reference in New Issue
Block a user