12.9 截取pdf逻辑优化
This commit is contained in:
parent
329f5680ec
commit
d83583dc41
@ -12,10 +12,9 @@ def extract_text_by_page(file_path):
|
||||
page = reader.pages[page_num]
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
print(text)
|
||||
# print(text)
|
||||
cleaned_text = clean_page_content(text,common_header)
|
||||
# cleaned_text=text
|
||||
# print(cleaned_text)
|
||||
print(cleaned_text)
|
||||
print("-----------------"+str(page_num))
|
||||
result += cleaned_text
|
||||
# print(f"Page {page_num + 1} Content:\n{cleaned_text}")
|
||||
@ -119,8 +118,8 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
||||
|
||||
if __name__ == '__main__':
|
||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||
# file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
|
||||
file_path = r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
|
||||
file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件.pdf"
|
||||
# file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
|
||||
|
@ -1,55 +1,26 @@
|
||||
import re
|
||||
line_stripped="""1.采购人:陕西省某单位
|
||||
2、采购代理机构:陕西坤硕项目管理有限公司
|
||||
3、供应商:响应招标并且符合招标文件规定资格条件和参加投标竞
|
||||
争的法人、其他组织或者自然人
|
||||
"""
|
||||
pure_number_match = re.match(r'^(\d+)([^.\d)()、].*)', line_stripped) # 不允许出现右括号
|
||||
if pure_number_match:
|
||||
print("yes")
|
||||
import regex
|
||||
|
||||
# 测试字符串
|
||||
begin_pattern = regex.compile(
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
|
||||
r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
|
||||
# 测试示例
|
||||
test_strings = [
|
||||
"""
|
||||
.4评标委员会成员因缺席、回避、擅评标办法前附表康等原因不能继续履评标办法前附表
|
||||
责的,采购人或者采购代理机构有权向相关监督管理部门通报。
|
||||
17. 投标人资格审查和投标文件符合性审查
|
||||
17.1投标人资格审查指依据法律、法规和招标文件的规定,对投标文件中的资格、资信证
|
||||
明等进行审查,以确定投标人是否具备投标资格;投标文件符合性审查指依据招标文件的
|
||||
规定,从投标文件的有效性、完整性和对招标文件的响应程度进行审查,以确定是否对招
|
||||
标文件的实质性要求作出响应。
|
||||
17.2投标人未通过资格审查的不得进入投标文件符合性审查 ; 投标人未通过符合性审查的,
|
||||
不得进入投标文件的综合比较与评价。
|
||||
17.3品牌及型号必须为清单中有效期内产品并提供证明文件, 否则其投标将作为无效投标
|
||||
被拒绝。
|
||||
17.3.1如本项目使用最低评标价法, 提供相同品牌产品的不同投标人以其中通过资格审查、
|
||||
符合性审查且报价最低的参加评标;报价相同的,由采购人或者采购人委托评标委员会按
|
||||
照招标文件中评标办法规定的方式确定 一个参加评标的投标人;其他投标无效。
|
||||
17.3.2如本项目使用综合评分法,提供相同品牌产品且通过资格审查、符合性审查的不同
|
||||
投标人,按一家投标人计算,评审后得分最高的同品牌投标人获得中标人推荐资格;评审
|
||||
得分相同的,由采购人或者采购人委托评标 委员会按照招标文件中评标办法规定的方式确
|
||||
定一个投标人获得中标人推荐资格;
|
||||
17.4如一个分包内包含多种产品的, 采购人或采购代理机构将在投标人须知前附表中载明
|
||||
核心产品,多家投标人提供的所有核心产品品牌均相同的, 按第 18.3.2 条及相关法律法
|
||||
规处理。
|
||||
17.5投标人所投产品如被列入财政部与国家主管部门颁发的节能产品目录或环境标志产
|
||||
品目录,应提供相关证明,在评标时予以优先采购,具体优先采购办见第五章评标方法
|
||||
和标准。如采购人所采购产品为政府强制采购的节能产品,投标人所投产品的品牌及型号
|
||||
必须为清单中有效期内产品并提供证明文件,否则其投标将作为无效投标被拒绝。
|
||||
17.6投标人不良信用记录以采购人或采购代理机构查询结果为准。
|
||||
17.7资格审查和符合性审查标准详见第五章评标方法和标准。
|
||||
18. 投标文件的澄清和修正
|
||||
18.1对于投标文件中含义不明确、 同类问题表述不一致或者有明显文字和计算错误的内容,
|
||||
评标委员会应当以书面形式要求投标人作出必要的澄 清、说明或者补正。
|
||||
18.2投标人的澄清、说明或者补正应当采用书面形式,并加盖公章,或者由法定代表人或
|
||||
其授权的代表签字。投标人的澄清、说明或者补正不得超出投标文件的范围或者改变投标
|
||||
文件的实质性内容。澄清文件将作为投标文件内容的一部分。
|
||||
"""
|
||||
'投标人须知正文', # 匹配
|
||||
'”投标人须知正文', # 不匹配
|
||||
'” 投标人须知正文', # 不匹配
|
||||
'与 投标人须知正文', # 不匹配
|
||||
'见 投标人须知正文', # 不匹配
|
||||
'“ 投标人须知正文', # 不匹配
|
||||
'供应商须知正文', # 匹配
|
||||
'谈判供应商须知正文' # 匹配
|
||||
]
|
||||
|
||||
# for test_string in test_strings:
|
||||
# match = re.search(begin_pattern, test_string)
|
||||
# if match:
|
||||
# print("Matched Content:", match.group()) # 输出匹配的内容
|
||||
# else:
|
||||
# print("No match found.")
|
||||
for s in test_strings:
|
||||
if begin_pattern.search(s):
|
||||
print(f"匹配: {s}")
|
||||
else:
|
||||
print(f"不匹配: {s}")
|
||||
|
@ -1,4 +1,4 @@
|
||||
import re
|
||||
import regex
|
||||
import os
|
||||
import time
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
@ -41,7 +41,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
|
||||
for page_num in range(min(start_page, total_pages)):
|
||||
page_text = pdf_document.pages[page_num].extract_text()
|
||||
cleaned_text = clean_page_content(page_text, common_header)
|
||||
if re.search(r'目\s*录', cleaned_text, re.MULTILINE):
|
||||
if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
|
||||
toc_page = page_num
|
||||
break
|
||||
|
||||
@ -75,7 +75,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
|
||||
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
|
||||
is_secondary_match):
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
|
||||
def run_extraction():
|
||||
start_page = None
|
||||
@ -87,11 +87,11 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
||||
text = page.extract_text() or ""
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
|
||||
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||
continue
|
||||
|
||||
if start_page is None:
|
||||
match = re.search(begin_pattern, cleaned_text)
|
||||
match = regex.search(begin_pattern, cleaned_text)
|
||||
if match and i > begin_page:
|
||||
start_page = i
|
||||
matched_text = match.group(0) # 获取整个匹配的文本
|
||||
@ -105,16 +105,16 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
||||
if chapter_type:
|
||||
# 根据 chapter_type 动态生成 end_pattern
|
||||
if not is_secondary_match:
|
||||
end_pattern = re.compile(
|
||||
end_pattern = regex.compile(
|
||||
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|'
|
||||
r'^评标办法前附表|'
|
||||
r'^评标(方法|办法)前附表|'
|
||||
r'^附录(?:一)?[::]|'
|
||||
r'^附件(?:一)?[::]|'
|
||||
r'^附表(?:一)?[::]',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
else:
|
||||
end_pattern = re.compile(
|
||||
end_pattern = regex.compile(
|
||||
rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$'
|
||||
)
|
||||
# 根据 chapter_type 动态生成 additional_mid_pattern
|
||||
@ -126,51 +126,50 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin
|
||||
additional_mid_pattern = ''
|
||||
|
||||
# 定义基础的 mid_pattern
|
||||
base_mid_pattern = (
|
||||
r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
|
||||
r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文部分)'
|
||||
)
|
||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
|
||||
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
|
||||
# 合并基础模式和额外模式
|
||||
if additional_mid_pattern:
|
||||
combined_mid_pattern = re.compile(
|
||||
combined_mid_pattern = regex.compile(
|
||||
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
else:
|
||||
combined_mid_pattern = re.compile(
|
||||
combined_mid_pattern = regex.compile(
|
||||
rf'{base_mid_pattern}',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
# print(f"生成的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印 combined_mid_pattern
|
||||
else:
|
||||
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
|
||||
if not is_secondary_match:
|
||||
end_pattern = re.compile(
|
||||
end_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
||||
r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line
|
||||
r'^评标(方法|办法)前附表|' # Match "评标办法前附表" at the beginning of a line
|
||||
r'^附录(?:一)?[::]|' # Match "附录一" with optional "一" and colon
|
||||
r'^附件(?:一)?[::]|' # Match "附件一" with optional "一" and colon
|
||||
r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
else:
|
||||
end_pattern = re.compile(
|
||||
end_pattern = regex.compile(
|
||||
rf'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$'
|
||||
)
|
||||
# print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern
|
||||
# 定义基础的 mid_pattern
|
||||
base_mid_pattern = r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
|
||||
combined_mid_pattern = re.compile(
|
||||
combined_mid_pattern = regex.compile(
|
||||
rf'{base_mid_pattern}',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
continue
|
||||
|
||||
if start_page is not None and mid_page is None and combined_mid_pattern:
|
||||
if re.search(combined_mid_pattern, cleaned_text):
|
||||
if regex.search(combined_mid_pattern, cleaned_text):
|
||||
mid_page = i
|
||||
if start_page is not None and mid_page is not None and chapter_type:
|
||||
if re.search(end_pattern, cleaned_text):
|
||||
if regex.search(end_pattern, cleaned_text):
|
||||
if i > mid_page:
|
||||
end_page = i
|
||||
break
|
||||
@ -195,22 +194,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
start_page = None
|
||||
end_page = None
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
# 遍历文档的每一页,查找开始和结束短语的位置
|
||||
for i in range(len(pdf_document.pages)):
|
||||
page = pdf_document.pages[i]
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
# if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成"
|
||||
if re.search(exclusion_pattern, cleaned_text):
|
||||
# if is_secondary_match and regex.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成"
|
||||
if regex.search(exclusion_pattern, cleaned_text):
|
||||
continue
|
||||
if re.search(begin_pattern, cleaned_text) and i >= begin_page:
|
||||
if regex.search(begin_pattern, cleaned_text) and i >= begin_page:
|
||||
if start_page and (output_suffix == "notice" or output_suffix == "invalid"):
|
||||
pass
|
||||
else:
|
||||
start_page = i
|
||||
if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern,cleaned_text):
|
||||
if start_page is not None and regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern,cleaned_text):
|
||||
condition = i > start_page
|
||||
if condition:
|
||||
is_invalid_condition = output_suffix == "invalid" and i > 30 # 这边默认无效投标至少有30页
|
||||
@ -240,16 +239,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
||||
# 定义起始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||
begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])'
|
||||
# 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻)
|
||||
end_pattern_attachment = re.compile(
|
||||
end_pattern_attachment = regex.compile(
|
||||
r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::]).*$',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
# 定义结束匹配模式 - 章节标题、评标附表、投标人须知等
|
||||
end_pattern_chapter = re.compile(
|
||||
end_pattern_chapter = regex.compile(
|
||||
r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
|
||||
r'\s*评标(?:办法|方法)前附表\s*$|'
|
||||
r'投标人须知',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
start_page = None
|
||||
end_page = None
|
||||
@ -260,7 +259,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
# 确定起始页,需在last_begin_index之后
|
||||
if any(key in cleaned_text for key in include_keys):
|
||||
if re.search(begin_pattern, cleaned_text, re.MULTILINE):
|
||||
if regex.search(begin_pattern, cleaned_text, regex.MULTILINE):
|
||||
if start_page is None:
|
||||
start_page = i # 确保起始页不小于章节的开始页码
|
||||
continue
|
||||
@ -305,23 +304,28 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
||||
def get_start_and_common_header(input_path):
    """Return the shared page header and the index of the first notice page.

    Reconstructed post-commit form of this function (the diff view interleaved
    the removed ``re``/``i > 25`` lines with the added ``regex``/``i > 10``
    lines).

    Only pages with index 0..10 are inspected. A page whose cleaned text has a
    line ending in "目录" (table of contents) is skipped; the first remaining
    page with a line ending in one of the notice / invitation headings is
    reported.

    Args:
        input_path: Path of the PDF; passed to ``extract_common_header`` and
            ``PdfReader``.

    Returns:
        tuple: ``(common_header, last_begin_index)``. The index is 0-based and
        is 0 when no page among the inspected ones matches.
    """
    common_header = extract_common_header(input_path)
    last_begin_index = 0
    # The variable-width negative look-behinds reject headings that are merely
    # referenced in running text (preceded by 见 / 与 or an opening/closing quote).
    begin_pattern = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
        regex.MULTILINE
    )
    # Table-of-contents detector: any line that ends with "目录".
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    pdf_document = PdfReader(input_path)
    for i, page in enumerate(pdf_document.pages):
        if i > 10:
            # Past index 10 without a hit: give up and report index 0.
            return common_header, 0
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_content(text, common_header)
            # A contents page lists the heading too; skip it so it is not
            # taken for the real notice page.
            if catalog_pattern.search(cleaned_text):
                continue
            if begin_pattern.search(cleaned_text):
                last_begin_index = i  # first match wins; page index is 0-based
                return common_header, last_begin_index
    return common_header, last_begin_index
|
||||
|
||||
|
||||
def truncate_pdf_main(input_path, output_folder, selection):
|
||||
if os.path.isdir(input_path):
|
||||
generated_files = []
|
||||
@ -340,19 +344,21 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Selection 1: 投标人须知前附表
|
||||
pattern_pairs = [
|
||||
(
|
||||
re.compile(
|
||||
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'),
|
||||
re.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
re.MULTILINE)
|
||||
regex.compile(
|
||||
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE),
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
regex.MULTILINE)
|
||||
),
|
||||
(
|
||||
re.compile(
|
||||
r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
|
||||
re.MULTILINE),
|
||||
re.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
re.MULTILINE)
|
||||
regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
|
||||
regex.MULTILINE
|
||||
),
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]',
|
||||
regex.MULTILINE)
|
||||
)
|
||||
]
|
||||
output_suffix = "tobidders_notice"
|
||||
@ -360,23 +366,23 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Selection 2: 评标办法
|
||||
pattern_pairs = [
|
||||
(
|
||||
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
|
||||
regex.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))',regex.MULTILINE),
|
||||
# Alternative begin pattern
|
||||
re.compile(r'评标办法正文|评标方法正文|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
regex.compile(r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|[::]清标报告\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',regex.MULTILINE)
|
||||
# Alternative end pattern
|
||||
),
|
||||
(
|
||||
re.compile(
|
||||
r'(?<!见)' # 确保前面不是“见”
|
||||
regex.compile(
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 确保前面不是“见”
|
||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
||||
r'[\u4e00-\u9fff、()()]*?' # 匹配允许的字符(中文、顿号、括号)
|
||||
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
|
||||
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
|
||||
r'[\u4e00-\u9fff、()()]*\s*$' # 继续匹配允许的字符直到行尾
|
||||
r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
),
|
||||
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||
regex.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', regex.MULTILINE)
|
||||
)
|
||||
]
|
||||
output_suffix = "evaluation_method"
|
||||
@ -385,19 +391,19 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Selection 3: 资格审查条件
|
||||
pattern_pairs = [
|
||||
# (
|
||||
# re.compile(r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格).*',
|
||||
# re.MULTILINE),
|
||||
# re.compile(
|
||||
# regex.compile(r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格).*',
|
||||
# regex.MULTILINE),
|
||||
# regex.compile(
|
||||
# r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*$'
|
||||
# r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
# r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
|
||||
# ),
|
||||
(
|
||||
re.compile(
|
||||
r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*资格[\u4e00-\u9fff、()()]*\s*$',
|
||||
re.MULTILINE),
|
||||
re.compile(
|
||||
regex.compile(
|
||||
r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*资格[\u4e00-\u9fff、()()]*\s*$',
|
||||
regex.MULTILINE),
|
||||
regex.compile(
|
||||
r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*|'
|
||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', regex.MULTILINE)
|
||||
)
|
||||
]
|
||||
output_suffix = "qualification"
|
||||
@ -405,13 +411,16 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Selection 4: 招标公告
|
||||
pattern_pairs = [
|
||||
(
|
||||
re.compile(
|
||||
regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
|
||||
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
|
||||
),
|
||||
(
|
||||
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$', re.MULTILINE),
|
||||
re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE)
|
||||
regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', regex.MULTILINE),
|
||||
regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
)
|
||||
]
|
||||
output_suffix = "notice"
|
||||
@ -420,13 +429,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
||||
# Selection 5: 无效标
|
||||
pattern_pairs = [
|
||||
(
|
||||
re.compile(
|
||||
regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
|
||||
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE)
|
||||
regex.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', regex.MULTILINE)
|
||||
),
|
||||
(
|
||||
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$', re.MULTILINE),
|
||||
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE)
|
||||
regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', regex.MULTILINE),
|
||||
regex.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', regex.MULTILINE)
|
||||
)
|
||||
]
|
||||
output_suffix = "invalid"
|
||||
@ -591,13 +600,13 @@ if __name__ == "__main__":
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||
input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
|
||||
input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all"
|
||||
# files=truncate_pdf_multiple(input_path,output_folder)
|
||||
# print(files)
|
||||
# selections = [4, 1] # 仅处理 selection 4、1
|
||||
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||
selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||
selection = 4 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
print(generated_files)
|
||||
# print("生成的文件:", generated_files)
|
||||
|
@ -1,7 +1,8 @@
|
||||
import glob
|
||||
import logging
|
||||
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
import re # 导入正则表达式库
|
||||
import regex # 导入正则表达式库
|
||||
import os # 用于文件和文件夹操作
|
||||
|
||||
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
|
||||
@ -16,41 +17,6 @@ def get_global_logger(unique_id):
|
||||
logger = logging.getLogger(unique_id)
|
||||
return logger
|
||||
|
||||
|
||||
# fitz库版本
|
||||
# def extract_common_header(pdf_path):
|
||||
# doc = fitz.open(pdf_path)
|
||||
# headers = []
|
||||
# total_pages = len(doc)
|
||||
#
|
||||
# if total_pages == 2:
|
||||
# pages_to_read = 2
|
||||
# start_page = 0
|
||||
# else:
|
||||
# pages_to_read = 3
|
||||
# middle_page = total_pages // 2
|
||||
# start_page = max(0, middle_page - 1)
|
||||
#
|
||||
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
|
||||
# page = doc[i]
|
||||
# text = page.get_text()
|
||||
# if text:
|
||||
# first_lines = text.strip().split('\n')[:3]
|
||||
# headers.append(first_lines)
|
||||
#
|
||||
# doc.close()
|
||||
#
|
||||
# if len(headers) < 2:
|
||||
# return ""
|
||||
#
|
||||
# common_headers = []
|
||||
# for lines in zip(*headers):
|
||||
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
|
||||
# if common_line:
|
||||
# common_headers.append(' '.join(common_line))
|
||||
#
|
||||
# return '\n'.join(common_headers)
|
||||
|
||||
def is_pdf_or_doc(filename: str) -> bool:
    """Return True when ``filename`` ends in .pdf, .doc or .docx.

    The comparison is case-insensitive because the name is lower-cased first.
    """
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))
|
||||
@ -77,31 +43,33 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
|
||||
return [result or ""]
|
||||
return [""] # 返回空字符串
|
||||
|
||||
|
||||
# 默认逻辑是start_page匹配上就不再设置了,一般不匹配上目录的原因是设置了begin_page=5,但是匹配'第一章 招标公告'的时候start_page可能会错误匹配到目录。
|
||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                          output_suffix="normal"):
    """Scan page text to find the first and last page of one section.

    Reconstructed post-commit form of this function (the diff view interleaved
    the removed ``re.search`` conditions with the added ``regex.search`` ones).

    Args:
        pdf_document: Object whose ``pages`` can be sliced and whose pages
            provide ``extract_text()``.
        begin_pattern: Pattern (string or compiled) marking the section start.
        end_pattern: Pattern (string or compiled) marking the section end.
        begin_page: Index of the first page to scan.
        common_header: Header text handed to ``clean_page_content``.
        exclusion_pattern: Optional pattern; the first page matching it is
            skipped. Only one page per call is ever skipped this way.
        output_suffix: Section tag. ``"tobidders_notice"`` delays the exclusion
            check until a start page exists and drops the "end page must not
            match begin_pattern" rule; ``"notice"`` lets the start page be
            ``begin_page`` itself, while every other value requires a page
            strictly after ``begin_page``.

    Returns:
        tuple: ``(start_page, end_page)``; either element is None when it was
        not found.
    """
    start_page = None
    end_page = None
    exclusion_pending = True  # cleared after the exclusion pattern has skipped one page
    is_tobidders = output_suffix == "tobidders_notice"

    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)

        # For tobidders_notice the exclusion only applies once a start page is
        # known; for every other suffix it applies from the first scanned page.
        exclusion_armed = (start_page is not None) if is_tobidders else True
        if exclusion_pattern and exclusion_pending and exclusion_armed \
                and regex.search(exclusion_pattern, cleaned_text):
            exclusion_pending = False
            continue

        if start_page is None and regex.search(begin_pattern, cleaned_text):
            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                start_page = i
                continue

        if start_page is not None and i > start_page and regex.search(end_pattern, cleaned_text):
            # Outside tobidders_notice, a page that also matches begin_pattern
            # is not accepted as the end page.
            if is_tobidders or not regex.search(begin_pattern, cleaned_text):
                end_page = i
                break
    return start_page, end_page
|
||||
@ -115,8 +83,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
total_pages = len(pdf_document.pages) - 1 # 获取总页数
|
||||
|
||||
if output_suffix == "tobidders_notice":
|
||||
exclusion_pattern = re.compile(
|
||||
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|文件构成|文件组成')
|
||||
start_page, mid_page, end_page = extract_pages_tobidders_notice(
|
||||
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
|
||||
)
|
||||
@ -138,8 +106,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
else:
|
||||
# 原有的处理逻辑保持不变
|
||||
if output_suffix == "qualification1" or output_suffix == "procurement" or output_suffix == "evaluation_method":
|
||||
exclusion_pattern = re.compile(
|
||||
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|文件构成|文件组成')
|
||||
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
||||
common_header, exclusion_pattern, output_suffix)
|
||||
# 针对 selection = 6 的特殊处理
|
||||
@ -164,62 +132,45 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
|
||||
|
||||
def get_patterns_for_procurement():
    """Build the begin/end heading patterns for the procurement-requirements chapter.

    Reconstructed post-commit form of this function (the diff view interleaved
    the removed ``re`` lines with the added ``regex`` lines).

    Returns:
        tuple: ``(begin_pattern, end_pattern)``, both compiled with the
        third-party ``regex`` module in MULTILINE mode.
    """
    # Earlier, looser variant kept for reference:
    # begin_pattern = regex.compile(
    #     r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十1-9]+(?:章|部分).*(?:采购|需求).*',
    #     regex.MULTILINE)
    begin_pattern = regex.compile(
        # Variable-width look-behinds (a `regex`-module feature): reject a
        # heading that directly follows 见 / 与 or a quote mark.
        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # "第X章" or "第X部分"
        r'[\u4e00-\u9fff、()()]*?'  # allowed heading characters
        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()()]*?要求[\u4e00-\u9fff、()()]*?\s*$|'  # 服务/项目/商务/技术 ... 要求
        r'(?:采购|需求)[\u4e00-\u9fff、()()]*?)\s*$',  # or a heading containing 采购 / 需求
        regex.MULTILINE
    )
    # NOTE(review): this numeral class has no 十 although begin_pattern accepts
    # it, so a heading such as "第十章 ..." would not end the section -- confirm
    # whether that is intended.
    end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', regex.MULTILINE)
    return begin_pattern, end_pattern
|
||||
|
||||
|
||||
def get_patterns_for_evaluation_method():
    """Build the begin/end heading patterns for the evaluation-method chapter.

    Reconstructed post-commit form of this function (the diff view interleaved
    the removed ``re`` lines with the added ``regex`` lines).

    The begin pattern has two alternatives: a "第X章 / 第X部分" heading whose
    line contains both an evaluation keyword and one of 办法 / 方法 / 内容, or
    a line consisting of "评标方法前附表" / "评标办法前附表".

    Returns:
        tuple: ``(begin_pattern, end_pattern)``, both compiled with the
        third-party ``regex`` module in MULTILINE mode.
    """
    begin_pattern = regex.compile(
        # Alternative 1: chapter heading. The look-behinds reject headings that
        # directly follow 见 / 与 or a quote mark.
        r'(?:(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'
        r'(?:[\u4e00-\u9fff、()()]*?)'
        # The two look-aheads let the keywords sit anywhere in the rest of the
        # line; the consumed text is still limited to the character class.
        r'(?=.*(?:磋商|谈判|评标|评定|评审))'
        r'(?=.*(?:办法|方法|内容))'
        r'[\u4e00-\u9fff、()()]*\s*$|'
        # Alternative 2: the evaluation front-table title on a line of its own.
        r'^\s*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$)',
        regex.MULTILINE
    )
    # NOTE(review): this numeral class has no 十 although begin_pattern accepts
    # it, so "第十章 ..." would not end the section -- confirm this is intended.
    end_pattern = regex.compile(
        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', regex.MULTILINE)
    return begin_pattern, end_pattern
|
||||
|
||||
|
||||
def get_patterns_for_notice():
    """Build the begin/end heading patterns for the tender-notice chapter.

    Reconstructed post-commit form of this function (the diff view interleaved
    the removed ``re`` lines with the added ``regex`` lines).

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` compiled with ``regex``.
    """
    # No MULTILINE flag here, so ``^`` anchors only at the very start of the
    # searched text: the chapter heading must open the text.
    begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*')
    end_pattern = regex.compile(
        # Narrower alternative, kept for reference:
        # r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)',
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',  # any chapter heading at the start of a line
        regex.MULTILINE
    )
    return begin_pattern, end_pattern
|
||||
|
||||
|
||||
def get_patterns_for_notice_twice():
|
||||
begin_pattern = re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*', re.MULTILINE
|
||||
)
|
||||
end_pattern = re.compile(
|
||||
# r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)',
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
|
||||
re.MULTILINE
|
||||
)
|
||||
return begin_pattern, end_pattern
|
||||
|
||||
|
||||
# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
|
||||
# exclusion_pattern):
|
||||
# start_page = None
|
||||
@ -228,14 +179,14 @@ def get_patterns_for_notice_twice():
|
||||
# for i, page in enumerate(pdf_document.pages):
|
||||
# text = page.extract_text() or ""
|
||||
# cleaned_text = clean_page_content(text, common_header)
|
||||
# if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||
# if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||
# continue
|
||||
# if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||
# if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||
# start_page = i
|
||||
# if start_page is not None and mid_page is None and re.search(
|
||||
# if start_page is not None and mid_page is None and regex.search(
|
||||
# r'^\s*[((]?\s*[一1]\s*[))]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text):
|
||||
# mid_page = i
|
||||
# if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
|
||||
# if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page:
|
||||
# end_page = i
|
||||
# break
|
||||
# return start_page, mid_page, end_page
|
||||
@ -248,10 +199,10 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
|
||||
参数:
|
||||
pdf_document (PDFDocument): 要处理的PDF文档对象。
|
||||
begin_pattern (str 或 re.Pattern): 用于识别起始的正则表达式模式。
|
||||
begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
|
||||
begin_page (int): 开始搜索的页码。
|
||||
common_header (str): 每页需要清理的公共头部文本。
|
||||
exclusion_pattern (str 或 re.Pattern): 用于排除某些页的模式。
|
||||
exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。
|
||||
|
||||
返回:
|
||||
tuple: (start_page, mid_page, end_page) 如果成功,否则 (None, None, None)
|
||||
@ -264,8 +215,8 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
如果未提供 local_end_pattern,则根据匹配的章节类型动态生成 end_pattern。
|
||||
|
||||
参数:
|
||||
local_begin_pattern (str 或 re.Pattern): 用于识别起始的正则表达式模式。
|
||||
local_end_pattern (str 或 re.Pattern, 可选): 用于识别结束的正则表达式模式。
|
||||
local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
|
||||
local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。
|
||||
|
||||
返回:
|
||||
tuple: (start_page, mid_page, end_page)
|
||||
@ -281,12 +232,12 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
|
||||
# 如果已经找到中间页,且当前页匹配排除模式,则跳过
|
||||
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
||||
continue
|
||||
|
||||
# 识别起始页
|
||||
if start_page is None:
|
||||
match = re.search(local_begin_pattern, cleaned_text)
|
||||
match = regex.search(local_begin_pattern, cleaned_text)
|
||||
if match and i > begin_page:
|
||||
start_page = i
|
||||
matched_text = match.group(0) # 获取整个匹配的文本
|
||||
@ -302,9 +253,9 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
|
||||
if chapter_type:
|
||||
# 根据 chapter_type 动态生成 end_pattern
|
||||
end_pattern_dynamic = re.compile(
|
||||
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+',
|
||||
re.MULTILINE
|
||||
end_pattern_dynamic = regex.compile(
|
||||
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
# 根据 chapter_type 动态生成 additional_mid_pattern
|
||||
@ -317,54 +268,55 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
|
||||
# 定义基础的 mid_pattern
|
||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
|
||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
|
||||
r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
|
||||
|
||||
# 合并基础模式和额外模式
|
||||
if additional_mid_pattern:
|
||||
combined_mid_pattern = re.compile(
|
||||
combined_mid_pattern = regex.compile(
|
||||
rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
else:
|
||||
combined_mid_pattern = re.compile(
|
||||
combined_mid_pattern = regex.compile(
|
||||
rf'{base_mid_pattern}',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
else:
|
||||
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
|
||||
end_pattern_dynamic = re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
|
||||
re.MULTILINE
|
||||
end_pattern_dynamic = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
# 定义基础的 mid_pattern
|
||||
base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \
|
||||
r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
|
||||
combined_mid_pattern = re.compile(
|
||||
combined_mid_pattern = regex.compile(
|
||||
rf'{base_mid_pattern}',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
else:
|
||||
# 如果提供了固定的 end_pattern,则使用默认的 mid_pattern
|
||||
base_mid_pattern = r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$' # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
|
||||
combined_mid_pattern = re.compile(
|
||||
combined_mid_pattern = regex.compile(
|
||||
rf'{base_mid_pattern}',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
continue
|
||||
|
||||
# 识别中间页
|
||||
if start_page is not None and mid_page is None and combined_mid_pattern:
|
||||
if (start_page + 1 == i) and re.search(local_begin_pattern, cleaned_text):
|
||||
if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
|
||||
continue
|
||||
if re.search(combined_mid_pattern, cleaned_text):
|
||||
if regex.search(combined_mid_pattern, cleaned_text):
|
||||
mid_page = i
|
||||
|
||||
# 识别结束页
|
||||
if start_page is not None and mid_page is not None:
|
||||
# 使用提供的 end_pattern 或动态生成的 end_pattern
|
||||
current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
|
||||
if re.search(current_end_pattern, cleaned_text):
|
||||
if regex.search(current_end_pattern, cleaned_text):
|
||||
if i > mid_page:
|
||||
end_page = i
|
||||
break
|
||||
@ -376,21 +328,21 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
|
||||
# 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取
|
||||
if not (start_page and mid_page and end_page):
|
||||
print(f"第二次尝试 tobidders_notice!{pdf_path}")
|
||||
print(f"第二次尝试 tobidders_notice!")
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
|
||||
if start_page and end_page and mid_page:
|
||||
return start_page, mid_page, end_page
|
||||
else:
|
||||
# 定义新的 begin_pattern 和 end_pattern
|
||||
new_begin_pattern = re.compile(
|
||||
new_begin_pattern = regex.compile(
|
||||
r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
|
||||
r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
new_end_pattern = re.compile(
|
||||
new_end_pattern = regex.compile(
|
||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
print("第三次尝试 tobidders_notice! ")
|
||||
# 第二次提取尝试,使用新的模式
|
||||
@ -401,19 +353,19 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
|
||||
|
||||
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
|
||||
output_suffix = "tobidders_notice"
|
||||
begin_pattern = re.compile(
|
||||
begin_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
end_pattern = re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', re.MULTILINE # 捕获中文部分
|
||||
end_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE # 捕获中文部分
|
||||
)
|
||||
exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"] # 在这里添加需要排除的关键词
|
||||
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
|
||||
exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成')
|
||||
|
||||
# 提取第一部分
|
||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
||||
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix)
|
||||
if start_page1 is None or end_page1 is None:
|
||||
return "", "", ""
|
||||
|
||||
@ -438,36 +390,36 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
||||
exclusion_pattern, output_suffix)
|
||||
|
||||
if end_page2 is None:
|
||||
return start_page1, end_page1, end_page1
|
||||
return start_page1, end_page1, ""
|
||||
|
||||
return start_page1, end_page1, end_page2
|
||||
|
||||
|
||||
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||
begin_pattern = re.compile(
|
||||
begin_pattern = regex.compile(
|
||||
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
# 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
|
||||
priority_pattern = re.compile(
|
||||
priority_pattern = regex.compile(
|
||||
r'^(资格性检查|资格审查|符合性审查)',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
# 结束匹配模式 - 附录、附件、附表等
|
||||
end_pattern_attachment = re.compile(
|
||||
end_pattern_attachment = regex.compile(
|
||||
r'^(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$',
|
||||
re.MULTILINE
|
||||
regex.MULTILINE
|
||||
)
|
||||
# 结束匹配模式 - 章节标题
|
||||
end_pattern_chapter = re.compile(
|
||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||
re.MULTILINE
|
||||
end_pattern_chapter = regex.compile(
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
print("第二次尝试:匹配附件")
|
||||
print("第二次尝试 qualification:匹配附件")
|
||||
start_page = None
|
||||
end_page = None
|
||||
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查"]
|
||||
@ -519,7 +471,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||
|
||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page):
|
||||
try:
|
||||
exclusion_pattern = re.compile(
|
||||
exclusion_pattern = regex.compile(
|
||||
r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
patterns = None
|
||||
@ -531,7 +483,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
|
||||
elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3":
|
||||
patterns = [get_patterns_for_evaluation_method()]
|
||||
elif output_suffix == "notice":
|
||||
patterns = [get_patterns_for_notice(), get_patterns_for_notice_twice()]
|
||||
patterns = [get_patterns_for_notice()]
|
||||
elif output_suffix == "qualification1":
|
||||
start_page, end_page = extract_pages_qualification(pdf_document, begin_page, common_header)
|
||||
if patterns:
|
||||
@ -545,11 +497,17 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
|
||||
if output_suffix == "qualification1":
|
||||
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||
print("第三次尝试资格审查:尝试提取评分办法章节...")
|
||||
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
|
||||
if len(temp) > 0:
|
||||
return temp[0]
|
||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||
evaluation_method_file = os.path.join(output_folder, f"{base_file_name}_evaluation_method.pdf")
|
||||
if os.path.isfile(evaluation_method_file):
|
||||
print(f"找到评分办法章节文件: {evaluation_method_file},直接返回。")
|
||||
return evaluation_method_file
|
||||
else:
|
||||
return ""
|
||||
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
|
||||
if len(temp) > 0:
|
||||
return temp[0]
|
||||
else:
|
||||
return ""
|
||||
else:
|
||||
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||
return ""
|
||||
@ -593,20 +551,27 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
|
||||
print(f"Error in save_extracted_pages: {e}")
|
||||
return "" # 返回空字符串
|
||||
|
||||
|
||||
def get_start_and_common_header(input_path):
|
||||
common_header = extract_common_header(input_path)
|
||||
last_begin_index = 0
|
||||
begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
re.MULTILINE)
|
||||
begin_pattern = regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
# 新增目录匹配模式
|
||||
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
||||
|
||||
pdf_document = PdfReader(input_path)
|
||||
for i, page in enumerate(pdf_document.pages):
|
||||
if i > 10:
|
||||
return common_header, 0 # 如果页码大于25,直接返回last_begin_index=0
|
||||
return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
if begin_pattern.search(cleaned_text) and not re.search(r'目\s*录', cleaned_text):
|
||||
# 检查是否存在"目录"
|
||||
if catalog_pattern.search(cleaned_text):
|
||||
continue # 如果存在目录,跳过当前页面
|
||||
if begin_pattern.search(cleaned_text):
|
||||
last_begin_index = i # 更新第一个匹配的索引,页码从0开始
|
||||
return common_header, last_begin_index
|
||||
return common_header, last_begin_index
|
||||
@ -658,41 +623,43 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
||||
|
||||
# 根据选择设置对应的模式和结束模式
|
||||
if selection == 1:
|
||||
begin_pattern = re.compile(
|
||||
r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE)
|
||||
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||
begin_pattern = regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
end_pattern = regex.compile(r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$', regex.MULTILINE)
|
||||
local_output_suffix = "notice"
|
||||
elif selection == 2:
|
||||
begin_pattern = re.compile(
|
||||
begin_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
local_output_suffix = "evaluation_method"
|
||||
elif selection == 3:
|
||||
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE)
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', regex.MULTILINE)
|
||||
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
|
||||
local_output_suffix = "qualification1"
|
||||
elif selection == 4:
|
||||
begin_pattern = re.compile(
|
||||
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)',
|
||||
re.MULTILINE)
|
||||
begin_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
|
||||
regex.MULTILINE)
|
||||
end_pattern = None
|
||||
local_output_suffix = "tobidders_notice"
|
||||
elif selection == 5:
|
||||
begin_pattern = re.compile(
|
||||
begin_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购内容|采购要求|需求).*') # 包头中有一章'采购相关说明'
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||
local_output_suffix = "procurement"
|
||||
|
||||
# begin_pattern = re.compile(
|
||||
# begin_pattern = regex.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
|
||||
# r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE)
|
||||
# end_pattern = re.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
# r'^[一二三四五六七八九十百千]+、\s*采购清单', regex.MULTILINE)
|
||||
# end_pattern = regex.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
|
||||
|
||||
elif selection == 6:
|
||||
begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
|
||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||
begin_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*')
|
||||
end_pattern = regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
|
||||
local_output_suffix = "format"
|
||||
else:
|
||||
print("无效的选择:请选择1-5")
|
||||
@ -807,17 +774,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="
|
||||
# ztbfile.pdf少资格评审 包头少符合性评审
|
||||
if __name__ == "__main__":
|
||||
logger = get_global_logger("123")
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||
input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-陕西-陕西省某单位2024年执勤化妆服采购项目.pdf"
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
||||
# input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||
# input_path=r"C:\Users\Administrator\Desktop\fsdownload\42bd5604-fb85-43ff-821f-a1ea78fec115\ztbfile.pdf"
|
||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\招标文件-采购类\all"
|
||||
files = truncate_pdf_multiple(input_path, output_folder,logger)
|
||||
# files = truncate_pdf_multiple(input_path, output_folder,logger)
|
||||
# selections = [3,5]
|
||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||
print(files)
|
||||
# selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
# print(generated_files)
|
||||
# print(files)
|
||||
selection = 3 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
print(generated_files)
|
||||
|
@ -393,7 +393,6 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
|
||||
1. 既有资格性审查内容,也有符合性审查内容时
|
||||
2. 它们的内容在同一张表格中
|
||||
3. 表中没有两个合并单元格内容为'资格性审查'和'符合性审查'类似的表述,只有'资格性审查和符合性审查'的合并表述。
|
||||
4. 表头
|
||||
|
||||
以下为示例表格1,
|
||||
| 序号 | 资格性检查和符合性检查内容 |
|
||||
@ -462,7 +461,7 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path):
|
||||
输出要求:
|
||||
1.请以json格式给出,外层为'资格性和符合性审查',最内层的值需要用列表包裹。
|
||||
2.一层嵌套内的键需要总结分类为某类评审因素或是直接使用原文中的评审因素字段、标题。
|
||||
3.你的回答要与原文完全一致。
|
||||
3.你的回答要与原文完全一致,若审查标准在表格中,那么单元格内的内容基本都要涵盖,不要遗漏,作为键值中的字符串列表项。
|
||||
4.最大细分为二层嵌套即可。
|
||||
输出示例:
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user