This commit is contained in:
zy123 2024-10-09 13:50:28 +08:00
parent e017acc673
commit a2e3e9c0ab
11 changed files with 501 additions and 161 deletions

View File

@ -1,95 +1,261 @@
# -*- encoding:utf-8 -*- import json
import re import re
def find_keys_by_value(target_value, json_data):
    """Return the keys of ``json_data`` whose value matches ``target_value``.

    Exact matches take priority. Only when no value equals ``target_value``
    does the function fall back to string values that start with it.

    Args:
        target_value: The value (normally a heading title string) to look for.
        json_data: Mapping whose values are compared against ``target_value``.

    Returns:
        A list of matching keys in the mapping's iteration order; empty when
        neither an exact nor a prefix match exists.
    """
    exact_matches = [key for key, value in json_data.items() if value == target_value]
    if exact_matches:
        return exact_matches
    # Fallback: prefix match, restricted to string values so that non-string
    # entries are skipped instead of raising on ``startswith``.
    return [
        key
        for key, value in json_data.items()
        if isinstance(value, str) and value.startswith(target_value)
    ]
def find_keys_with_prefix(prefix, json_data): def compare_headings(current, new):
# 只提取直接子项,比如 prefix 为 '7.2' 时只提取 '7.2.1', '7.2.2' 但不会提取 '7.3' current_nums = [int(num) for num in current.split('.') if num.isdigit()]
return [k for k in json_data.keys() if k.startswith(prefix) and k[len(prefix):].lstrip('.').isdigit()] new_nums = [int(num) for num in new.split('.') if num.isdigit()]
def extract_json(data, target_values): for c, n in zip(current_nums, new_nums):
results = {} if n > c:
return True
elif n < c:
return False
# 遍历所有目标值 return len(new_nums) > len(current_nums)
for target_value in target_values:
# 找到所有与目标值匹配的键
matched_keys = find_keys_by_value(target_value, data)
for key in matched_keys:
# 查找所有以该键为前缀的子键,限制只提取直接子项
key_and_subheadings = find_keys_with_prefix(key, data)
for subkey in key_and_subheadings: def should_add_newline(content, keywords, max_length=20):
# 如果子键有多级结构(比如 '7.2.1'),并且是直接子项 content_str = ''.join(content).strip()
if "." in subkey: return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
parent_key = subkey.rsplit('.', 1)[0]
top_level_key = parent_key.split('.')[0] + '.'
# 确保顶级键不会重复添加
if top_level_key not in results:
results[top_level_key] = data[top_level_key]
# 添加或更新父级键 def handle_content_append(current_content, line_content, append_newline, keywords):
if parent_key not in results: if append_newline:
if parent_key in data: if should_add_newline(current_content, keywords):
results[parent_key] = data[parent_key] current_content.append('\n')
append_newline = False
current_content.append(line_content)
return append_newline
# 添加当前子键和它的值
if subkey in data:
results[subkey] = data[subkey]
return results # def parse_text_by_heading(text):
# keywords = ['包含', '以下']
# data = {}
# current_key = None
# current_content = []
# append_newline = False
# skip_subheadings = False
#
# lines = text.split('\n')
# for i, line in enumerate(lines):
# line_stripped = line.strip().replace('', '.')
# # print(line_stripped)
# # 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号
# match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
# if not match:
# #匹配一级标题'1.'
# match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
#
# if match:
# new_key, line_content = match.groups() #new_key:1.1 line_content:本次竞争性磋商文件仅适用于本次竞争性磋商公告中所叙述的广水农商行门禁控制主机及基础验证设备采购项
# line_content = line_content.lstrip('..、,') #删除左边的若干个..、,
#
# # 检查是否包含关键词,决定是否跳过子标题
# if any(keyword in line_content for keyword in keywords):
# skip_subheadings = True
# else:
# skip_subheadings = False
#
# # 检查是否应该更新当前键和内容
# if current_key is None or (compare_headings(current_key, new_key) and (
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# else:
# append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
# else:
# if not skip_subheadings:
# # 匹配中文数字标题,包括带括号和不带括号的情况
# pattern_title = re.compile(
# r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
# chinese_match = pattern_title.match(line_stripped)
#
# # 匹配字母标题,如 B.磋商说明
# letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
#
# # 匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
# arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
#
# if chinese_match or letter_match or arabic_match:
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
# if chinese_match:
# pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
# pattern_value = re.compile(r'[^\s、)]+')
# chinese_key_match = pattern_key.search(line_stripped)
# chinese_value_match = pattern_value.search(
# line_stripped[chinese_key_match.end():]) if chinese_key_match else None
# if chinese_key_match and chinese_value_match:
# current_key = chinese_key_match.group()
# current_content = [chinese_value_match.group()]
# elif letter_match:
# letter_key, letter_value = letter_match.groups()
# # 将字母转换为对应的中文数字
# letter_to_chinese = {
# 'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
# 'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
# 'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
# }
# current_key = letter_to_chinese.get(letter_key, letter_key)
# current_content = [letter_value]
# elif arabic_match:
# arabic_key, arabic_value = arabic_match.groups()
# current_key = arabic_key.replace('、', '.')
# current_content = [arabic_value]
#
# append_newline = True
# continue
#
# if line_stripped:
# append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
#
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
#
# return data
def extract_json(data, target_values): def parse_text_by_heading(text):
results = {} keywords = ['包含', '以下']
for target_value in target_values: data = {}
matched_keys = find_keys_by_value(target_value, data) current_key = None
for key in matched_keys: current_key_chinese=None
key_and_subheadings = find_keys_with_prefix(key, data) current_content = []
for subkey in key_and_subheadings: append_newline = False
if "." in subkey: skip_subheadings = False
parent_key = subkey.rsplit('.', 1)[0]
top_level_key = parent_key.split('.')[0] + '.'
# 特别处理定标相关的顶级键,确保不会重复添加其他键
if top_level_key not in results:
results[top_level_key] = target_value
# 添加或更新父级键
if parent_key not in results:
if parent_key in data:
results[parent_key] = data[parent_key]
# 添加当前键
results[subkey] = data[subkey]
return results
# 测试数据 lines = text.split('\n')
data = { for i, line in enumerate(lines):
"7.": "合同授予", line_stripped = line.strip().replace('', '.')
"7.1": "定标方式", match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
"7.1.1": "招标人依据评标委员会推荐的中标候选人确定中标人,国有资金占控股或者主导地位的依法必须进行招标的项目,确定排名第一的中标候选人为中标人。评标委员会推荐中标候选人的数量见投标人须知前附表。", if not match:
"7.1.2": "排名第一的中标候选人放弃中标、因不可抗力不能签订合同、不按照招标文件要求提交履约担保,或者被查实存在影响中标结果的违法行为等情形,不符合中标条件的,招标人可以按照评标委员会提出的中标候选人名单排序依次确定其他中标候选人为中标人,也可以重新招标。", match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
"7.1.3": "招标人将自收到评标报告之日起3日内在投标人须知前附表规定的媒介公示中标候选人。公示期不少于3日。投标人或者其他利害关系人对评标结果有异议的应当在评标结果公示期间提出。招标人自收到异议之日起3日内作出答复作出答复前暂停招标投标活动。投标人的异议与答复应当通过“省电子交易平台”在“异议与答复”菜单以书面形式进行其他利害关系人的异议也应以书面形式向招标人提出。中标候选人公示媒体见投标人须知前附表。",
"7.1.4": "中标候选人经公示无异议、招标人向交通运输主管部门履行评标结果备案手续后,发出中标通知书。",
"7.2": "中标通知",
"7.2.1": "招标人将在本章第3.3款规定的投标有效期内招标人通过“电子交易平台”以书面形式向中标人发出中标通知书同时将中标结果通知未中标的投标人。中标通知书发出前招标人将在投标人须知前附表第7.1.3款规定的媒介发布中标结果公告。",
"7.2.2": "中标候选人的经营、财务状况发生较大变化或者存在违法行为,招标人认为可能影响其履约能力的,将在发出中标通知书前报请行政监督部门批准后,召集原评标委员会按照招标文件规定的标准和方法审查确认。",
"7.2.3": "招标人即使已经向中标人发出了中标通知书若发生下列情况之一者中标无效。1经查实投标人以他人名义投标或以其他方式弄虚作假骗取中标的2经查实投标人有串通投标的或向招标人或评标委员会成员以行贿手段谋取中标的3法律、法规规定的其他情形。",
"7.3": "履约担保",
"7.3.1": "在签订合同前中标人应按投标人须知前附表规定的金额、担保形式和招标文件第四章规定的格式向招标人提交履约担保或事先经过招标人书面认可的履约担保格式向招标人提交履约担保。履约担保金额不得超过中标合同价的10%,详见投标人须知前附表。联合体中标的,其履约担保由牵头人提交,并应符合投标人须知前附表规定的金额、担保形式和招标文件规定的履约担保格式要求。联合体中标的,其履约担保由牵头人递交,并应符合上述要求。",
"7.3.2": "中标人不能按本章第7.3.1项要求提交履约担保的,视为放弃中标,其投标保证金不予退还,给招标人造成的损失超过投标保证金数额的,中标人还应当对超过部分予以赔偿。",
}
# 提取匹配的目标 if match:
target_values = ["中标"] new_key, line_content = match.groups()
line_content = line_content.lstrip('..、,')
# 运行提取函数 if any(keyword in line_content for keyword in keywords):
extracted_json = extract_json(data, target_values) skip_subheadings = True
else:
skip_subheadings = False
if current_key_chinese is not None:
content_string = ''.join(current_content).strip()
data[current_key_chinese] = content_string.replace(' ', '')
current_key_chinese=None
if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')):
if current_key is not None:
content_string = ''.join(current_content).strip()
if current_key in data:
data[current_key] += content_string.replace(' ', '')
else:
data[current_key] = content_string.replace(' ', '')
current_key = new_key
current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2
else:
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
else:
if not skip_subheadings:
pattern_title = re.compile(
r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
chinese_match = pattern_title.match(line_stripped)
# 输出结果 letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
import json
print(json.dumps(extracted_json, ensure_ascii=False, indent=4)) arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
if chinese_match or letter_match or arabic_match:
# if current_key is not None or current_key_chinese is not None:
# key = current_key if current_key is not None else current_key_chinese
# content_string = ''.join(current_content).strip() #TODO: key chinese key
# if key in data:
# data[key] +=content_string.replace(' ', '')
# else:
# data[key] = content_string.replace(' ', '')
# if current_key_chinese is not None:
# current_key_chinese = None
if chinese_match:
pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
pattern_value = re.compile(r'[^\s、)]+')
chinese_key_match = pattern_key.search(line_stripped)
chinese_value_match = pattern_value.search(
line_stripped[chinese_key_match.end():]) if chinese_key_match else None
if chinese_key_match and chinese_value_match:
# current_key = chinese_key_match.group()
current_key_chinese=chinese_key_match.group()
current_content = [chinese_value_match.group()]
elif letter_match:
letter_key, letter_value = letter_match.groups()
letter_to_chinese = {
'A': '', 'B': '', 'C': '', 'D': '', 'E': '',
'F': '', 'G': '', 'H': '', 'I': '', 'J': '',
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
}
# current_key = letter_to_chinese.get(letter_key, letter_key)
current_key_chinese =letter_to_chinese.get(letter_key, letter_key)
current_content = [letter_value]
elif arabic_match:
arabic_key, arabic_value = arabic_match.groups()
current_key = arabic_key.replace('', '.')
current_content = [arabic_value]
append_newline = True
continue
if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = content_string.replace(' ', '')
# if current_key_chinese is not None:
# content_string = ''.join(current_content).strip()
# if current_key_chinese in data:
# data[current_key] += content_string.replace(' ', '')
# else:
# data[current_key_chinese] = content_string.replace(' ', '')
return data
# 测试文本
text = """
在这种
哈哈哈
红红火火恍恍惚惚
竞争性磋商响应文件的递交
19.竞争性磋商响应文件的加密
19.1 供应商应当按照要求制作电子磋商响应文件并在投标时上传加密的电子磋
商响应文件未加密的电子磋商响应文件采购人电子交易平台将拒收
并提示
22.3 从提交响应文件截止时间至磋商有效期期满这段时间供应商不得修改或
撤销其响应文件供应商所提交的响应文件在磋商结束后无论成交与否都不退
竞争性磋商开标时间和地点
供应商在规定的磋商截止时间开标时间电子交易平台上公开进行
开标所有供应商均应当准时在线参加开标
采购人采购代理机构通过互联网在磋商须知前附表规定的地点组织开标
并在投标截止时间 30 分钟前使用 CA 数字证书登录电子交易平台进入
开标大厅选择相应标段作在线开标的准备工作
供应商应当在能够保证设施设备可靠互联网畅通的任意地点通过互联网
在线参加开标在投标截止时间前使用加密其投标文件的 CA 数字证书登录
子交易平台进入开标大厅选择所投标段进行签到并实时在线关注招标
人的操作情况
:开标结束后请各投标人不要长时间离开电脑随时关注业务系统
待专家资格性审查和符合性审查完成后要求的最后报价若在规定时间内投标人
没有按要求进行多轮报价则作自动弃权处理
磋商程序及步骤
23.竞争性磋商小组
"""
res = parse_text_by_heading(text)
print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@ -194,7 +194,8 @@ if __name__ == "__main__":
# for question, response in results: # for question, response in results:
# print(f"Question: {question}") # print(f"Question: {question}")
# print(f"Response: {response}") # print(f"Response: {response}")
ques=[]
ques=["该招标文件的工程名称项目名称招标编号是招标人是招标代理机构是请按json格式给我提供信息键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'","该招标文件的工程概况或项目概况招标范围是请按json格式给我提供信息键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'"]
results = multi_threading(ques, "招标解析5word") results = multi_threading(ques, "招标解析5word")
if not results: if not results:
print("errror!") print("errror!")

View File

@ -1,7 +1,6 @@
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库 import re # 导入正则表达式库
import os # 用于文件和文件夹操作 import os # 用于文件和文件夹操作
def clean_page_content(text, common_header): def clean_page_content(text, common_header):
# 首先删除抬头公共部分 # 首先删除抬头公共部分
if common_header: # 确保有公共抬头才进行替换 if common_header: # 确保有公共抬头才进行替换
@ -16,6 +15,8 @@ def clean_page_content(text, common_header):
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
return text return text
#PYPDF2库
def extract_common_header(pdf_path): def extract_common_header(pdf_path):
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
headers = [] headers = []
@ -51,7 +52,6 @@ def extract_common_header(pdf_path):
return '\n'.join(common_headers) return '\n'.join(common_headers)
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page): def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
"""从原始PDF截取指定范围的页面并保存到新的PDF文件中""" """从原始PDF截取指定范围的页面并保存到新的PDF文件中"""
# 获取文件基本名称 # 获取文件基本名称

View File

@ -266,7 +266,7 @@ def extract_from_notice(clause_path, type):
if __name__ == "__main__": if __name__ == "__main__":
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json' # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json'
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json" file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause1.json"
try: try:
res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景 res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景
res2 = json.dumps(res, ensure_ascii=False, indent=4) res2 = json.dumps(res, ensure_ascii=False, indent=4)

View File

@ -114,7 +114,7 @@ def parse_text_by_heading(text):
append_newline = False append_newline = False
lines = text.split('\n') lines = text.split('\n')
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n' for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
line_stripped = line.strip() line_stripped = line.strip().replace('', '.')
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped) match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match: if not match:
@ -122,7 +122,7 @@ def parse_text_by_heading(text):
if match: if match:
new_key, line_content = match.groups() new_key, line_content = match.groups()
line_content = line_content.lstrip('.') line_content = line_content.lstrip('.')
# 检查是否应该更新当前键和内容 # 检查是否应该更新当前键和内容
if current_key is None or (compare_headings(current_key, new_key) and ( if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')): #current_content 内容占两行则是: ['本招标项目已具备招标条件,现对本项目施工监理进行招标,项目概况见本', '章附件一。'] len(current_content) == 0 or current_content[-1][-1] != '')): #current_content 内容占两行则是: ['本招标项目已具备招标条件,现对本项目施工监理进行招标,项目概况见本', '章附件一。']

View File

@ -19,7 +19,7 @@ def query():
client = docmind_api20220711Client(config) client = docmind_api20220711Client(config)
request = docmind_api20220711_models.QueryDocParserStatusRequest( request = docmind_api20220711_models.QueryDocParserStatusRequest(
# id : 任务提交接口返回的id # id : 任务提交接口返回的id
id='docmind-20240929-befe0a1a529b4640b5fbf51ff359a272' id='docmind-20241008-24363df30d274863894f037dbb7244e8'
) )
try: try:
# 复制代码运行请自行打印 API 的返回值 # 复制代码运行请自行打印 API 的返回值

View File

@ -19,7 +19,7 @@ def query():
client = docmind_api20220711Client(config) client = docmind_api20220711Client(config)
request = docmind_api20220711_models.GetDocParserResultRequest( request = docmind_api20220711_models.GetDocParserResultRequest(
# id : 任务提交接口返回的id # id : 任务提交接口返回的id
id='docmind-20240929-befe0a1a529b4640b5fbf51ff359a272', id='docmind-20241008-24363df30d274863894f037dbb7244e8',
layout_step_size=10, layout_step_size=10,
layout_num=0 layout_num=0
) )

View File

@ -71,7 +71,81 @@ def extract_text_by_page(file_path):
else: else:
print(f"Page {page_num + 1} is empty or text could not be extracted.") print(f"Page {page_num + 1} is empty or text could not be extracted.")
return result return result
# import fitz # PyMuPDF
# import re
#
#
# def clean_page_content(text, common_header):
# if common_header:
# for header_line in common_header.split('\n'):
# if header_line.strip():
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
#
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
# text = re.sub(r'\s+\d+\s*$', '', text)
# text = re.sub(r'\s*\/\s*\d+\s*', '', text)
#
# return text
#
#
# def extract_common_header(pdf_path):
# doc = fitz.open(pdf_path)
# headers = []
# total_pages = len(doc)
#
# if total_pages == 2:
# pages_to_read = 2
# start_page = 0
# else:
# pages_to_read = 3
# middle_page = total_pages // 2
# start_page = max(0, middle_page - 1)
#
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
# page = doc[i]
# text = page.get_text()
# if text:
# first_lines = text.strip().split('\n')[:3]
# headers.append(first_lines)
#
# doc.close()
#
# if len(headers) < 2:
# return ""
#
# common_headers = []
# for lines in zip(*headers):
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
# if common_line:
# common_headers.append(' '.join(common_line))
#
# return '\n'.join(common_headers)
#
#
# def extract_text_by_page(file_path):
# common_header = extract_common_header(file_path)
# result = ""
# doc = fitz.open(file_path)
# num_pages = len(doc)
#
# for page_num in range(num_pages):
# page = doc[page_num]
# text = page.get_text()
# if text:
# cleaned_text = clean_page_content(text, common_header)
# print(cleaned_text)
# result += cleaned_text
# else:
# print(f"Page {page_num + 1} is empty or text could not be extracted.")
#
# doc.close()
# return result
if __name__ == '__main__': if __name__ == '__main__':
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\ztbfile_tobidders_notice_part2.pdf" # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
res=extract_text_by_page(file_path) res=extract_text_by_page(file_path)
# print(res) # print(res)磋商文件_tobidders_notice_part2.pdf

View File

@ -217,9 +217,9 @@ def process_nested_data(data):
# 读取JSON数据提取内容转换结构并打印结果 # 读取JSON数据提取内容转换结构并打印结果
def extract_from_notice(clause_path, type): def extract_from_notice(clause_path, type):
if type == 1: if type == 1:
target_values = ["投标文件","响应文件"] target_values = ["投标文件","响应文件","响应性文件"]
elif type == 2: elif type == 2:
target_values = ["开标", "评标", "定标","磋商程序","中标"] target_values = ["开标", "评标", "定标","磋商程序","中标","程序"]
elif type == 3: elif type == 3:
target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"] target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
elif type == 4: elif type == 4:
@ -237,9 +237,9 @@ def extract_from_notice(clause_path, type):
#TODO: 再审视一下zbtest20的处理是否合理 #TODO: 再审视一下zbtest20的处理是否合理
if __name__ == "__main__": if __name__ == "__main__":
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json' file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause26.json'
try: try:
res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景 res = extract_from_notice(file_path, 1) # 可以改变此处的 type 参数测试不同的场景
res2=json.dumps(res,ensure_ascii=False,indent=4) res2=json.dumps(res,ensure_ascii=False,indent=4)
print(res2) print(res2)
except ValueError as e: except ValueError as e:

View File

@ -2,14 +2,16 @@ import json
import docx import docx
import re import re
import os import os
import fitz
from PyPDF2 import PdfReader from PyPDF2 import PdfReader
from flask_app.main.截取pdf import clean_page_content,extract_common_header from flask_app.货物标.货物标截取pdf import clean_page_content,extract_common_header
def extract_text_from_docx(file_path):
    """Read a Word document and return its paragraph text as one string.

    Args:
        file_path: Path passed straight to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, in document order,
        joined with a newline character.
    """
    doc = docx.Document(file_path)
    return '\n'.join([para.text for para in doc.paragraphs])
#PYPDF2版本
def extract_text_from_pdf(file_path, start_word, end_pattern): def extract_text_from_pdf(file_path, start_word, end_pattern):
# 从PDF文件中提取文本 # 从PDF文件中提取文本
common_header = extract_common_header(file_path) common_header = extract_common_header(file_path)
@ -44,6 +46,54 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
# print(full_text) # print(full_text)
return full_text return full_text
#fitz库版本
# def extract_text_from_pdf(file_path, start_word, end_pattern):
# # 从PDF文件中提取文本
# common_header = extract_common_header(file_path)
# doc = fitz.open(file_path)
# all_pages_text = []
# start_index = None
#
# # 处理所有页面
# for i in range(len(doc)):
# page = doc[i]
# page_text = page.get_text()
# cleaned_text = clean_page_content(page_text, common_header)
# print(cleaned_text)
# print("yes")
# # 在第一页查找开始位置
# if i == 0 and start_index is None:
# start_match = re.search(start_word, cleaned_text, re.MULTILINE)
# if start_match:
# start_index = start_match.start()
# cleaned_text = cleaned_text[start_index:]
#
# # 在最后一页查找结束位置
# if i == len(doc) - 1:
# for pattern in end_pattern:
# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
# if matches:
# end_index = matches[-1].start()
# cleaned_text = cleaned_text[:end_index]
# break
#
# all_pages_text.append(cleaned_text)
#
# # 合并所有页面的文本
# full_text = "\n".join(all_pages_text)
# # 关闭文档
# doc.close()
#
# return full_text
def chinese_to_arabic(chinese_num):
    """Convert a Chinese numeral between 一 (1) and 十五 (15) to an int.

    Args:
        chinese_num: The Chinese numeral string to convert.

    Returns:
        The integer value, or 0 when ``chinese_num`` is not in the table.
    """
    # Every key must be distinct: with blank single-character keys the entries
    # collapse into one '' key and the numerals 一…十 all fall through to 0.
    cn_num = {
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
        '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
    }
    return cn_num.get(chinese_num, 0)
def compare_headings(current, new): def compare_headings(current, new):
# 使用过滤来确保只处理非空且为数字的部分 # 使用过滤来确保只处理非空且为数字的部分
current_nums = [int(num) for num in current.split('.') if num.isdigit()] current_nums = [int(num) for num in current.split('.') if num.isdigit()]
@ -86,78 +136,87 @@ def parse_text_by_heading(text):
current_key = None current_key = None
current_content = [] current_content = []
append_newline = False append_newline = False
skip_subheadings = False
lines = text.split('\n') #['一、说明', '1.适用范围', '招标文件仅适用于第一章“投标邀请书”中所述项目的货物、工程及服务的采购。'] lines = text.split('\n')
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n' for i, line in enumerate(lines):
line_stripped = line.strip() line_stripped = line.strip().replace('', '.')
# 匹配中文数字标题,包括带括号和不带括号的情况 # print(line_stripped)
pattern_title = re.compile(r'^\s*(?:[(]\s*[一二三四五六七八九十]\s*[)]\s*|[一二三四五六七八九十]\s*、\s*)') # 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号
# 用于提取中文数字部分
pattern_key = re.compile(r'[一二三四五六七八九十]')
# 用于提取标题后的内容部分
pattern_value = re.compile(r'[^\s、)]+')
chinese_match = pattern_title.match(line_stripped) # 匹配是否为标题
if chinese_match:
# 匹配到标题行,先提取中文数字部分
chinese_key_match = pattern_key.search(line_stripped)
# 提取中文数字后的内容部分
chinese_value_match = pattern_value.search(
line_stripped[chinese_key_match.end():]) if chinese_key_match else None
if chinese_key_match and chinese_value_match:
chinese_key = chinese_key_match.group() # 匹配到的中文数字,如 "一"
chinese_value = chinese_value_match.group() # 匹配到的标题内容,如 "招标文件"
# 将提取的标题和内容存储到字典中
if chinese_key:
data[chinese_key] = chinese_value
current_key = None # 重置当前key
current_content = [] # 清空当前内容
continue
# 新增:匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
if arabic_match:
arabic_key, arabic_value = arabic_match.groups()
arabic_key = arabic_key.replace('', '.') # 将顿号替换为点号
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = content_string.replace(' ', '')
current_key = arabic_key
current_content = [arabic_value]
append_newline = True
continue
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped) match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
if not match: if not match:
#匹配一级标题'1.'
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped) match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
if match: if match:
new_key, line_content = match.groups() new_key, line_content = match.groups() #new_key:1.1 line_content:本次竞争性磋商文件仅适用于本次竞争性磋商公告中所叙述的广水农商行门禁控制主机及基础验证设备采购项
line_content = line_content.lstrip('.') line_content = line_content.lstrip('..、,') #删除左边的若干个..、,
# 检查是否包含关键词,决定是否跳过子标题
if any(keyword in line_content for keyword in keywords):
skip_subheadings = True
else:
skip_subheadings = False
# 检查是否应该更新当前键和内容 # 检查是否应该更新当前键和内容
if current_key is None or (compare_headings(current_key, new_key) and ( if current_key is None or (compare_headings(current_key, new_key) and (
len(current_content) == 0 or current_content[-1][-1] != '')): len(current_content) == 0 or current_content[-1][-1] != '')):
if current_key is not None: if current_key is not None:
# 将之前的内容保存到data中保留第一个换行符后续的换行符转为空字符
# print(current_content)
content_string = ''.join(current_content).strip() content_string = ''.join(current_content).strip()
# print(content_string)
data[current_key] = content_string.replace(' ', '') data[current_key] = content_string.replace(' ', '')
current_key = new_key current_key = new_key
current_content = [line_content] current_content = [line_content]
# 当标题级别为一级1.)和两级(如 1.1)时标题,才设置 append_newline 为 True
append_newline = len(new_key.rstrip('.').split('.')) <= 2 append_newline = len(new_key.rstrip('.').split('.')) <= 2
else: else:
append_newline = handle_content_append(current_content, line_content, append_newline, keywords) append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
else: else:
if not skip_subheadings:
# 匹配中文数字标题,包括带括号和不带括号的情况
pattern_title = re.compile(
r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
chinese_match = pattern_title.match(line_stripped)
# 匹配字母标题,如 B.磋商说明
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
# 匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
if chinese_match or letter_match or arabic_match:
if current_key is not None:
content_string = ''.join(current_content).strip()
data[current_key] = content_string.replace(' ', '')
if chinese_match:
pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
pattern_value = re.compile(r'[^\s、)]+')
chinese_key_match = pattern_key.search(line_stripped)
chinese_value_match = pattern_value.search(
line_stripped[chinese_key_match.end():]) if chinese_key_match else None
if chinese_key_match and chinese_value_match:
current_key = chinese_key_match.group()
current_content = [chinese_value_match.group()]
elif letter_match:
letter_key, letter_value = letter_match.groups()
# 将字母转换为对应的中文数字
letter_to_chinese = {
'A': '', 'B': '', 'C': '', 'D': '', 'E': '',
'F': '', 'G': '', 'H': '', 'I': '', 'J': '',
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
}
current_key = letter_to_chinese.get(letter_key, letter_key)
current_content = [letter_value]
elif arabic_match:
arabic_key, arabic_value = arabic_match.groups()
current_key = arabic_key.replace('', '.')
current_content = [arabic_value]
append_newline = True
continue
if line_stripped: if line_stripped:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
if current_key is not None: if current_key is not None:
# 保存最后一部分内容
content_string = ''.join(current_content).strip() content_string = ''.join(current_content).strip()
data[current_key] = content_string.replace(' ', '') data[current_key] = content_string.replace(' ', '')
@ -174,7 +233,7 @@ def parse_text_by_heading(text):
# parsed_data = parse_text_by_heading(text) # parsed_data = parse_text_by_heading(text)
# return parsed_data # return parsed_data
def convert_clause_to_json(file_path,output_folder,type=1): def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"):
if not os.path.exists(file_path): if not os.path.exists(file_path):
print(f"The specified file does not exist: {file_path}") print(f"The specified file does not exist: {file_path}")
return "" return ""
@ -191,8 +250,8 @@ def convert_clause_to_json(file_path,output_folder,type=1):
if not os.path.exists(output_folder): if not os.path.exists(output_folder):
os.makedirs(output_folder) os.makedirs(output_folder)
print(f"Created output folder: {output_folder}") print(f"Created output folder: {output_folder}")
file_name = "clause1.json" if type == 1 else "clause2.json" # file_name = "clause1.json" if type == 1 else "clause2.json"
# file_name = f"clause{suffix_counter}.json" file_name = f"clause{suffix_counter}.json"
output_path = os.path.join(output_folder, file_name) output_path = os.path.join(output_folder, file_name)
with open(output_path, 'w', encoding='utf-8') as f: with open(output_path, 'w', encoding='utf-8') as f:
json.dump(result, f, indent=4, ensure_ascii=False) json.dump(result, f, indent=4, ensure_ascii=False)
@ -213,7 +272,7 @@ def process_folder(input_folder, output_folder):
suffix = str(suffix_counter) # 后缀为递增的数字 suffix = str(suffix_counter) # 后缀为递增的数字
try: try:
# 调用 convert_clause_to_json传递文件路径、输出文件夹和递增的后缀 # 调用 convert_clause_to_json传递文件路径、输出文件夹和递增的后缀
output_path = convert_clause_to_json(file_path, output_folder, 1, suffix_counter) output_path = convert_clause_to_json(file_path, output_folder, 1, suffix)
print(f"Processed file: {file_name}, JSON saved to: {output_path}") print(f"Processed file: {file_name}, JSON saved to: {output_path}")
except ValueError as e: except ValueError as e:
print(f"Error processing {file_name}: {e}") print(f"Error processing {file_name}: {e}")
@ -221,17 +280,22 @@ def process_folder(input_folder, output_folder):
# 后缀递增 # 后缀递增
suffix_counter += 1 suffix_counter += 1
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
#TODO:标题'一'应该越来越大,与'1.1'的逻辑分开'
#TODO:911904c3-4d6e-47e0-acd8-b05e582209cb文件夹运行有问题百炼出错了
if __name__ == "__main__": if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' # # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf' # file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\竞争性谈判文件(3)_tobidders_notice_part2.pdf'
# # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2'
# try:
# output_path = convert_clause_to_json(file_path,output_folder)
# print(f"Final JSON result saved to: {output_path}")
# except ValueError as e:
# print("Error:", e)
input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1' output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
try:
output_path = convert_clause_to_json(file_path,output_folder) # 调用 process_folder 来处理整个文件夹中的所有文件
print(f"Final JSON result saved to: {output_path}") process_folder(input_folder, output_folder)
except ValueError as e:
print("Error:", e)
# input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
#
# # 调用 process_folder 来处理整个文件夹中的所有文件
# process_folder(input_folder, output_folder)

View File

@ -1,3 +1,4 @@
import fitz
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库 import re # 导入正则表达式库
import os # 用于文件和文件夹操作 import os # 用于文件和文件夹操作
@ -20,7 +21,7 @@ def clean_page_content(text, common_header):
return text return text
#PYPDF2版本
def extract_common_header(pdf_path): def extract_common_header(pdf_path):
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
headers = [] headers = []
@ -57,6 +58,40 @@ def extract_common_header(pdf_path):
return '\n'.join(common_headers) return '\n'.join(common_headers)
#fitz库版本
# def extract_common_header(pdf_path):
# doc = fitz.open(pdf_path)
# headers = []
# total_pages = len(doc)
#
# if total_pages == 2:
# pages_to_read = 2
# start_page = 0
# else:
# pages_to_read = 3
# middle_page = total_pages // 2
# start_page = max(0, middle_page - 1)
#
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
# page = doc[i]
# text = page.get_text()
# if text:
# first_lines = text.strip().split('\n')[:3]
# headers.append(first_lines)
#
# doc.close()
#
# if len(headers) < 2:
# return ""
#
# common_headers = []
# for lines in zip(*headers):
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
# if common_line:
# common_headers.append(' '.join(common_line))
#
# return '\n'.join(common_headers)
def is_pdf_or_doc(filename):
    """Tell whether ``filename`` has a PDF or Word extension.

    The comparison is case-insensitive and looks only at the end of the name.

    Args:
        filename: File name or path string to inspect.

    Returns:
        True when the name ends with ``.pdf``, ``.doc`` or ``.docx``,
        otherwise False.
    """
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))