10.9
This commit is contained in:
parent
e017acc673
commit
a2e3e9c0ab
@ -1,95 +1,261 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import json
|
||||
import re
|
||||
|
||||
def find_keys_by_value(target_value, json_data):
    """Return the keys of ``json_data`` whose value matches ``target_value``.

    Exact matches take priority: when at least one value equals
    ``target_value`` only those keys are returned. Otherwise the keys whose
    value is a string starting with ``target_value`` are returned.

    Args:
        target_value: The value (normally a heading title) to look for.
        json_data: Mapping of heading keys to their text.

    Returns:
        A list of matching keys in the mapping's iteration order; empty when
        nothing matches.
    """
    exact_hits = []
    for key, value in json_data.items():
        if value == target_value:
            exact_hits.append(key)
    if exact_hits:
        return exact_hits

    # No exact match: fall back to values that begin with the target.
    prefix_hits = []
    for key, value in json_data.items():
        if isinstance(value, str) and value.startswith(target_value):
            prefix_hits.append(key)
    return prefix_hits
|
||||
|
||||
def find_keys_with_prefix(prefix, json_data):
    """Return the keys that are direct numeric children of ``prefix``.

    For ``prefix == '7.2'`` this yields ``'7.2.1'`` and ``'7.2.2'`` but not
    ``'7.3'``, not deeper descendants such as ``'7.2.1.1'``, and not sibling
    headings that merely share the leading characters such as ``'7.21'``.
    The prefix key itself is never returned.

    Args:
        prefix: Heading key to look under, with or without a trailing dot
            (for example ``'7.'`` or ``'7.2'``).
        json_data: Mapping whose string keys are heading numbers.

    Returns:
        A list of child keys in the mapping's iteration order.
    """
    children = []
    for key in json_data:
        if not key.startswith(prefix):
            continue
        remainder = key[len(prefix):]
        if not prefix.endswith('.'):
            # A child must continue with a dot; otherwise '7.21' would be
            # mistaken for a child of '7.2'.
            if not remainder.startswith('.'):
                continue
            remainder = remainder[1:]
        # Exactly one further numeric component means a direct child.
        if remainder.isdigit():
            children.append(key)
    return children
|
||||
def compare_headings(current, new):
|
||||
current_nums = [int(num) for num in current.split('.') if num.isdigit()]
|
||||
new_nums = [int(num) for num in new.split('.') if num.isdigit()]
|
||||
|
||||
def extract_json(data, target_values):
|
||||
results = {}
|
||||
for c, n in zip(current_nums, new_nums):
|
||||
if n > c:
|
||||
return True
|
||||
elif n < c:
|
||||
return False
|
||||
|
||||
# 遍历所有目标值
|
||||
for target_value in target_values:
|
||||
# 找到所有与目标值匹配的键
|
||||
matched_keys = find_keys_by_value(target_value, data)
|
||||
return len(new_nums) > len(current_nums)
|
||||
|
||||
for key in matched_keys:
|
||||
# 查找所有以该键为前缀的子键,限制只提取直接子项
|
||||
key_and_subheadings = find_keys_with_prefix(key, data)
|
||||
|
||||
for subkey in key_and_subheadings:
|
||||
# 如果子键有多级结构(比如 '7.2.1'),并且是直接子项
|
||||
if "." in subkey:
|
||||
parent_key = subkey.rsplit('.', 1)[0]
|
||||
top_level_key = parent_key.split('.')[0] + '.'
|
||||
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the collected content.

    Args:
        content: Iterable of string fragments collected so far.
        keywords: Substrings that force a line break when present.
        max_length: Joined, stripped content of at most this many characters
            also gets a line break.

    Returns:
        True when any keyword occurs in the joined content or the joined
        content is no longer than ``max_length``; otherwise False.
    """
    joined = ''.join(content).strip()
    for keyword in keywords:
        if keyword in joined:
            return True
    return len(joined) <= max_length
|
||||
|
||||
# 确保顶级键不会重复添加
|
||||
if top_level_key not in results:
|
||||
results[top_level_key] = data[top_level_key]
|
||||
|
||||
# 添加或更新父级键
|
||||
if parent_key not in results:
|
||||
if parent_key in data:
|
||||
results[parent_key] = data[parent_key]
|
||||
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append ``line_content`` to ``current_content``, optionally after a line break.

    When ``append_newline`` is truthy, ``should_add_newline`` is consulted
    and a ``'\n'`` is appended first if it agrees; the flag is then cleared.
    ``current_content`` is mutated in place.

    Args:
        current_content: List of fragments collected for the current heading.
        line_content: Text to append.
        append_newline: Whether a line break is pending.
        keywords: Passed through to ``should_add_newline``.

    Returns:
        The updated pending-line-break flag.
    """
    pending_break = append_newline
    if pending_break:
        needs_break = should_add_newline(current_content, keywords)
        if needs_break:
            current_content.append('\n')
        # NOTE(review): indentation was lost in this view; the reset is
        # assumed to apply whenever the flag was set — confirm.
        pending_break = False
    current_content.append(line_content)
    return pending_break
|
||||
|
||||
# 添加当前子键和它的值
|
||||
if subkey in data:
|
||||
results[subkey] = data[subkey]
|
||||
|
||||
return results
|
||||
# def parse_text_by_heading(text):
|
||||
# keywords = ['包含', '以下']
|
||||
# data = {}
|
||||
# current_key = None
|
||||
# current_content = []
|
||||
# append_newline = False
|
||||
# skip_subheadings = False
|
||||
#
|
||||
# lines = text.split('\n')
|
||||
# for i, line in enumerate(lines):
|
||||
# line_stripped = line.strip().replace('.', '.')
|
||||
# # print(line_stripped)
|
||||
# # 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号
|
||||
# match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
|
||||
# if not match:
|
||||
# #匹配一级标题'1.'
|
||||
# match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
|
||||
#
|
||||
# if match:
|
||||
# new_key, line_content = match.groups() #new_key:1.1 line_content:本次竞争性磋商文件仅适用于本次竞争性磋商公告中所叙述的广水农商行门禁控制主机及基础验证设备采购项
|
||||
# line_content = line_content.lstrip('..、,') #删除左边的若干个..、,
|
||||
#
|
||||
# # 检查是否包含关键词,决定是否跳过子标题
|
||||
# if any(keyword in line_content for keyword in keywords):
|
||||
# skip_subheadings = True
|
||||
# else:
|
||||
# skip_subheadings = False
|
||||
#
|
||||
# # 检查是否应该更新当前键和内容
|
||||
# if current_key is None or (compare_headings(current_key, new_key) and (
|
||||
# len(current_content) == 0 or current_content[-1][-1] != '第')):
|
||||
# if current_key is not None:
|
||||
# content_string = ''.join(current_content).strip()
|
||||
# data[current_key] = content_string.replace(' ', '')
|
||||
# current_key = new_key
|
||||
# current_content = [line_content]
|
||||
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
||||
# else:
|
||||
# append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
|
||||
# else:
|
||||
# if not skip_subheadings:
|
||||
# # 匹配中文数字标题,包括带括号和不带括号的情况
|
||||
# pattern_title = re.compile(
|
||||
# r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
|
||||
# chinese_match = pattern_title.match(line_stripped)
|
||||
#
|
||||
# # 匹配字母标题,如 B.磋商说明
|
||||
# letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
|
||||
#
|
||||
# # 匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
|
||||
# arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
|
||||
#
|
||||
# if chinese_match or letter_match or arabic_match:
|
||||
# if current_key is not None:
|
||||
# content_string = ''.join(current_content).strip()
|
||||
# data[current_key] = content_string.replace(' ', '')
|
||||
# if chinese_match:
|
||||
# pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
|
||||
# pattern_value = re.compile(r'[^\s、))]+')
|
||||
# chinese_key_match = pattern_key.search(line_stripped)
|
||||
# chinese_value_match = pattern_value.search(
|
||||
# line_stripped[chinese_key_match.end():]) if chinese_key_match else None
|
||||
# if chinese_key_match and chinese_value_match:
|
||||
# current_key = chinese_key_match.group()
|
||||
# current_content = [chinese_value_match.group()]
|
||||
# elif letter_match:
|
||||
# letter_key, letter_value = letter_match.groups()
|
||||
# # 将字母转换为对应的中文数字
|
||||
# letter_to_chinese = {
|
||||
# 'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
|
||||
# 'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
|
||||
# 'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
|
||||
# }
|
||||
# current_key = letter_to_chinese.get(letter_key, letter_key)
|
||||
# current_content = [letter_value]
|
||||
# elif arabic_match:
|
||||
# arabic_key, arabic_value = arabic_match.groups()
|
||||
# current_key = arabic_key.replace('、', '.')
|
||||
# current_content = [arabic_value]
|
||||
#
|
||||
# append_newline = True
|
||||
# continue
|
||||
#
|
||||
# if line_stripped:
|
||||
# append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
|
||||
#
|
||||
# if current_key is not None:
|
||||
# content_string = ''.join(current_content).strip()
|
||||
# data[current_key] = content_string.replace(' ', '')
|
||||
#
|
||||
# return data
|
||||
|
||||
def extract_json(data, target_values):
    """Collect the sub-headings found under every heading matching a target value.

    For each target value the matching heading keys are looked up with
    ``find_keys_by_value`` and their sub-keys with ``find_keys_with_prefix``.
    Each sub-key is copied into the result together with its parent heading
    and a top-level entry.

    Args:
        data: Mapping of heading keys (such as ``'7.2.1'``) to their text.
        target_values: Iterable of heading titles to search for.

    Returns:
        A dict holding the selected top-level, parent and sub-heading entries.
    """
    results = {}
    for target_value in target_values:
        for matched_key in find_keys_by_value(target_value, data):
            for subkey in find_keys_with_prefix(matched_key, data):
                if "." in subkey:
                    parent_key = subkey.rsplit('.', 1)[0]
                    top_level_key = parent_key.partition('.')[0] + '.'
                    # The top-level entry is labelled with the target value
                    # itself and is written only once.
                    results.setdefault(top_level_key, target_value)
                    # Copy the parent heading when the input has it and it
                    # has not been recorded yet.
                    if parent_key not in results and parent_key in data:
                        results[parent_key] = data[parent_key]
                # Record the sub-heading itself.
                # NOTE(review): indentation was lost in this view; this is
                # assumed to run for every sub-key — confirm.
                results[subkey] = data[subkey]
    return results
|
||||
def parse_text_by_heading(text):
|
||||
keywords = ['包含', '以下']
|
||||
data = {}
|
||||
current_key = None
|
||||
current_key_chinese=None
|
||||
current_content = []
|
||||
append_newline = False
|
||||
skip_subheadings = False
|
||||
|
||||
# 测试数据
|
||||
data = {
|
||||
"7.": "合同授予",
|
||||
"7.1": "定标方式",
|
||||
"7.1.1": "招标人依据评标委员会推荐的中标候选人确定中标人,国有资金占控股或者主导地位的依法必须进行招标的项目,确定排名第一的中标候选人为中标人。评标委员会推荐中标候选人的数量见投标人须知前附表。",
|
||||
"7.1.2": "排名第一的中标候选人放弃中标、因不可抗力不能签订合同、不按照招标文件要求提交履约担保,或者被查实存在影响中标结果的违法行为等情形,不符合中标条件的,招标人可以按照评标委员会提出的中标候选人名单排序依次确定其他中标候选人为中标人,也可以重新招标。",
|
||||
"7.1.3": "招标人将自收到评标报告之日起3日内在投标人须知前附表规定的媒介公示中标候选人。公示期不少于3日。投标人或者其他利害关系人对评标结果有异议的,应当在评标结果公示期间提出。招标人自收到异议之日起3日内作出答复;作出答复前,暂停招标投标活动。投标人的异议与答复应当通过“省电子交易平台”在“异议与答复”菜单以书面形式进行,其他利害关系人的异议也应以书面形式向招标人提出。中标候选人公示媒体见投标人须知前附表。",
|
||||
"7.1.4": "中标候选人经公示无异议、招标人向交通运输主管部门履行评标结果备案手续后,发出中标通知书。",
|
||||
"7.2": "中标通知",
|
||||
"7.2.1": "招标人将在本章第3.3款规定的投标有效期内,招标人通过“电子交易平台”以书面形式向中标人发出中标通知书,同时将中标结果通知未中标的投标人。中标通知书发出前,招标人将在投标人须知前附表第7.1.3款规定的媒介发布中标结果公告。",
|
||||
"7.2.2": "中标候选人的经营、财务状况发生较大变化或者存在违法行为,招标人认为可能影响其履约能力的,将在发出中标通知书前报请行政监督部门批准后,召集原评标委员会按照招标文件规定的标准和方法审查确认。",
|
||||
"7.2.3": "招标人即使已经向中标人发出了中标通知书,若发生下列情况之一者,中标无效。(1)经查实投标人以他人名义投标或以其他方式弄虚作假,骗取中标的;(2)经查实投标人有串通投标的,或向招标人或评标委员会成员以行贿手段谋取中标的;(3)法律、法规规定的其他情形。",
|
||||
"7.3": "履约担保",
|
||||
"7.3.1": "在签订合同前,中标人应按投标人须知前附表规定的金额、担保形式和招标文件第四章规定的格式向招标人提交履约担保,或事先经过招标人书面认可的履约担保格式向招标人提交履约担保。履约担保金额不得超过中标合同价的10%,详见投标人须知前附表。联合体中标的,其履约担保由牵头人提交,并应符合投标人须知前附表规定的金额、担保形式和招标文件规定的履约担保格式要求。联合体中标的,其履约担保由牵头人递交,并应符合上述要求。",
|
||||
"7.3.2": "中标人不能按本章第7.3.1项要求提交履约担保的,视为放弃中标,其投标保证金不予退还,给招标人造成的损失超过投标保证金数额的,中标人还应当对超过部分予以赔偿。",
|
||||
lines = text.split('\n')
|
||||
for i, line in enumerate(lines):
|
||||
line_stripped = line.strip().replace('.', '.')
|
||||
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
|
||||
if not match:
|
||||
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
|
||||
|
||||
if match:
|
||||
new_key, line_content = match.groups()
|
||||
line_content = line_content.lstrip('..、,')
|
||||
|
||||
if any(keyword in line_content for keyword in keywords):
|
||||
skip_subheadings = True
|
||||
else:
|
||||
skip_subheadings = False
|
||||
if current_key_chinese is not None:
|
||||
content_string = ''.join(current_content).strip()
|
||||
data[current_key_chinese] = content_string.replace(' ', '')
|
||||
current_key_chinese=None
|
||||
if current_key is None or (compare_headings(current_key, new_key) and (
|
||||
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
||||
if current_key is not None:
|
||||
content_string = ''.join(current_content).strip()
|
||||
if current_key in data:
|
||||
data[current_key] += content_string.replace(' ', '')
|
||||
else:
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
current_key = new_key
|
||||
current_content = [line_content]
|
||||
append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
||||
else:
|
||||
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
|
||||
else:
|
||||
if not skip_subheadings:
|
||||
pattern_title = re.compile(
|
||||
r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
|
||||
chinese_match = pattern_title.match(line_stripped)
|
||||
|
||||
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
|
||||
|
||||
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
|
||||
|
||||
if chinese_match or letter_match or arabic_match:
|
||||
# if current_key is not None or current_key_chinese is not None:
|
||||
# key = current_key if current_key is not None else current_key_chinese
|
||||
# content_string = ''.join(current_content).strip() #TODO: key chinese key
|
||||
# if key in data:
|
||||
# data[key] +=content_string.replace(' ', '')
|
||||
# else:
|
||||
# data[key] = content_string.replace(' ', '')
|
||||
# if current_key_chinese is not None:
|
||||
# current_key_chinese = None
|
||||
if chinese_match:
|
||||
pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
|
||||
pattern_value = re.compile(r'[^\s、))]+')
|
||||
chinese_key_match = pattern_key.search(line_stripped)
|
||||
chinese_value_match = pattern_value.search(
|
||||
line_stripped[chinese_key_match.end():]) if chinese_key_match else None
|
||||
if chinese_key_match and chinese_value_match:
|
||||
# current_key = chinese_key_match.group()
|
||||
current_key_chinese=chinese_key_match.group()
|
||||
current_content = [chinese_value_match.group()]
|
||||
elif letter_match:
|
||||
letter_key, letter_value = letter_match.groups()
|
||||
letter_to_chinese = {
|
||||
'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
|
||||
'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
|
||||
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
|
||||
}
|
||||
# current_key = letter_to_chinese.get(letter_key, letter_key)
|
||||
current_key_chinese =letter_to_chinese.get(letter_key, letter_key)
|
||||
current_content = [letter_value]
|
||||
elif arabic_match:
|
||||
arabic_key, arabic_value = arabic_match.groups()
|
||||
current_key = arabic_key.replace('、', '.')
|
||||
current_content = [arabic_value]
|
||||
|
||||
# 提取匹配的目标
|
||||
target_values = ["中标"]
|
||||
|
||||
# 运行提取函数
|
||||
extracted_json = extract_json(data, target_values)
|
||||
append_newline = True
|
||||
continue
|
||||
|
||||
# 输出结果
|
||||
import json
|
||||
if line_stripped:
|
||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
|
||||
|
||||
print(json.dumps(extracted_json, ensure_ascii=False, indent=4))
|
||||
if current_key is not None:
|
||||
content_string = ''.join(current_content).strip()
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
|
||||
# if current_key_chinese is not None:
|
||||
# content_string = ''.join(current_content).strip()
|
||||
# if current_key_chinese in data:
|
||||
# data[current_key] += content_string.replace(' ', '')
|
||||
# else:
|
||||
# data[current_key_chinese] = content_string.replace(' ', '')
|
||||
return data
|
||||
|
||||
|
||||
# 测试文本
|
||||
text = """
|
||||
一、在这种
|
||||
二、哈哈哈
|
||||
三、红红火火恍恍惚惚
|
||||
四、竞争性磋商响应文件的递交
|
||||
19.竞争性磋商响应文件的加密
|
||||
19.1 供应商应当按照要求制作电子磋商响应文件,并在投标时上传加密的电子磋
|
||||
商响应文件,未加密的电子磋商响应文件,采购人(“电子交易平台”)将拒收
|
||||
并提示。
|
||||
22.3 从提交响应文件截止时间至磋商有效期期满这段时间,供应商不得修改或
|
||||
撤销其响应文件。供应商所提交的响应文件在磋商结束后,无论成交与否都不退
|
||||
还。
|
||||
注:竞争性磋商(开标)时间和地点
|
||||
一、供应商在规定的磋商截止时间(开标时间)在“电子交易平台”上公开进行
|
||||
开标,所有供应商均应当准时在线参加开标。
|
||||
二、采购人(采购代理机构)通过互联网在磋商须知前附表规定的地点组织开标,
|
||||
并在投标截止时间 30 分钟前,使用 CA 数字证书登录“电子交易平台”,进入
|
||||
“开标大厅”选择相应标段作在线开标的准备工作。
|
||||
三、供应商应当在能够保证设施设备可靠、互联网畅通的任意地点,通过互联网
|
||||
在线参加开标。在投标截止时间前,使用加密其投标文件的 CA 数字证书登录“电
|
||||
子交易平台”,进入“开标大厅”选择所投标段进行签到,并实时在线关注招标
|
||||
人的操作情况。
|
||||
注:开标结束后,请各投标人不要长时间离开电脑,随时关注业务系统,等
|
||||
待专家资格性审查和符合性审查完成后要求的最后报价,若在规定时间内投标人
|
||||
没有按要求进行多轮报价,则作自动弃权处理。
|
||||
五、磋商程序及步骤
|
||||
23.竞争性磋商小组
|
||||
"""
|
||||
|
||||
res = parse_text_by_heading(text)
|
||||
print(json.dumps(res, ensure_ascii=False, indent=4))
|
||||
|
@ -194,7 +194,8 @@ if __name__ == "__main__":
|
||||
# for question, response in results:
|
||||
# print(f"Question: {question}")
|
||||
# print(f"Response: {response}")
|
||||
ques=[]
|
||||
|
||||
ques=["该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。","该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
|
||||
results = multi_threading(ques, "招标解析5word")
|
||||
if not results:
|
||||
print("errror!")
|
||||
|
@ -1,7 +1,6 @@
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
import re # 导入正则表达式库
|
||||
import os # 用于文件和文件夹操作
|
||||
|
||||
def clean_page_content(text, common_header):
|
||||
# 首先删除抬头公共部分
|
||||
if common_header: # 确保有公共抬头才进行替换
|
||||
@ -16,6 +15,8 @@ def clean_page_content(text, common_header):
|
||||
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
|
||||
|
||||
return text
|
||||
|
||||
#PYPDF2库
|
||||
def extract_common_header(pdf_path):
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
headers = []
|
||||
@ -51,7 +52,6 @@ def extract_common_header(pdf_path):
|
||||
|
||||
return '\n'.join(common_headers)
|
||||
|
||||
|
||||
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
|
||||
"""从原始PDF截取指定范围的页面并保存到新的PDF文件中"""
|
||||
# 获取文件基本名称
|
||||
|
@ -266,7 +266,7 @@ def extract_from_notice(clause_path, type):
|
||||
if __name__ == "__main__":
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json'
|
||||
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json"
|
||||
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause1.json"
|
||||
try:
|
||||
res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景
|
||||
res2 = json.dumps(res, ensure_ascii=False, indent=4)
|
||||
|
@ -114,7 +114,7 @@ def parse_text_by_heading(text):
|
||||
append_newline = False
|
||||
lines = text.split('\n')
|
||||
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
|
||||
line_stripped = line.strip()
|
||||
line_stripped = line.strip().replace('.', '.')
|
||||
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
|
||||
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
|
||||
if not match:
|
||||
@ -122,7 +122,7 @@ def parse_text_by_heading(text):
|
||||
|
||||
if match:
|
||||
new_key, line_content = match.groups()
|
||||
line_content = line_content.lstrip('.')
|
||||
line_content = line_content.lstrip('..')
|
||||
# 检查是否应该更新当前键和内容
|
||||
if current_key is None or (compare_headings(current_key, new_key) and (
|
||||
len(current_content) == 0 or current_content[-1][-1] != '第')): #current_content: 内容占两行则是: ['本招标项目已具备招标条件,现对本项目施工监理进行招标,项目概况见本', '章附件一。']
|
||||
|
@ -19,7 +19,7 @@ def query():
|
||||
client = docmind_api20220711Client(config)
|
||||
request = docmind_api20220711_models.QueryDocParserStatusRequest(
|
||||
# id : 任务提交接口返回的id
|
||||
id='docmind-20240929-befe0a1a529b4640b5fbf51ff359a272'
|
||||
id='docmind-20241008-24363df30d274863894f037dbb7244e8'
|
||||
)
|
||||
try:
|
||||
# 复制代码运行请自行打印 API 的返回值
|
||||
|
@ -19,7 +19,7 @@ def query():
|
||||
client = docmind_api20220711Client(config)
|
||||
request = docmind_api20220711_models.GetDocParserResultRequest(
|
||||
# id : 任务提交接口返回的id
|
||||
id='docmind-20240929-befe0a1a529b4640b5fbf51ff359a272',
|
||||
id='docmind-20241008-24363df30d274863894f037dbb7244e8',
|
||||
layout_step_size=10,
|
||||
layout_num=0
|
||||
)
|
||||
|
@ -71,7 +71,81 @@ def extract_text_by_page(file_path):
|
||||
else:
|
||||
print(f"Page {page_num + 1} is empty or text could not be extracted.")
|
||||
return result
|
||||
|
||||
# import fitz # PyMuPDF
|
||||
# import re
|
||||
#
|
||||
#
|
||||
# def clean_page_content(text, common_header):
|
||||
# if common_header:
|
||||
# for header_line in common_header.split('\n'):
|
||||
# if header_line.strip():
|
||||
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
||||
#
|
||||
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
|
||||
# text = re.sub(r'\s+\d+\s*$', '', text)
|
||||
# text = re.sub(r'\s*\/\s*\d+\s*', '', text)
|
||||
#
|
||||
# return text
|
||||
#
|
||||
#
|
||||
# def extract_common_header(pdf_path):
|
||||
# doc = fitz.open(pdf_path)
|
||||
# headers = []
|
||||
# total_pages = len(doc)
|
||||
#
|
||||
# if total_pages == 2:
|
||||
# pages_to_read = 2
|
||||
# start_page = 0
|
||||
# else:
|
||||
# pages_to_read = 3
|
||||
# middle_page = total_pages // 2
|
||||
# start_page = max(0, middle_page - 1)
|
||||
#
|
||||
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
|
||||
# page = doc[i]
|
||||
# text = page.get_text()
|
||||
# if text:
|
||||
# first_lines = text.strip().split('\n')[:3]
|
||||
# headers.append(first_lines)
|
||||
#
|
||||
# doc.close()
|
||||
#
|
||||
# if len(headers) < 2:
|
||||
# return ""
|
||||
#
|
||||
# common_headers = []
|
||||
# for lines in zip(*headers):
|
||||
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
|
||||
# if common_line:
|
||||
# common_headers.append(' '.join(common_line))
|
||||
#
|
||||
# return '\n'.join(common_headers)
|
||||
#
|
||||
#
|
||||
# def extract_text_by_page(file_path):
|
||||
# common_header = extract_common_header(file_path)
|
||||
# result = ""
|
||||
# doc = fitz.open(file_path)
|
||||
# num_pages = len(doc)
|
||||
#
|
||||
# for page_num in range(num_pages):
|
||||
# page = doc[page_num]
|
||||
# text = page.get_text()
|
||||
# if text:
|
||||
# cleaned_text = clean_page_content(text, common_header)
|
||||
# print(cleaned_text)
|
||||
# result += cleaned_text
|
||||
# else:
|
||||
# print(f"Page {page_num + 1} is empty or text could not be extracted.")
|
||||
#
|
||||
# doc.close()
|
||||
# return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\ztbfile_tobidders_notice_part2.pdf"
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||
res=extract_text_by_page(file_path)
|
||||
# print(res)
|
||||
# print(res)磋商文件_tobidders_notice_part2.pdf
|
@ -217,9 +217,9 @@ def process_nested_data(data):
|
||||
# 读取JSON数据,提取内容,转换结构,并打印结果
|
||||
def extract_from_notice(clause_path, type):
|
||||
if type == 1:
|
||||
target_values = ["投标文件","响应文件"]
|
||||
target_values = ["投标文件","响应文件","响应性文件"]
|
||||
elif type == 2:
|
||||
target_values = ["开标", "评标", "定标","磋商程序","中标"]
|
||||
target_values = ["开标", "评标", "定标","磋商程序","中标","程序"]
|
||||
elif type == 3:
|
||||
target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
|
||||
elif type == 4:
|
||||
@ -237,9 +237,9 @@ def extract_from_notice(clause_path, type):
|
||||
|
||||
#TODO: 再审视一下zbtest20的处理是否合理
|
||||
if __name__ == "__main__":
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json'
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause26.json'
|
||||
try:
|
||||
res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景
|
||||
res = extract_from_notice(file_path, 1) # 可以改变此处的 type 参数测试不同的场景
|
||||
res2=json.dumps(res,ensure_ascii=False,indent=4)
|
||||
print(res2)
|
||||
except ValueError as e:
|
||||
|
@ -2,14 +2,16 @@ import json
|
||||
import docx
|
||||
import re
|
||||
import os
|
||||
|
||||
import fitz
|
||||
from PyPDF2 import PdfReader
|
||||
from flask_app.main.截取pdf import clean_page_content,extract_common_header
|
||||
from flask_app.货物标.货物标截取pdf import clean_page_content,extract_common_header
|
||||
|
||||
def extract_text_from_docx(file_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the ``.docx`` file to open with ``docx.Document``.

    Returns:
        The paragraph texts joined with ``'\n'``.
    """
    document = docx.Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return '\n'.join(paragraph_texts)
|
||||
|
||||
|
||||
#PYPDF2版本
|
||||
def extract_text_from_pdf(file_path, start_word, end_pattern):
|
||||
# 从PDF文件中提取文本
|
||||
common_header = extract_common_header(file_path)
|
||||
@ -44,6 +46,54 @@ def extract_text_from_pdf(file_path, start_word, end_pattern):
|
||||
# print(full_text)
|
||||
return full_text
|
||||
|
||||
#fitz库版本
|
||||
# def extract_text_from_pdf(file_path, start_word, end_pattern):
|
||||
# # 从PDF文件中提取文本
|
||||
# common_header = extract_common_header(file_path)
|
||||
# doc = fitz.open(file_path)
|
||||
# all_pages_text = []
|
||||
# start_index = None
|
||||
#
|
||||
# # 处理所有页面
|
||||
# for i in range(len(doc)):
|
||||
# page = doc[i]
|
||||
# page_text = page.get_text()
|
||||
# cleaned_text = clean_page_content(page_text, common_header)
|
||||
# print(cleaned_text)
|
||||
# print("yes")
|
||||
# # 在第一页查找开始位置
|
||||
# if i == 0 and start_index is None:
|
||||
# start_match = re.search(start_word, cleaned_text, re.MULTILINE)
|
||||
# if start_match:
|
||||
# start_index = start_match.start()
|
||||
# cleaned_text = cleaned_text[start_index:]
|
||||
#
|
||||
# # 在最后一页查找结束位置
|
||||
# if i == len(doc) - 1:
|
||||
# for pattern in end_pattern:
|
||||
# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE))
|
||||
# if matches:
|
||||
# end_index = matches[-1].start()
|
||||
# cleaned_text = cleaned_text[:end_index]
|
||||
# break
|
||||
#
|
||||
# all_pages_text.append(cleaned_text)
|
||||
#
|
||||
# # 合并所有页面的文本
|
||||
# full_text = "\n".join(all_pages_text)
|
||||
# # 关闭文档
|
||||
# doc.close()
|
||||
#
|
||||
# return full_text
|
||||
|
||||
def chinese_to_arabic(chinese_num):
    """Convert a Chinese numeral from 一 (1) to 十五 (15) into an int.

    Args:
        chinese_num: The Chinese numeral string.

    Returns:
        The matching integer, or 0 when the numeral is not in the table.
    """
    numerals = (
        '一', '二', '三', '四', '五',
        '六', '七', '八', '九', '十',
        '十一', '十二', '十三', '十四', '十五',
    )
    # Position in the tuple (1-based) is the numeric value.
    lookup = {numeral: value for value, numeral in enumerate(numerals, start=1)}
    return lookup.get(chinese_num, 0)
|
||||
|
||||
def compare_headings(current, new):
|
||||
# 使用过滤来确保只处理非空且为数字的部分
|
||||
current_nums = [int(num) for num in current.split('.') if num.isdigit()]
|
||||
@ -86,78 +136,87 @@ def parse_text_by_heading(text):
|
||||
current_key = None
|
||||
current_content = []
|
||||
append_newline = False
|
||||
skip_subheadings = False
|
||||
|
||||
lines = text.split('\n') #['一、说明', '1.适用范围', '招标文件仅适用于第一章“投标邀请书”中所述项目的货物、工程及服务的采购。']
|
||||
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
|
||||
line_stripped = line.strip()
|
||||
# 匹配中文数字标题,包括带括号和不带括号的情况
|
||||
pattern_title = re.compile(r'^\s*(?:[((]\s*[一二三四五六七八九十]\s*[))]\s*|[一二三四五六七八九十]\s*、\s*)')
|
||||
|
||||
# 用于提取中文数字部分
|
||||
pattern_key = re.compile(r'[一二三四五六七八九十]')
|
||||
|
||||
# 用于提取标题后的内容部分
|
||||
pattern_value = re.compile(r'[^\s、))]+')
|
||||
|
||||
chinese_match = pattern_title.match(line_stripped) # 匹配是否为标题
|
||||
if chinese_match:
|
||||
# 匹配到标题行,先提取中文数字部分
|
||||
chinese_key_match = pattern_key.search(line_stripped)
|
||||
# 提取中文数字后的内容部分
|
||||
chinese_value_match = pattern_value.search(
|
||||
line_stripped[chinese_key_match.end():]) if chinese_key_match else None
|
||||
if chinese_key_match and chinese_value_match:
|
||||
chinese_key = chinese_key_match.group() # 匹配到的中文数字,如 "一"
|
||||
chinese_value = chinese_value_match.group() # 匹配到的标题内容,如 "招标文件"
|
||||
# 将提取的标题和内容存储到字典中
|
||||
if chinese_key:
|
||||
data[chinese_key] = chinese_value
|
||||
current_key = None # 重置当前key
|
||||
current_content = [] # 清空当前内容
|
||||
continue
|
||||
|
||||
# 新增:匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
|
||||
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
|
||||
if arabic_match:
|
||||
arabic_key, arabic_value = arabic_match.groups()
|
||||
arabic_key = arabic_key.replace('、', '.') # 将顿号替换为点号
|
||||
if current_key is not None:
|
||||
content_string = ''.join(current_content).strip()
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
current_key = arabic_key
|
||||
current_content = [arabic_value]
|
||||
append_newline = True
|
||||
continue
|
||||
|
||||
# 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号
|
||||
lines = text.split('\n')
|
||||
for i, line in enumerate(lines):
|
||||
line_stripped = line.strip().replace('.', '.')
|
||||
# print(line_stripped)
|
||||
# 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号
|
||||
match = re.match(r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
|
||||
if not match:
|
||||
#匹配一级标题'1.'
|
||||
match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
|
||||
|
||||
if match:
|
||||
new_key, line_content = match.groups()
|
||||
line_content = line_content.lstrip('.')
|
||||
new_key, line_content = match.groups() #new_key:1.1 line_content:本次竞争性磋商文件仅适用于本次竞争性磋商公告中所叙述的广水农商行门禁控制主机及基础验证设备采购项
|
||||
line_content = line_content.lstrip('..、,') #删除左边的若干个..、,
|
||||
|
||||
# 检查是否包含关键词,决定是否跳过子标题
|
||||
if any(keyword in line_content for keyword in keywords):
|
||||
skip_subheadings = True
|
||||
else:
|
||||
skip_subheadings = False
|
||||
|
||||
# 检查是否应该更新当前键和内容
|
||||
if current_key is None or (compare_headings(current_key, new_key) and (
|
||||
len(current_content) == 0 or current_content[-1][-1] != '第')):
|
||||
if current_key is not None:
|
||||
# 将之前的内容保存到data中,保留第一个换行符,后续的换行符转为空字符
|
||||
# print(current_content)
|
||||
content_string = ''.join(current_content).strip()
|
||||
# print(content_string)
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
current_key = new_key
|
||||
current_content = [line_content]
|
||||
# 当标题级别为一级(1.)和两级(如 1.1)时标题,才设置 append_newline 为 True
|
||||
append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
||||
else:
|
||||
append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
|
||||
else:
|
||||
if not skip_subheadings:
|
||||
# 匹配中文数字标题,包括带括号和不带括号的情况
|
||||
pattern_title = re.compile(
|
||||
r'^\s*(?:[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
|
||||
chinese_match = pattern_title.match(line_stripped)
|
||||
|
||||
# 匹配字母标题,如 B.磋商说明
|
||||
letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
|
||||
|
||||
# 匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
|
||||
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
|
||||
|
||||
if chinese_match or letter_match or arabic_match:
|
||||
if current_key is not None:
|
||||
content_string = ''.join(current_content).strip()
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
if chinese_match:
|
||||
pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
|
||||
pattern_value = re.compile(r'[^\s、))]+')
|
||||
chinese_key_match = pattern_key.search(line_stripped)
|
||||
chinese_value_match = pattern_value.search(
|
||||
line_stripped[chinese_key_match.end():]) if chinese_key_match else None
|
||||
if chinese_key_match and chinese_value_match:
|
||||
current_key = chinese_key_match.group()
|
||||
current_content = [chinese_value_match.group()]
|
||||
elif letter_match:
|
||||
letter_key, letter_value = letter_match.groups()
|
||||
# 将字母转换为对应的中文数字
|
||||
letter_to_chinese = {
|
||||
'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
|
||||
'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
|
||||
'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
|
||||
}
|
||||
current_key = letter_to_chinese.get(letter_key, letter_key)
|
||||
current_content = [letter_value]
|
||||
elif arabic_match:
|
||||
arabic_key, arabic_value = arabic_match.groups()
|
||||
current_key = arabic_key.replace('、', '.')
|
||||
current_content = [arabic_value]
|
||||
|
||||
append_newline = True
|
||||
continue
|
||||
|
||||
if line_stripped:
|
||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
|
||||
|
||||
if current_key is not None:
|
||||
# 保存最后一部分内容
|
||||
content_string = ''.join(current_content).strip()
|
||||
data[current_key] = content_string.replace(' ', '')
|
||||
|
||||
@ -174,7 +233,7 @@ def parse_text_by_heading(text):
|
||||
# parsed_data = parse_text_by_heading(text)
|
||||
# return parsed_data
|
||||
|
||||
def convert_clause_to_json(file_path,output_folder,type=1):
|
||||
def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"):
|
||||
if not os.path.exists(file_path):
|
||||
print(f"The specified file does not exist: {file_path}")
|
||||
return ""
|
||||
@ -191,8 +250,8 @@ def convert_clause_to_json(file_path,output_folder,type=1):
|
||||
if not os.path.exists(output_folder):
|
||||
os.makedirs(output_folder)
|
||||
print(f"Created output folder: {output_folder}")
|
||||
file_name = "clause1.json" if type == 1 else "clause2.json"
|
||||
# file_name = f"clause{suffix_counter}.json"
|
||||
# file_name = "clause1.json" if type == 1 else "clause2.json"
|
||||
file_name = f"clause{suffix_counter}.json"
|
||||
output_path = os.path.join(output_folder, file_name)
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(result, f, indent=4, ensure_ascii=False)
|
||||
@ -213,7 +272,7 @@ def process_folder(input_folder, output_folder):
|
||||
suffix = str(suffix_counter) # 后缀为递增的数字
|
||||
try:
|
||||
# 调用 convert_clause_to_json,传递文件路径、输出文件夹和递增的后缀
|
||||
output_path = convert_clause_to_json(file_path, output_folder, 1, suffix_counter)
|
||||
output_path = convert_clause_to_json(file_path, output_folder, 1, suffix)
|
||||
print(f"Processed file: {file_name}, JSON saved to: {output_path}")
|
||||
except ValueError as e:
|
||||
print(f"Error processing {file_name}: {e}")
|
||||
@ -221,17 +280,22 @@ def process_folder(input_folder, output_folder):
|
||||
# 后缀递增
|
||||
suffix_counter += 1
|
||||
|
||||
#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏
|
||||
#TODO:标题'一'应该越来越大,与'1.1'的逻辑分开';
|
||||
#TODO:911904c3-4d6e-47e0-acd8-b05e582209cb文件夹运行有问题,百炼出错了
|
||||
if __name__ == "__main__":
|
||||
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
||||
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||
# # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
||||
# file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\竞争性谈判文件(3)_tobidders_notice_part2.pdf'
|
||||
# # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2'
|
||||
# try:
|
||||
# output_path = convert_clause_to_json(file_path,output_folder)
|
||||
# print(f"Final JSON result saved to: {output_path}")
|
||||
# except ValueError as e:
|
||||
# print("Error:", e)
|
||||
|
||||
input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
|
||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
|
||||
try:
|
||||
output_path = convert_clause_to_json(file_path,output_folder)
|
||||
print(f"Final JSON result saved to: {output_path}")
|
||||
except ValueError as e:
|
||||
print("Error:", e)
|
||||
# input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4'
|
||||
# output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
|
||||
#
|
||||
# # 调用 process_folder 来处理整个文件夹中的所有文件
|
||||
# process_folder(input_folder, output_folder)
|
||||
|
||||
# 调用 process_folder 来处理整个文件夹中的所有文件
|
||||
process_folder(input_folder, output_folder)
|
||||
|
@ -1,3 +1,4 @@
|
||||
import fitz
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
import re # 导入正则表达式库
|
||||
import os # 用于文件和文件夹操作
|
||||
@ -20,7 +21,7 @@ def clean_page_content(text, common_header):
|
||||
|
||||
return text
|
||||
|
||||
|
||||
#PYPDF2版本
|
||||
def extract_common_header(pdf_path):
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
headers = []
|
||||
@ -57,6 +58,40 @@ def extract_common_header(pdf_path):
|
||||
return '\n'.join(common_headers)
|
||||
|
||||
|
||||
#fitz库版本
|
||||
# def extract_common_header(pdf_path):
|
||||
# doc = fitz.open(pdf_path)
|
||||
# headers = []
|
||||
# total_pages = len(doc)
|
||||
#
|
||||
# if total_pages == 2:
|
||||
# pages_to_read = 2
|
||||
# start_page = 0
|
||||
# else:
|
||||
# pages_to_read = 3
|
||||
# middle_page = total_pages // 2
|
||||
# start_page = max(0, middle_page - 1)
|
||||
#
|
||||
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
|
||||
# page = doc[i]
|
||||
# text = page.get_text()
|
||||
# if text:
|
||||
# first_lines = text.strip().split('\n')[:3]
|
||||
# headers.append(first_lines)
|
||||
#
|
||||
# doc.close()
|
||||
#
|
||||
# if len(headers) < 2:
|
||||
# return ""
|
||||
#
|
||||
# common_headers = []
|
||||
# for lines in zip(*headers):
|
||||
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
|
||||
# if common_line:
|
||||
# common_headers.append(' '.join(common_line))
|
||||
#
|
||||
# return '\n'.join(common_headers)
|
||||
|
||||
def is_pdf_or_doc(filename):
    """Tell whether ``filename`` names a PDF or Word document.

    The check is case-insensitive and looks only at the end of the name.

    Args:
        filename: File name or path to inspect.

    Returns:
        True when the name ends with ``.pdf``, ``.doc`` or ``.docx``.
    """
    lowered = filename.lower()
    for suffix in ('.pdf', '.doc', '.docx'):
        if lowered.endswith(suffix):
            return True
    return False
|
||||
|
Loading…
x
Reference in New Issue
Block a user