1.17 小bug
This commit is contained in:
parent
e62e9dd282
commit
7e8f368324
@ -40,6 +40,7 @@ class TextinOcr(object):
|
|||||||
}
|
}
|
||||||
|
|
||||||
return requests.post(url, data=image, headers=headers, params=options)
|
return requests.post(url, data=image, headers=headers, params=options)
|
||||||
|
#调用textIn:pdf/word->markdown
|
||||||
def convert_file_to_markdown(file_path, file_name="extract1.txt"):
|
def convert_file_to_markdown(file_path, file_name="extract1.txt"):
|
||||||
# 获取文件的绝对路径所在的文件夹
|
# 获取文件的绝对路径所在的文件夹
|
||||||
output_folder = os.path.dirname(os.path.abspath(file_path))
|
output_folder = os.path.dirname(os.path.abspath(file_path))
|
||||||
|
@ -1,178 +0,0 @@
|
|||||||
import os
|
|
||||||
import time
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from ratelimit import sleep_and_retry, limits
|
|
||||||
def read_txt_to_string(file_path):
    """Read a UTF-8 text file and return its whole content as one string.

    The file is read with a single ``read()`` call, so the original line
    breaks and spacing are kept.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        str: The file content. This function does not raise on failure;
        instead it returns a human-readable error message: a fixed message
        when the file does not exist, or a message that embeds the exception
        text for any other error. Callers that need to distinguish success
        from failure must inspect the returned text.
    """
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return "错误:文件未找到。"
    except Exception as e:
        # Deliberately broad: every other failure is reported as text rather
        # than propagated, which is the contract existing callers rely on.
        return f"错误:读取文件时发生错误。详细信息:{e}"
|
|
||||||
def generate_full_user_query(file_path, prompt_template):
    """Build the complete user query by inserting a file's text into a template.

    Args:
        file_path (str): Path of the text file whose content is inserted.
        prompt_template (str): Prompt template containing a ``{full_text}``
            placeholder. It is expanded with ``str.format``, so any other
            literal braces in the template must be escaped as ``{{`` / ``}}``.

    Returns:
        str: The template with ``{full_text}`` replaced by the file content.
    """
    # The text comes from read_txt_to_string, which returns an error message
    # instead of raising when the file cannot be read; in that case the
    # message itself ends up inside the generated query.
    full_text = read_txt_to_string(file_path)

    # Insert the file content into the prompt template.
    return prompt_template.format(full_text=full_text)
|
|
||||||
def get_total_tokens(text, model="ep-20241119121710-425g6", timeout=30):
    """Ask the tokenization API how many tokens ``text`` contains.

    Args:
        text (str): Text whose tokens should be counted.
        model (str): Model (endpoint) name sent to the API. Defaults to
            "ep-20241119121710-425g6".
        timeout (float | tuple): Timeout in seconds passed to
            ``requests.post`` so a stalled connection cannot block forever.

    Returns:
        int: The ``total_tokens`` value reported by the API, or 0 when the
        request fails or the response cannot be parsed (the error is printed).

    Raises:
        ValueError: If the ``DOUBAO_API_KEY`` environment variable is not set.
    """
    # Tokenization endpoint.
    url = "https://ark.cn-beijing.volces.com/api/v3/tokenization"

    # The API key is read from the environment; fail early when it is missing.
    doubao_api_key = os.getenv("DOUBAO_API_KEY")
    if not doubao_api_key:
        raise ValueError("DOUBAO_API_KEY 环境变量未设置")

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + doubao_api_key
    }
    payload = {
        "model": model,
        "text": [text]  # per the original author's note, the API expects a list
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()["data"][0]["total_tokens"]
    except Exception as e:
        # Best effort: any network, HTTP or parsing problem is reported and
        # mapped to 0 instead of being propagated to the caller.
        print(f"获取 Token 数量失败:{e}")
        return 0
|
|
||||||
|
|
||||||
@sleep_and_retry
@limits(calls=10, period=1)  # at most 10 calls per second
def doubao_model(full_user_query, need_extra=False, timeout=(10, 600)):
    """Send a single-turn chat request to the Doubao model and return its reply.

    The model is chosen from the token count of the query: above 31500 tokens
    the 128k endpoint is used, otherwise the 32k endpoint. Failed requests are
    retried: up to 2 retries for HTTP 429 (waiting 2 s, then 4 s) and up to
    1 retry for any other request error (waiting 1 s). The retry counter is
    shared between both kinds of error.

    Args:
        full_user_query (str): Complete prompt sent as the user message.
        need_extra (bool): When True, also return the completion token count.
        timeout (float | tuple): Timeout passed to ``requests.post``; the
            default is a (connect, read) pair in seconds so that a stalled
            connection cannot block the caller forever.

    Returns:
        str | tuple | None: The reply text, or ``(reply, completion_tokens)``
        when ``need_extra`` is True. After all attempts fail, ``None`` or
        ``(None, 0)`` respectively.

    Raises:
        ValueError: If the ``DOUBAO_API_KEY`` environment variable is not set.
    """
    print("call doubao...")
    url = "https://ark.cn-beijing.volces.com/api/v3/chat/completions"
    doubao_api_key = os.getenv("DOUBAO_API_KEY")
    if not doubao_api_key:
        # Same failure the token-count helper reports; checked here so the
        # header concatenation below can never see None.
        raise ValueError("DOUBAO_API_KEY 环境变量未设置")

    # Primary and fallback model endpoints.
    models = {
        "pro_32k": "ep-20241119121710-425g6",  # Doubao Pro 32k model
        "pro_128k": "ep-20241119121743-xt6wg"  # 128k model
    }

    # Pick the endpoint from the token count of the query; the threshold sits
    # slightly below 32k.
    token_count = get_total_tokens(full_user_query)
    if token_count > 31500:
        selected_model = models["pro_128k"]
    else:
        selected_model = models["pro_32k"]

    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + doubao_api_key
    }

    # The request body is identical for every attempt, so build it once.
    data = {
        "model": selected_model,
        "messages": [
            {
                "role": "user",
                "content": full_user_query
            }
        ],
        "temperature": 0.2
    }

    max_retries_429 = 2  # maximum retries for HTTP 429
    max_retries_other = 1  # maximum retries for any other request error
    attempt = 0

    while True:
        # Reset on every attempt: otherwise a failure that produces no
        # response (e.g. a connection error) would be handled using the
        # status code left over from the previous attempt.
        response = None
        try:
            response = requests.post(url, headers=headers, json=data, timeout=timeout)
            response.raise_for_status()  # raises HTTPError for 4xx/5xx status codes

            response_json = response.json()
            content = response_json["choices"][0]["message"]["content"]
            # A missing usage block should not discard an otherwise valid reply.
            usage = response_json.get("usage") or {}
            completion_tokens = usage.get("completion_tokens", 0)

            if need_extra:
                return content, completion_tokens
            return content

        except requests.exceptions.RequestException:
            status_code = response.status_code if response is not None else None
            print(f"请求失败,状态码: {status_code}")
            print("请求失败,完整的响应内容如下:")
            if response is not None:
                print(response.text)  # raw body; may or may not be JSON

            if status_code == 429:
                if attempt >= max_retries_429:
                    print(f"状态码为 429,已达到最大重试次数 {max_retries_429} 次。")
                    break
                wait_time = 2 if attempt == 0 else 4
                print(f"状态码为 429,等待 {wait_time} 秒后重试...")
                time.sleep(wait_time)
            else:
                if attempt >= max_retries_other:
                    print(f"非 429 错误,已达到最大重试次数 {max_retries_other} 次。")
                    break
                print("非 429 错误,等待 1 秒后重试...")
                time.sleep(1)

        attempt += 1

    # Reaching this point means every attempt failed.
    print("请求失败,已达到最大重试次数。")
    if need_extra:
        return None, 0
    return None
|
|
@ -241,21 +241,12 @@ def reorganize_data(input_dict, include=None):
|
|||||||
reorganized["商务评分"][package] = categories["商务评分"]
|
reorganized["商务评分"][package] = categories["商务评分"]
|
||||||
|
|
||||||
return reorganized
|
return reorganized
|
||||||
# 格式要求:
|
|
||||||
# 请以 JSON 格式返回结果,最外层键名为 '技术评分'、'商务评分' 和 '投标报价评分'。在每大项下,用键值对表示具体评分项,键为具体的评审因素,若评审因素存在嵌套(表格中存在层级),请使用嵌套键值对表示,具体规则如下:
|
|
||||||
# 1. 如果评审因素存在嵌套,使用嵌套键值对表示:
|
|
||||||
# -主评审因素的键名后需附加括号,表示该主因素下所有子因素总分,例如:产品技术响应(8分)
|
|
||||||
# -子评审因素作为主评审因素的内层键名
|
|
||||||
# 2. 如果评审因素不存在嵌套,那么键名就是该评审因素
|
|
||||||
# 3. 每个评审因素的最内层键值都是列表,列表中包含描述评分及要求的字典,字典需包含以下键:
|
|
||||||
# '评分':具体得分或定性指标(如 '合格制'),无评分时可删去'评分'键值对。
|
|
||||||
# '要求':说明评分标准。
|
|
||||||
# 4.若这三大项评分中存在额外信息(不属于某个评审因素,即该大项评分的整体要求),在该评分项内部新增键名为'备注',值为该要求。
|
|
||||||
def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
||||||
# 定义默认的评审结果字典
|
# 定义默认的评审结果字典
|
||||||
DEFAULT_EVALUATION_REVIEW = {
|
DEFAULT_EVALUATION_REVIEW = {
|
||||||
"技术评分": "",
|
"技术评分": "未解析到'技术评分'项!",
|
||||||
"商务评分": ""
|
"商务评分": "未解析到'商务评分'项!"
|
||||||
}
|
}
|
||||||
# 如果 truncate_file 是空字符串,直接返回包含空字符串的字典
|
# 如果 truncate_file 是空字符串,直接返回包含空字符串的字典
|
||||||
if not evaluation_method_path:
|
if not evaluation_method_path:
|
||||||
|
@ -269,10 +269,9 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
|||||||
for i, page in enumerate(pdf_document.pages[begin_page:end_limit], start=begin_page):
|
for i, page in enumerate(pdf_document.pages[begin_page:end_limit], start=begin_page):
|
||||||
text = page.extract_text() or ""
|
text = page.extract_text() or ""
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
if output_suffix == "tobidders_notice":
|
if output_suffix == "tobidders_notice2": #extract_pages_twice_tobidders_notice会调用该代码
|
||||||
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
||||||
if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern,
|
if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern,cleaned_text):
|
||||||
cleaned_text):
|
|
||||||
flag = False
|
flag = False
|
||||||
continue
|
continue
|
||||||
if begin_page == 0 and catalog_pattern.search(cleaned_text):
|
if begin_page == 0 and catalog_pattern.search(cleaned_text):
|
||||||
@ -287,14 +286,17 @@ def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
|||||||
start_page = i
|
start_page = i
|
||||||
continue
|
continue
|
||||||
if start_page is not None:
|
if start_page is not None:
|
||||||
if output_suffix == "tobidders_notice" or output_suffix == 'notice': # 因为投标人须知前附表中经常出现'见招标公告',导致被not regex.search(begin_pattern, cleaned_text)pass掉
|
if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]: # 使用列表判断多个匹配项
|
||||||
|
# 判断 end_pattern 是否匹配且当前页大于起始页
|
||||||
if regex.search(end_pattern, cleaned_text) and i > start_page:
|
if regex.search(end_pattern, cleaned_text) and i > start_page:
|
||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
# 如果 end_pattern 匹配且 begin_pattern 不匹配
|
||||||
if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
|
if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
|
||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
|
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
|
|
||||||
|
|
||||||
@ -410,7 +412,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
|||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
|
|
||||||
# 如果已经找到中间页,且当前页匹配排除模式,则跳过
|
# 如果已经找到中间页,且当前页匹配排除模式,则跳过
|
||||||
if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
|
if exclusion_pattern and mid_page is not None and regex.search(exclusion_pattern, cleaned_text):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 如果为第 0 页之后的目录,直接跳过
|
# 如果为第 0 页之后的目录,直接跳过
|
||||||
@ -532,6 +534,7 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page
|
|||||||
return start_page1, end_page1, end_page1
|
return start_page1, end_page1, end_page1
|
||||||
|
|
||||||
# 如果不包含排除关键词,继续提取第二部分
|
# 如果不包含排除关键词,继续提取第二部分
|
||||||
|
output_suffix='tobidders_notice2'
|
||||||
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
_, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
|
||||||
exclusion_pattern, output_suffix)
|
exclusion_pattern, output_suffix)
|
||||||
|
|
||||||
|
@ -377,7 +377,7 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
|
|||||||
if clause_path and clause_path.strip():
|
if clause_path and clause_path.strip():
|
||||||
with open(clause_path, 'r', encoding='utf-8') as file:
|
with open(clause_path, 'r', encoding='utf-8') as file:
|
||||||
data = json.load(file)
|
data = json.load(file)
|
||||||
if len(data) >= 60:
|
if len(data) >= 60: #默认clause中少于60条视为json提取失败!
|
||||||
# 尝试使用大章节筛选
|
# 尝试使用大章节筛选
|
||||||
extracted_data = extract_between_sections(data, target_values,flag)
|
extracted_data = extract_between_sections(data, target_values,flag)
|
||||||
if extracted_data:
|
if extracted_data:
|
||||||
@ -406,7 +406,7 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
|
||||||
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf"
|
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf"
|
||||||
clause_path=r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\clause1.json"
|
clause_path=r"D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp\clause1.json"
|
||||||
try:
|
try:
|
||||||
res = extract_from_notice(merged_baseinfo_path,clause_path, 2) # 可以改变此处的 type 参数测试不同的场景
|
res = extract_from_notice(merged_baseinfo_path,clause_path, 2) # 可以改变此处的 type 参数测试不同的场景
|
||||||
res2 = json.dumps(res, ensure_ascii=False, indent=4)
|
res2 = json.dumps(res, ensure_ascii=False, indent=4)
|
||||||
|
@ -5,6 +5,8 @@ import regex
|
|||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
from flask_app.货物标.截取pdf货物标版 import clean_page_content, extract_common_header
|
from flask_app.货物标.截取pdf货物标版 import clean_page_content, extract_common_header
|
||||||
|
|
||||||
|
|
||||||
def compare_headings(current, new):
|
def compare_headings(current, new):
|
||||||
"""
|
"""
|
||||||
比较两个标题的层次关系,并确保新标题比当前标题大且最高位数字差值不超过5。
|
比较两个标题的层次关系,并确保新标题比当前标题大且最高位数字差值不超过5。
|
||||||
@ -41,6 +43,7 @@ def should_add_newline(content, keywords, max_length=20):
|
|||||||
content_str = ''.join(content).strip()
|
content_str = ''.join(content).strip()
|
||||||
return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
|
return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
|
||||||
|
|
||||||
|
|
||||||
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
|
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
|
||||||
if append_newline:
|
if append_newline:
|
||||||
if should_add_newline(current_content, keywords):
|
if should_add_newline(current_content, keywords):
|
||||||
@ -51,6 +54,7 @@ def handle_content_append(current_content, line_content, append_newline, keyword
|
|||||||
current_content.append('\n')
|
current_content.append('\n')
|
||||||
return append_newline
|
return append_newline
|
||||||
|
|
||||||
|
|
||||||
def parse_text_by_heading(text):
|
def parse_text_by_heading(text):
|
||||||
keywords = ['包含', '以下']
|
keywords = ['包含', '以下']
|
||||||
data = {}
|
data = {}
|
||||||
@ -66,12 +70,12 @@ def parse_text_by_heading(text):
|
|||||||
# 定义所有需要的正则表达式模式
|
# 定义所有需要的正则表达式模式
|
||||||
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') # 一、
|
pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') # 一、
|
||||||
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') # (一)
|
pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') # (一)
|
||||||
pattern_letter_initial = re.compile(r'^([A-Z])[..、]\s*(.*)$') # 初始扫描时匹配 A内容 或 A. 内容
|
pattern_letter_initial = re.compile(r'^([A-Z])[..、]?\s*(.*)$') # 初始扫描时匹配 A内容 或 A. 内容
|
||||||
pattern_letter = re.compile(r'^([A-Z])[..、]\s*(.*)$') # 主循环中严格匹配 A. 内容
|
|
||||||
# pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$') # 1、内容
|
# pattern_arabic = re.compile(r'^(\d+、)\s*(.+)$') # 1、内容
|
||||||
|
|
||||||
initial_heading_pattern = None
|
initial_heading_pattern = None
|
||||||
special_section_keywords = ['文件的组成', '文件的构成','文件包括:','文件包括:','雷同认定','包括以下内容'] # 定义特殊章节关键词
|
special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:', '雷同认定',
|
||||||
|
'包括以下内容'] # 定义特殊章节关键词
|
||||||
in_special_section = False # 标志是否在特殊章节中
|
in_special_section = False # 标志是否在特殊章节中
|
||||||
lines = text.split('\n')
|
lines = text.split('\n')
|
||||||
|
|
||||||
@ -103,17 +107,15 @@ def parse_text_by_heading(text):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# 预先扫描前5行,检查是否有匹配任何标题模式 '一、总则' 这种
|
# 预先扫描前5行,检查是否有匹配任何标题模式 '一、总则' 这种,大标题夹着序号
|
||||||
first_five_lines = lines[:5]
|
first_five_lines = lines[:5]
|
||||||
has_initial_heading_patterns = False
|
has_initial_heading_patterns = False
|
||||||
for line in first_five_lines:
|
for line in first_five_lines:
|
||||||
line_stripped = line.strip().replace('.', '.')
|
line_stripped = line.strip().replace('.', '.')
|
||||||
if line_stripped.startswith("##"):
|
if line_stripped.startswith("##"):
|
||||||
line_stripped = line_stripped[2:] # Remove "##"
|
line_stripped = line_stripped[2:] # Remove "##"
|
||||||
if (pattern_numbered.match(line_stripped) or
|
if (pattern_numbered.match(line_stripped) or pattern_parentheses.match(
|
||||||
pattern_parentheses.match(line_stripped) or
|
line_stripped) or pattern_letter_initial.match(line_stripped)):
|
||||||
pattern_letter_initial.match(line_stripped)
|
|
||||||
):
|
|
||||||
has_initial_heading_patterns = True
|
has_initial_heading_patterns = True
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -183,7 +185,8 @@ def parse_text_by_heading(text):
|
|||||||
append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
||||||
last_main_number = new_key.split('.')[0]
|
last_main_number = new_key.split('.')[0]
|
||||||
else:
|
else:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||||
|
in_special_section)
|
||||||
|
|
||||||
elif dot_match:
|
elif dot_match:
|
||||||
if in_double_hash_mode:
|
if in_double_hash_mode:
|
||||||
@ -199,7 +202,8 @@ def parse_text_by_heading(text):
|
|||||||
current_content = [line_content]
|
current_content = [line_content]
|
||||||
append_newline = True
|
append_newline = True
|
||||||
else:
|
else:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||||
|
in_special_section)
|
||||||
else:
|
else:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||||
in_special_section)
|
in_special_section)
|
||||||
@ -265,13 +269,18 @@ def parse_text_by_heading(text):
|
|||||||
last_main_number = new_key_candidate.rstrip('.')
|
last_main_number = new_key_candidate.rstrip('.')
|
||||||
else:
|
else:
|
||||||
# 将当前行视为当前标题的内容
|
# 将当前行视为当前标题的内容
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||||
|
in_special_section)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# 根据预先设置的标志决定是否执行这部分代码
|
# 根据预先设置的标志决定是否执行这部分代码
|
||||||
if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
|
if has_initial_heading_patterns and not skip_subheadings and not in_special_section:
|
||||||
numbered_match = pattern_numbered.match(line_stripped) # 一、
|
numbered_match = pattern_numbered.match(line_stripped) # 一、
|
||||||
parentheses_match = pattern_parentheses.match(line_stripped) # (一)
|
parentheses_match = pattern_parentheses.match(line_stripped) # (一)
|
||||||
|
if i < 5:
|
||||||
|
pattern_letter = pattern_letter_initial
|
||||||
|
else:
|
||||||
|
pattern_letter = re.compile(r'^([A-Z])[..、]\s*(.*)$')
|
||||||
letter_match = pattern_letter.match(line_stripped) # A. 内容
|
letter_match = pattern_letter.match(line_stripped) # A. 内容
|
||||||
# arabic_match = pattern_arabic.match(line_stripped) #1、内容
|
# arabic_match = pattern_arabic.match(line_stripped) #1、内容
|
||||||
|
|
||||||
@ -286,7 +295,6 @@ def parse_text_by_heading(text):
|
|||||||
elif letter_match:
|
elif letter_match:
|
||||||
initial_heading_pattern = 'letter'
|
initial_heading_pattern = 'letter'
|
||||||
|
|
||||||
|
|
||||||
# 确定当前匹配的标题模式
|
# 确定当前匹配的标题模式
|
||||||
if numbered_match:
|
if numbered_match:
|
||||||
current_heading_pattern = 'numbered'
|
current_heading_pattern = 'numbered'
|
||||||
@ -295,7 +303,6 @@ def parse_text_by_heading(text):
|
|||||||
elif letter_match:
|
elif letter_match:
|
||||||
current_heading_pattern = 'letter'
|
current_heading_pattern = 'letter'
|
||||||
|
|
||||||
|
|
||||||
# 如果当前标题模式与初始标题模式一致,创建新的键值对
|
# 如果当前标题模式与初始标题模式一致,创建新的键值对
|
||||||
if current_heading_pattern == initial_heading_pattern:
|
if current_heading_pattern == initial_heading_pattern:
|
||||||
new_key_chinese = None
|
new_key_chinese = None
|
||||||
@ -343,15 +350,18 @@ def parse_text_by_heading(text):
|
|||||||
else:
|
else:
|
||||||
# 当前标题模式与初始模式不一致,将该行视为内容
|
# 当前标题模式与初始模式不一致,将该行视为内容
|
||||||
if line_stripped:
|
if line_stripped:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline,
|
||||||
|
keywords, in_special_section)
|
||||||
else:
|
else:
|
||||||
# 未匹配到任何标题模式,将该行视为内容
|
# 未匹配到任何标题模式,将该行视为内容
|
||||||
if line_stripped:
|
if line_stripped:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||||
|
in_special_section)
|
||||||
else:
|
else:
|
||||||
# 在特殊章节中,所有内容都作为当前标题的内容
|
# 在特殊章节中,所有内容都作为当前标题的内容
|
||||||
if line_stripped:
|
if line_stripped:
|
||||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
|
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||||
|
in_special_section)
|
||||||
|
|
||||||
# 最后保存最后一个 key 对应的内容
|
# 最后保存最后一个 key 对应的内容
|
||||||
if current_key is not None:
|
if current_key is not None:
|
||||||
@ -418,6 +428,7 @@ def extract_text_from_pdf(file_path, end_pattern, start_pattern_1, start_pattern
|
|||||||
full_text = "\n".join(all_pages_text)
|
full_text = "\n".join(all_pages_text)
|
||||||
return full_text
|
return full_text
|
||||||
|
|
||||||
|
|
||||||
def convert_clause_to_json(file_path, output_folder, type=1):
|
def convert_clause_to_json(file_path, output_folder, type=1):
|
||||||
if not os.path.exists(file_path):
|
if not os.path.exists(file_path):
|
||||||
print(f"The specified file does not exist: 返回空的clause_path")
|
print(f"The specified file does not exist: 返回空的clause_path")
|
||||||
@ -438,8 +449,10 @@ def convert_clause_to_json(file_path,output_folder,type=1):
|
|||||||
)
|
)
|
||||||
start_pattern_2 = (
|
start_pattern_2 = (
|
||||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前面的“见”,“与”,“”等字符
|
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)' # 否定前面的“见”,“与”,“”等字符
|
||||||
r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$' # 匹配“投标人”,“磋商”,“谈判”,“供应商”,“应答人”后跟“须知”
|
r'(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$|' # 匹配“投标人”,“磋商”,“谈判”,“供应商”,“应答人”后跟“须知”
|
||||||
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 添加“第X章/部分”部分
|
||||||
)
|
)
|
||||||
|
|
||||||
end_pattern = (
|
end_pattern = (
|
||||||
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
r'^(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
|
||||||
r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
|
r'(?<!见)(?<!与)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'
|
||||||
@ -466,15 +479,15 @@ def convert_clause_to_json(file_path,output_folder,type=1):
|
|||||||
print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
|
print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp\ztbfile_tobidders_notice_part2.pdf'
|
file_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444\招标文件111_tobidders_notice_part2.pdf'
|
||||||
# file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
|
# file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf'
|
||||||
# file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
|
# file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf'
|
||||||
output_folder = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp'
|
output_folder = r'C:\Users\Administrator\Desktop\货物标\zbfiles\output4444'
|
||||||
try:
|
try:
|
||||||
output_path = convert_clause_to_json(file_path, output_folder)
|
output_path = convert_clause_to_json(file_path, output_folder)
|
||||||
print(f"Final JSON result saved to: {output_path}")
|
print(f"Final JSON result saved to: {output_path}")
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
print("Error:", e)
|
print("Error:", e)
|
||||||
|
|
||||||
|
@ -496,7 +496,7 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
|
|||||||
# print(all_texts)
|
# print(all_texts)
|
||||||
# Proceed only if there is content to write
|
# Proceed only if there is content to write
|
||||||
selected_contents = {}
|
selected_contents = {}
|
||||||
final_list=[]
|
final_list=[f"未解析到'{result_key}'!"]
|
||||||
seen_contents = set() # 使用集合跟踪已添加的内容以去重
|
seen_contents = set() # 使用集合跟踪已添加的内容以去重
|
||||||
if all_texts1_list:
|
if all_texts1_list:
|
||||||
with open(output_file, 'w', encoding='utf-8') as file:
|
with open(output_file, 'w', encoding='utf-8') as file:
|
||||||
@ -536,7 +536,8 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
|
|||||||
return {result_key: final_list}
|
return {result_key: final_list}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
|
print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
|
||||||
return {result_key: []}
|
return {result_key: [f"未解析到'{result_key}'!"]}
|
||||||
|
|
||||||
|
|
||||||
def combine_find_invalid(invalid_docpath, output_dir):
|
def combine_find_invalid(invalid_docpath, output_dir):
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
@ -273,8 +273,6 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
|||||||
#TODO: ec7d5328-9c57-450f-baf4-2e5a6f90ed1d
|
#TODO: ec7d5328-9c57-450f-baf4-2e5a6f90ed1d
|
||||||
|
|
||||||
#TODO:
|
#TODO:
|
||||||
# D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3排查一下 clause 有问题+
|
|
||||||
# C:\Users\Administrator\Desktop\fsdownload\bbf7504f-3c75-45e5-b3e2-ab0a15ec9c14
|
|
||||||
# 解决禅道 测试的bug
|
# 解决禅道 测试的bug
|
||||||
# 货物标和工程标的资格审查整合
|
# 货物标和工程标的资格审查整合
|
||||||
##TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf
|
##TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf
|
||||||
|
@ -315,12 +315,12 @@ if __name__ == "__main__":
|
|||||||
logger = get_global_logger("123")
|
logger = get_global_logger("123")
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||||
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件111.pdf"
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||||
output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output111"
|
output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output4444"
|
||||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||||
selection = 1 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||||
print(generated_files)
|
print(generated_files)
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user