2.6 结构整理、增加了cache命中缓存
This commit is contained in:
parent
d79ca21f69
commit
19f7d7b38b
4
.idea/encodings.xml
generated
4
.idea/encodings.xml
generated
@ -15,5 +15,9 @@
|
||||
<file url="file://$PROJECT_DIR$/flask_app/static/output/output1/4cadda82-be0c-4c6e-be4a-89d42e6960b4/log.txt" charset="GBK" />
|
||||
<file url="file://$PROJECT_DIR$/flask_app/static/output/output1/64f3b2a7-9a80-4311-8380-2190c9d17420/log.txt" charset="GBK" />
|
||||
<file url="file://$PROJECT_DIR$/flask_app/static/output/output1/aedb03fa-b23c-4cb2-9168-386c7c2a9a32/log.txt" charset="GBK" />
|
||||
<file url="file://$PROJECT_DIR$/flask_app/static/output/output4/0ca584a6-92e4-497d-92b4-07493c23c25b/log.txt" charset="GBK" />
|
||||
<file url="file://$PROJECT_DIR$/flask_app/static/output/output4/0db88043-f658-4128-83e1-4d9709264996/log.txt" charset="GBK" />
|
||||
<file url="file://$PROJECT_DIR$/flask_app/static/output/output4/0fa4ec3f-5492-4de0-8a64-a4601d58aedd/log.txt" charset="GBK" />
|
||||
<file url="file://$PROJECT_DIR$/flask_app/static/output/output4/3a12ea82-c87d-40ab-814c-796af42fd55a/log.txt" charset="GBK" />
|
||||
</component>
|
||||
</project>
|
@ -1,9 +1,7 @@
|
||||
import re
|
||||
|
||||
import fitz
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
|
||||
def extract_common_header(pdf_path):
|
||||
"""
|
||||
提取 PDF 文件的公共页眉。
|
||||
|
@ -1,54 +1,108 @@
|
||||
from docx import Document
|
||||
import re
|
||||
import os
|
||||
import copy
|
||||
import regex
|
||||
from docx import Document
|
||||
from docx.text.paragraph import Paragraph
|
||||
from docx.table import Table
|
||||
|
||||
def iter_block_items(parent):
    """
    Yield the block-level items of ``parent`` in document order.

    Each ``w:p`` child is yielded as a ``Paragraph`` and each ``w:tbl`` child
    as a ``Table``; any other child element (e.g. section properties) is
    skipped.

    Args:
        parent: Either a document object exposing ``element.body`` or a table
            cell object exposing ``_tc`` (python-docx ``_Cell``).

    Yields:
        Paragraph or Table proxies bound to ``parent``.
    """
    # A table cell keeps its content directly under its <w:tc> element, while
    # a document keeps it under <w:body>. Cells are detected by their ``_tc``
    # attribute so that no extra import is needed.
    if hasattr(parent, '_tc'):
        parent_elm = parent._tc
    else:
        parent_elm = parent.element.body

    for child in parent_elm.iterchildren():
        # Tags are namespace-qualified ("{namespace}p"), so match on the suffix.
        tag = child.tag.lower()
        if tag.endswith('}p'):
            yield Paragraph(child, parent)
        elif tag.endswith('}tbl'):
            yield Table(child, parent)
|
||||
|
||||
#暂用不上,因为需要pdf文件->每页后面打标记->转docx。后面这一步钱无法省。
|
||||
def copy_docx(source_path):
|
||||
doc = Document(source_path) # 打开源文档
|
||||
"""
|
||||
从文档中截取内容:
|
||||
- 从第一个匹配 begin_pattern 的段落开始(按文档顺序)
|
||||
- 到最后一个匹配 end_pattern 的段落结束(按文档倒序匹配)
|
||||
如果没有匹配到 begin_pattern,则默认从第一段开始;
|
||||
如果没有匹配到 end_pattern,则默认到最后一段结束;
|
||||
同时确保匹配到的起始段落一定在结束段落之前,
|
||||
否则也采用默认的整个文档范围。
|
||||
同时保留中间的所有块级对象(包括文本、表格等),
|
||||
并尽量保持原来的格式、样式不变。
|
||||
"""
|
||||
# 打开源文档,并构建新文档
|
||||
doc = Document(source_path)
|
||||
output_folder = os.path.dirname(source_path)
|
||||
|
||||
# 获取原文件名并添加后缀
|
||||
original_file_name = os.path.basename(source_path)
|
||||
file_name_without_ext, file_ext = os.path.splitext(original_file_name)
|
||||
modified_file_name = file_name_without_ext + "_invalid" + file_ext
|
||||
destination_path = os.path.join(output_folder, modified_file_name)
|
||||
new_doc = Document()
|
||||
|
||||
new_doc = Document() # 创建新文档
|
||||
# 定义正则表达式(使用 regex 模块):
|
||||
begin_pattern = regex.compile(
|
||||
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
|
||||
r'(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
end_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
|
||||
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|'
|
||||
r'[::]清标报告|'
|
||||
r'\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
# 定义正则表达式模式
|
||||
begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
|
||||
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|:清标报告|:清标报告')
|
||||
# 遍历所有块级对象(段落和表格)
|
||||
block_items = list(iter_block_items(doc))
|
||||
|
||||
# 寻找最后一个begin_pattern的位置
|
||||
last_begin_index = -1
|
||||
for i, paragraph in enumerate(doc.paragraphs):
|
||||
|
||||
if begin_pattern.search(paragraph.text):
|
||||
last_begin_index = i
|
||||
|
||||
# 从最后一个匹配的begin_pattern开始复制,直到end_pattern
|
||||
if last_begin_index != -1:
|
||||
for i, paragraph in enumerate(doc.paragraphs[last_begin_index:], start=last_begin_index):
|
||||
new_para = new_doc.add_paragraph(style=paragraph.style)
|
||||
for run in paragraph.runs:
|
||||
new_run = new_para.add_run(run.text)
|
||||
new_run.bold = run.bold
|
||||
new_run.italic = run.italic
|
||||
new_run.underline = run.underline
|
||||
if run.font.color:
|
||||
new_run.font.color.rgb = run.font.color.rgb
|
||||
new_run.font.size = run.font.size
|
||||
|
||||
if end_pattern.search(paragraph.text):
|
||||
# 从前向后查找第一个匹配 begin_pattern 的段落索引
|
||||
begin_index = None
|
||||
for idx, block in enumerate(block_items):
|
||||
if isinstance(block, Paragraph):
|
||||
if begin_pattern.search(block.text):
|
||||
begin_index = idx
|
||||
break
|
||||
|
||||
new_doc.save(destination_path) # 保存新文档
|
||||
print("docx截取docx成功!")
|
||||
# 从后向前查找第一个匹配 end_pattern 的段落索引
|
||||
end_index = None
|
||||
for idx in range(len(block_items) - 1, -1, -1):
|
||||
block = block_items[idx]
|
||||
if isinstance(block, Paragraph):
|
||||
if end_pattern.search(block.text):
|
||||
end_index = idx
|
||||
break
|
||||
|
||||
# 根据匹配结果确定截取范围
|
||||
if begin_index is None and end_index is None:
|
||||
# 两者都没匹配到,采用整个文档
|
||||
begin_index = 0
|
||||
end_index = len(block_items) - 1
|
||||
elif begin_index is None:
|
||||
# 没有匹配到开头,则从第一块开始,但保留实际找到的 end_index
|
||||
begin_index = 0
|
||||
elif end_index is None:
|
||||
# 没有匹配到结尾,则到最后一块结束,但保留实际找到的 begin_index
|
||||
end_index = len(block_items) - 1
|
||||
|
||||
# 如果匹配到的起始段落在结束段落之后,则采用整个文档范围
|
||||
if begin_index > end_index:
|
||||
begin_index = 0
|
||||
end_index = len(block_items) - 1
|
||||
|
||||
# 将 begin_index 到 end_index 之间的所有块(包括段落和表格)复制到新文档中
|
||||
for block in block_items[begin_index : end_index + 1]:
|
||||
# 直接将原块的 XML 进行 deep copy 后添加到新文档的 body 中
|
||||
new_doc._body._element.append(copy.deepcopy(block._element))
|
||||
|
||||
new_doc.save(destination_path)
|
||||
print("docx截取成功,已保存到:", destination_path)
|
||||
return destination_path
|
||||
|
||||
|
||||
# 调用函数
|
||||
# 测试调用
|
||||
if __name__ == '__main__':
|
||||
source_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
|
||||
source_path = r"C:\Users\Administrator\Desktop\货物标\zbfilesdocx\6_2定版视频会议磋商文件.docx"
|
||||
res = copy_docx(source_path)
|
||||
print(res)
|
@ -1,5 +1,4 @@
|
||||
import os.path
|
||||
|
||||
import requests
|
||||
import json
|
||||
|
||||
|
@ -1,5 +1,3 @@
|
||||
import re
|
||||
|
||||
"""
|
||||
处理全角点号->半角点号
|
||||
处理逗号分隔的数字: 你的当前代码无法处理包含逗号的数字,例如 "1,000元"。可以在提取数字之前移除逗号。
|
||||
|
@ -3,7 +3,6 @@ import os
|
||||
import mimetypes
|
||||
import requests
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
from flask_app.general.clean_pdf import is_scanned_pdf
|
||||
def download_file(url, local_filename):
|
||||
"""
|
||||
@ -74,7 +73,7 @@ def download_file(url, local_filename):
|
||||
|
||||
return None,4
|
||||
|
||||
def upload_file(file_path, url):
|
||||
def local_file_2_url(file_path, url):
|
||||
receive_file_url = ""
|
||||
# 定义文件名和路径
|
||||
filename = file_path.split('/')[-1]
|
||||
@ -128,7 +127,7 @@ def pdf2docx(local_path_in):
|
||||
return docx_file_path
|
||||
|
||||
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d'
|
||||
receive_download_url = upload_file(local_path_in, remote_url) # 转换完成,得到下载链接
|
||||
receive_download_url = local_file_2_url(local_path_in, remote_url) # 转换完成,得到下载链接
|
||||
|
||||
local_filename = os.path.join(folder,filename) # 输出文件名 C:\Users\Administrator\Desktop\货物标\zbfiles\6.2定版视频会议磋商文件 不带后缀
|
||||
downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
|
||||
@ -148,7 +147,7 @@ def doc2docx(local_path_in):
|
||||
return docx_file_path
|
||||
|
||||
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
|
||||
receive_download_url = upload_file(local_path_in, remote_url)
|
||||
receive_download_url = local_file_2_url(local_path_in, remote_url)
|
||||
print(receive_download_url)
|
||||
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
||||
local_filename = os.path.join(folder, filename) # 输出文件名
|
||||
@ -180,7 +179,7 @@ def docx2pdf(local_path_in,force=False):
|
||||
print(f"跳过转换,文件已存在: {pdf_file_path}")
|
||||
return pdf_file_path # 跳过转换
|
||||
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
|
||||
receive_download_url = upload_file(local_path_in, remote_url)
|
||||
receive_download_url = local_file_2_url(local_path_in, remote_url)
|
||||
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
||||
local_filename = os.path.join(folder, filename) # 输出文件名
|
||||
downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
|
||||
@ -201,101 +200,6 @@ def get_pdf_page_count(file_path):
|
||||
print(f"读取 PDF 页码时出错:{e}")
|
||||
return 0
|
||||
|
||||
# def docx2pdf(file_path):
|
||||
# """
|
||||
# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
|
||||
#
|
||||
# 参数:
|
||||
# - file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。
|
||||
# """
|
||||
# # 检查文件是否存在
|
||||
# if not file_path:
|
||||
# return ""
|
||||
# # 获取文件名和扩展名
|
||||
# base_name = os.path.basename(file_path)
|
||||
# name, ext = os.path.splitext(base_name)
|
||||
# ext = ext.lower().lstrip('.')
|
||||
#
|
||||
# if ext not in ['docx', 'doc']:
|
||||
# raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")
|
||||
#
|
||||
# # 定义转换接口
|
||||
# endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
|
||||
# # endpoint = 'http://192.168.0.2:5008/convert_to_pdf'
|
||||
#
|
||||
# # 获取文件所在目录
|
||||
# output_dir = os.path.dirname(file_path)
|
||||
#
|
||||
# # 准备上传的文件
|
||||
# with open(file_path, 'rb') as f:
|
||||
# files = {'file': (base_name, f)}
|
||||
# try:
|
||||
# print(f"正在将 {base_name} 转换为 .pdf 格式...")
|
||||
# response = requests.post(endpoint, files=files)
|
||||
# response.raise_for_status() # 检查请求是否成功
|
||||
# except requests.RequestException as e:
|
||||
# print(f"转换过程中发生错误: {e}")
|
||||
# return
|
||||
#
|
||||
# # 准备保存转换后文件的路径
|
||||
# output_file_name = f"{name}.pdf"
|
||||
# output_path = os.path.join(output_dir, output_file_name)
|
||||
#
|
||||
# # 保存转换后的文件
|
||||
# with open(output_path, 'wb') as out_file:
|
||||
# out_file.write(response.content)
|
||||
#
|
||||
# print(f"文件已成功转换并保存至: {output_path}")
|
||||
# return output_path
|
||||
#
|
||||
#
|
||||
# def doc2docx(file_path):
|
||||
# """
|
||||
# 将本地的 .doc 文件转换为 .docx 文件。
|
||||
#
|
||||
# 参数:
|
||||
# - file_path: str, 本地文件的路径,支持 .doc 格式。
|
||||
# """
|
||||
# # 检查文件是否存在
|
||||
# if not file_path:
|
||||
# return ""
|
||||
# # 获取文件名和扩展名
|
||||
# base_name = os.path.basename(file_path)
|
||||
# name, ext = os.path.splitext(base_name)
|
||||
# ext = ext.lower().lstrip('.')
|
||||
#
|
||||
# if ext != 'doc':
|
||||
# raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")
|
||||
#
|
||||
# # 定义转换接口
|
||||
# endpoint = 'http://120.26.236.97:5008/convert_to_docx'
|
||||
#
|
||||
# # 获取文件所在目录
|
||||
# output_dir = os.path.dirname(file_path)
|
||||
#
|
||||
# # 准备上传的文件
|
||||
# with open(file_path, 'rb') as f:
|
||||
# files = {'file': (base_name, f)}
|
||||
# try:
|
||||
# print(f"正在将 {base_name} 转换为 .docx 格式...")
|
||||
# response = requests.post(endpoint, files=files)
|
||||
# response.raise_for_status() # 检查请求是否成功
|
||||
# except requests.RequestException as e:
|
||||
# print(f"转换过程中发生错误: {e}")
|
||||
# return
|
||||
#
|
||||
# # 准备保存转换后文件的路径
|
||||
# output_file_name = f"{name}.docx"
|
||||
# output_path = os.path.join(output_dir, output_file_name)
|
||||
#
|
||||
# # 保存转换后的文件
|
||||
# with open(output_path, 'wb') as out_file:
|
||||
# out_file.write(response.content)
|
||||
#
|
||||
# print(f"文件已成功转换并保存至: {output_path}")
|
||||
# return output_path
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# 替换为你的文件路径和API URL
|
||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
|
||||
|
0
flask_app/general/llm/__init__.py
Normal file
0
flask_app/general/llm/__init__.py
Normal file
75
flask_app/general/llm/deepseek.py
Normal file
75
flask_app/general/llm/deepseek.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
import json
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
def deepseek(user_query, need_extra=False):
    """
    Ask the deepseek model a single question and return the streamed answer.

    The request goes through the OpenAI-compatible DashScope endpoint using
    the API key read from the ``DASHSCOPE_API_KEY`` environment variable.

    Args:
        user_query (str): The user's prompt, sent as a single user message.
        need_extra (bool): When True, also return the ``completion_tokens``
            value reported in the stream's usage data. Defaults to False.

    Returns:
        ``(full_response, completion_tokens)`` when ``need_extra`` is True,
        otherwise just ``full_response``.
    """
    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    # Streaming request; include_usage makes the stream carry usage data.
    stream = client.chat.completions.create(
        model="deepseek-r1-distill-qwen-32b",
        temperature=0.5,
        messages=[{'role': 'user', 'content': user_query}],
        stream=True,
        stream_options={"include_usage": True},
    )

    pieces = []
    completion_tokens = 0

    for chunk in stream:
        # Prefer to_dict(); fall back to a JSON round-trip when it is missing.
        if hasattr(chunk, 'to_dict'):
            chunk_data = chunk.to_dict()
        else:
            chunk_data = json.loads(chunk.model_dump_json())

        usage = chunk_data.get('usage')
        if usage is not None:
            completion_tokens = usage.get('completion_tokens', 0)

        choices = chunk_data.get('choices', [])
        if not choices:
            continue

        content = choices[0].get('delta', {}).get('content', '')
        if content:
            pieces.append(content)
            # Echo each fragment as it arrives (comment out to silence).
            print(content, end='', flush=True)

    full_response = "".join(pieces)
    if need_extra:
        return full_response, completion_tokens
    return full_response
|
||||
|
||||
|
||||
# 示例调用
|
||||
if __name__ == "__main__":
|
||||
query = "1+1等于几?请用json格式回答,键名为'答案',键值为你的回答"
|
||||
result = deepseek(query, need_extra=True)
|
||||
print("\n完整内容为:", result)
|
@ -2,134 +2,10 @@ import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import fitz
|
||||
import PyPDF2
|
||||
import tempfile
|
||||
import requests
|
||||
from ratelimit import sleep_and_retry, limits
|
||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||||
from flask_app.general.table_ocr import CommonOcr
|
||||
|
||||
# 调用豆包对json形式的表格数据进行重构
|
||||
def extract_img_table_text(ocr_result_pages):
|
||||
print(ocr_result_pages)
|
||||
base_prompt = '''
|
||||
任务:你负责解析以json形式输入的表格信息,根据提供的文件内容来恢复表格信息并输出,不要遗漏任何一个文字。
|
||||
|
||||
要求与指南:
|
||||
1. 请运用文档表格理解能力,根据文件内容定位表格的每列和每行,列与列之间用丨分隔,若某列或者某行没有信息则用/填充。
|
||||
2. 请不要遗漏任何一个文字,同时不要打乱行与行之间的顺序,也不要打乱列与列之间的顺序,严格按照文字的位置信息来恢复表格信息。
|
||||
示例输出:
|
||||
表格标题:
|
||||
|序号|名称|数量|单位|单价(元)|总价(元)|技术参数|备注|
|
||||
|形象展示区|
|
||||
|1|公园主E题雕塑|1|套|/|/|根据江夏体育与文化元素定制|/|
|
||||
|
||||
..........
|
||||
'''
|
||||
base_prompt += f"\n\n文件内容:\n{json.dumps(ocr_result_pages, ensure_ascii=False, indent=4)}"
|
||||
model_res = doubao_model(base_prompt)
|
||||
return model_res
|
||||
|
||||
# 判断pdf中是否有图片, 并输出含有图片的页面列表
|
||||
def has_images(pdf_path):
    """
    Find the pages of a PDF that contain embedded images.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict: Maps each 1-based page number that has at least one image to
        the list returned by ``page.get_images(full=True)`` for that page.
        The dict is empty when no page contains an image.
    """
    pdf_document = fitz.open(pdf_path)
    pages_with_imgs = {}
    try:
        for page_num in range(pdf_document.page_count):
            images = pdf_document.load_page(page_num).get_images(full=True)
            if images:
                # Keys are 1-based so they match human-readable page numbers.
                pages_with_imgs[page_num + 1] = images
    finally:
        # Always release the document handle, even if a page fails to load.
        pdf_document.close()
    return pages_with_imgs
|
||||
|
||||
# 调用通用表格识别对图片中的表格进行提取,返回json形式的表格结构
|
||||
def table_ocr_extract(image_path):
    """
    Run table recognition on a single image file.

    Args:
        image_path: Path of the image to recognize; passed to ``CommonOcr``
            as ``img_path``.

    Returns:
        Whatever ``CommonOcr.recognize()`` returns (the caller in this file
        parses it with ``json.loads``).
    """
    return CommonOcr(img_path=image_path).recognize()
|
||||
|
||||
# def ocr_extract(image_path):
|
||||
# # 调用您的OCR引擎来识别图像中的文本
|
||||
# # return OcrEngine.recognize_text_from_image(image_path)
|
||||
# # 调用本地ocr
|
||||
# return local_ocr.run(image_path)
|
||||
|
||||
# 提取pdf中某一页的所有图片
|
||||
def extract_images_from_page(pdf_path, image_list, page_num):
    """
    Extract the raw bytes of the given images from a PDF.

    Args:
        pdf_path: Path of the PDF file.
        image_list: Image entries whose first element is the image xref.
        page_num: 0-based index of the page the images belong to; stored
            1-based in each result.

    Returns:
        list[dict]: One dict per extracted image with keys ``data`` (bytes),
        ``ext`` (file extension, ``'png'`` when not reported) and
        ``page_num`` (1-based). If extraction fails part-way, the error is
        printed and the images collected so far are returned.
    """
    images = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        for img in image_list:
            xref = img[0]
            base_image = doc.extract_image(xref)
            images.append({
                'data': base_image['image'],
                'ext': base_image.get('ext', 'png'),
                'page_num': page_num + 1,
            })
    except Exception as e:
        # Best effort: report the problem and return what was extracted.
        print(f"提取图片时出错: {e}")
    finally:
        # Release the document handle; it was previously left open.
        if doc is not None:
            doc.close()
    return images
|
||||
def pdf_image2txt(file_path, img_pdf_list):
    """
    Extract a PDF's text, appending OCR'd table text for pages with images.

    For every page the text is extracted with PyPDF2 and cleaned with
    ``clean_page_content``. For pages listed in ``img_pdf_list`` each image
    is written to a temporary file, sent through ``table_ocr_extract`` and,
    when the OCR succeeds, rebuilt into text by ``extract_img_table_text``
    and appended to that page's text. The combined text is written to
    ``extract.txt`` next to the PDF.

    Args:
        file_path: Path of the PDF file.
        img_pdf_list: Mapping of 1-based page number to that page's image
            entries (indexed by page number and tested with ``in``).

    Returns:
        str: Path of the ``extract.txt`` file. It is returned even when
        writing fails; the write error is only printed.
    """
    common_header = extract_common_header(file_path)
    page_texts = []

    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)

            if (page_num + 1) in img_pdf_list:
                print(f"第 {page_num + 1} 页含有图片,开始提取图片并OCR")
                images = extract_images_from_page(file_path, img_pdf_list[page_num + 1], page_num)
                for img in images:
                    # Tracked separately so cleanup never touches an unbound
                    # or stale temp-file object from an earlier iteration.
                    temp_path = None
                    try:
                        with tempfile.NamedTemporaryFile(delete=False, suffix='.' + img['ext']) as temp_image:
                            temp_path = temp_image.name
                            temp_image.write(img['data'])
                        # The file is closed (and flushed) before OCR reads it.
                        ocr_result = json.loads(table_ocr_extract(temp_path))
                        # Success requires code 200 and at least one page.
                        if ocr_result['code'] == 200 and len(ocr_result['result']['pages']) > 0:
                            print("提取成功,图片数据已提取。")
                            ocr_result_pages = ocr_result['result']['pages']
                            table_text = extract_img_table_text(ocr_result_pages)
                            # The model call may yield nothing; skip empty results.
                            if table_text and table_text.strip():
                                cleaned_text += "\n" + table_text
                        else:
                            print("提取失败或没有页面数据。")
                    except Exception as e:
                        # Best effort per image: one failure must not stop the run.
                        print(f"OCR处理失败: {e}")
                    finally:
                        if temp_path is not None:
                            try:
                                os.remove(temp_path)
                            except Exception as e:
                                print(f"删除临时文件失败: {e}")

            page_texts.append(cleaned_text)

    result = "".join(page_texts)

    directory = os.path.dirname(os.path.abspath(file_path))
    output_path = os.path.join(directory, 'extract.txt')
    # Save the combined text next to the source PDF.
    try:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result)
        print(f"提取内容已保存到: {output_path}")
    except IOError as e:
        print(f"写入文件时发生错误: {e}")
    return output_path
|
||||
|
||||
def pdf2txt(file_path):
|
||||
common_header = extract_common_header(file_path)
|
||||
@ -182,32 +58,9 @@ def read_txt_to_string(file_path):
|
||||
except Exception as e:
|
||||
return f"错误:读取文件时发生错误。详细信息:{e}"
|
||||
|
||||
def count_tokens(text):
    """
    Roughly count the tokens in ``text``.

    Counting rules:
        1. A run of ASCII letters/digits is one token (e.g. ``DN90``); a
           directly following ``.digits`` part and/or ``%`` joins the same
           token (e.g. ``0.25%``).
        2. Each Chinese character in U+4E00..U+9FFF is one token.
        3. Any other non-whitespace character is one token.
        4. Whitespace is ignored.

    Args:
        text: The text to measure.

    Returns:
        int: The number of tokens found (only the count is returned).
    """
    token_pattern = r'[a-zA-Z0-9]+(?:\.\d+)?%?|[\u4e00-\u9fff]|[^\s]'
    matcher = re.compile(token_pattern)
    return sum(1 for _ in matcher.finditer(text))
|
||||
|
||||
def get_total_tokens(text):
|
||||
"""
|
||||
调用 API 计算给定文本的总 Token 数量。
|
||||
|
||||
参数:
|
||||
- text (str): 需要计算 Token 的文本。
|
||||
- model (str): 使用的模型名称,默认值为 "ep-20241119121710-425g6"。
|
||||
|
||||
调用 API 计算给定文本的总 Token 数量。 注:doubao的计算方法!与qianwen不一样
|
||||
返回:
|
||||
- int: 文本的 total_tokens 数量。
|
||||
"""
|
||||
@ -241,7 +94,6 @@ def get_total_tokens(text):
|
||||
print(f"获取 Token 数量失败:{e}")
|
||||
return 0
|
||||
|
||||
|
||||
@sleep_and_retry
|
||||
@limits(calls=10, period=1) # 每秒最多调用10次
|
||||
def doubao_model(full_user_query, need_extra=False):
|
||||
@ -356,7 +208,6 @@ def doubao_model(full_user_query, need_extra=False):
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def generate_full_user_query(file_path, prompt_template):
|
||||
"""
|
||||
根据文件路径和提示词模板生成完整的user_query。
|
||||
@ -372,10 +223,8 @@ def generate_full_user_query(file_path, prompt_template):
|
||||
full_text=read_txt_to_string(file_path)
|
||||
# 格式化提示词,将提取的文件内容插入到模板中
|
||||
user_query = prompt_template.format(full_text=full_text)
|
||||
|
||||
return user_query
|
||||
|
||||
#7.文件内容为markdown格式, 表格特殊情况处理:对于表格数据,可能存在原始pdf转换markdown时跨页导致同一个货物名称(或系统名称)分隔在上下两个单元格内,你需要通过上下文语义判断是否合并之后才是完整且正确的货物名称(或系统名称)。
|
||||
if __name__ == "__main__":
|
||||
txt_path = r"output.txt"
|
||||
pdf_path_1 = "D:/bid_generator/task_folder/9a447eb0-24b8-4f51-8164-d91a62edea25/tmp/bid_format.pdf"
|
@ -1,10 +1,8 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import concurrent.futures
|
||||
import json
|
||||
from flask_app.general.doubao import doubao_model
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.通义千问long import qianwen_long, qianwen_long_stream, qianwen_plus
|
||||
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_long, qianwen_long_stream, qianwen_plus
|
||||
|
||||
def generate_continue_query(original_query, original_answer):
|
||||
"""
|
@ -1,19 +1,14 @@
|
||||
# 基于知识库提问的通用模板,
|
||||
# assistant_id
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import queue
|
||||
import concurrent.futures
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from dashscope import Assistants, Messages, Runs, Threads
|
||||
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
|
||||
|
||||
from flask_app.general.doubao import doubao_model
|
||||
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_long, upload_file, qianwen_plus
|
||||
|
||||
prompt = """
|
||||
# 角色
|
@ -3,7 +3,7 @@ import json
|
||||
import re
|
||||
from flask_app.general.format_date import format_chinese_date
|
||||
from flask_app.general.format_amout import format_amount
|
||||
from flask_app.routes.偏离表main import postprocess_technical_table, prepare_for_zige_info, process_functions_in_parallel
|
||||
from flask_app.routes.偏离表数据解析main import postprocess_technical_table, prepare_for_zige_info, process_functions_in_parallel
|
||||
|
||||
|
||||
# 定义一个辅助函数用于获取嵌套字典中的值
|
||||
@ -15,7 +15,6 @@ def get_nested(dic, keys, default=None):
|
||||
return default
|
||||
return dic
|
||||
|
||||
|
||||
def inner_post_processing(base_info):
|
||||
# print(json.dumps(base_info,ensure_ascii=False,indent=4))
|
||||
"""
|
||||
@ -239,16 +238,10 @@ def inner_post_processing(base_info):
|
||||
|
||||
return extracted_info
|
||||
|
||||
|
||||
def outer_post_processing(combined_data, includes, good_list):
|
||||
"""
|
||||
外层处理函数,调用内层 post_processing 处理 '基础信息',并构建 processed_data。
|
||||
额外提取 '采购要求' 下的 '技术要求' 内容。
|
||||
|
||||
参数:
|
||||
combined_data (dict): 原始合并数据。
|
||||
includes (list): 需要包含的键列表。
|
||||
|
||||
外层处理函数,调用内层 inner_post_processing 处理 '基础信息',并构建 processed_data。
|
||||
额外提取 '采购要求' 下的 '技术要求' 内容以及各块的信息 =》生成商务、技术偏离表所需信息
|
||||
返回:
|
||||
tuple: (processed_data, extracted_info, procurement_reqs)
|
||||
"""
|
||||
@ -272,7 +265,7 @@ def outer_post_processing(combined_data, includes, good_list):
|
||||
# 检查 '基础信息' 是否在 includes 中
|
||||
if "基础信息" in includes:
|
||||
base_info = combined_data.get("基础信息", {})
|
||||
# 调用内层 post_processing 处理 '基础信息'
|
||||
# 调用内层 inner_post_processing 处理 '基础信息'
|
||||
extracted_info = inner_post_processing(base_info)
|
||||
# 将 '基础信息' 保留在处理后的数据中
|
||||
processed_data["基础信息"] = base_info
|
||||
|
@ -1,10 +1,6 @@
|
||||
import os
|
||||
|
||||
from docx import Document
|
||||
import json
|
||||
|
||||
|
||||
|
||||
def read_tables_from_docx(file_path):
|
||||
"""读取DOCX文件中的表格数据,并以嵌套字典的形式返回."""
|
||||
doc = Document(file_path)
|
||||
|
@ -3,12 +3,12 @@ import os
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict
|
||||
from flask_app.general.doubao import read_txt_to_string
|
||||
from flask_app.general.llm.doubao import read_txt_to_string
|
||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||
from flask_app.general.format_change import get_pdf_page_count, pdf2docx
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
from flask_app.general.model_continue_query import process_continue_answers
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long, qianwen_plus, qianwen_long_stream
|
||||
from flask_app.general.llm.model_continue_query import process_continue_answers
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long, qianwen_plus, qianwen_long_stream
|
||||
|
||||
|
||||
def remove_unknown_scores(data):
|
||||
@ -455,7 +455,7 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type):
|
||||
user_query=generate_prompt(zb_type)
|
||||
if model_type==4:
|
||||
full_text = read_txt_to_string(processed_filepath)
|
||||
user_query += f"\n文件内容:\n{full_text}\n"
|
||||
user_query = f"文本内容:\n{full_text}\n" + user_query
|
||||
questions_to_continue = []
|
||||
temp_final={}
|
||||
if model_type==4:
|
||||
|
@ -114,7 +114,6 @@ def check_pdf_pages(pdf_path, mode, logger):
|
||||
else:
|
||||
return True, ['', '', '', '', '', pdf_path, '']
|
||||
|
||||
|
||||
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
|
||||
try:
|
||||
if start_page is None or end_page is None:
|
||||
@ -237,14 +236,12 @@ def is_pdf_or_doc(filename):
|
||||
# 判断文件是否为PDF或Word文档
|
||||
return filename.lower().endswith(('.pdf', '.doc', '.docx'))
|
||||
|
||||
|
||||
def convert_to_pdf(file_path):
    """
    Return a PDF path for ``file_path``, converting Word documents first.

    Paths ending in ``.doc`` or ``.docx`` (case-insensitive) are passed to
    ``docx2pdf`` and its result is returned; any other path is returned
    unchanged.
    """
    is_word_document = file_path.lower().endswith(('.doc', '.docx'))
    if not is_word_document:
        return file_path
    return docx2pdf(file_path)
|
||||
|
||||
|
||||
def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
||||
"""
|
||||
从指定的PDF文件中提取无效部分并保存到输出文件夹中。
|
||||
|
@ -2,8 +2,8 @@
|
||||
import json
|
||||
import re
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.model_continue_query import process_continue_answers
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long_stream
|
||||
from flask_app.general.llm.model_continue_query import process_continue_answers
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long_stream
|
||||
|
||||
#提取两个大标题之间的内容
|
||||
def extract_between_sections(data, target_values,flag=False):
|
||||
|
@ -4,10 +4,8 @@ import re
|
||||
import regex
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.general.doubao import generate_full_user_query
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
from flask_app.general.insert_del_pagemark import insert_mark
|
||||
from flask_app.general.通义千问long import qianwen_plus
|
||||
from flask_app.general.llm.doubao import generate_full_user_query
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_plus
|
||||
from flask_app.general.通用功能函数 import process_string_list
|
||||
from collections import OrderedDict
|
||||
from docx import Document
|
||||
@ -526,7 +524,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
|
||||
r'其\s*他.*?情\s*形\s*[::]',
|
||||
r'包\s*括'
|
||||
]
|
||||
|
||||
doc_contents = extract_file_elements(file_path)
|
||||
processed_paragraphs = preprocess_paragraphs(doc_contents)
|
||||
extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
|
||||
@ -587,7 +584,6 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
|
||||
print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
|
||||
return {result_key: [f"未解析到'{result_key}'!"]}
|
||||
|
||||
|
||||
def combine_find_invalid(invalid_docpath, output_dir):
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
queries = [
|
||||
|
@ -1,11 +1,10 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import ast
|
||||
import logging
|
||||
import re
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.工程标.判断是否分包等 import read_questions_from_judge
|
||||
from flask_app.general.llm.多线程提问 import multi_threading
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
from flask_app.old_version.判断是否分包等_old import read_questions_from_judge
|
||||
|
||||
def get_deviation_requirements(invalid_path):
|
||||
file_id=upload_file(invalid_path)
|
||||
@ -240,6 +239,26 @@ def process_string_list(string_list):
|
||||
print(f"Error occurred: {e}")
|
||||
return [] # 出现任何异常时返回空列表
|
||||
|
||||
#统计tokens,非常粗糙;现在直接调用qianwen doubao的相关接口直接统计,该函数废弃。
|
||||
def count_tokens(text):
    """
    Roughly count the tokens in ``text``.

    Counting rules:
        1. A run of ASCII letters/digits is one token (e.g. ``DN90``); a
           directly following ``.digits`` part and/or ``%`` joins the same
           token (e.g. ``0.25%``).
        2. Each Chinese character in U+4E00..U+9FFF is one token.
        3. Any other non-whitespace character is one token.
        4. Whitespace is ignored.

    Args:
        text: The text to measure.

    Returns:
        int: The number of tokens found (only the count is returned).
    """
    token_pattern = r'[a-zA-Z0-9]+(?:\.\d+)?%?|[\u4e00-\u9fff]|[^\s]'
    matcher = re.compile(token_pattern)
    return sum(1 for _ in matcher.finditer(text))
|
||||
|
||||
#根据id获取日志
|
||||
def get_global_logger(unique_id):
|
||||
if unique_id is None:
|
||||
return logging.getLogger() # 获取默认的日志器
|
||||
|
123
flask_app/old_version/extarct_img_2_txt_old.py
Normal file
123
flask_app/old_version/extarct_img_2_txt_old.py
Normal file
@ -0,0 +1,123 @@
|
||||
import json
|
||||
import os
|
||||
import fitz
|
||||
import PyPDF2
|
||||
import tempfile
|
||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||||
from flask_app.general.llm.doubao import doubao_model
|
||||
from flask_app.old_version.table_ocr_old import CommonOcr
|
||||
|
||||
def extract_img_table_text(ocr_result_pages):
|
||||
# 调用豆包对json形式的表格数据进行重构
|
||||
# print(ocr_result_pages)
|
||||
base_prompt = '''
|
||||
任务:你负责解析以json形式输入的表格信息,根据提供的文件内容来恢复表格信息并输出,不要遗漏任何一个文字。
|
||||
|
||||
要求与指南:
|
||||
1. 请运用文档表格理解能力,根据文件内容定位表格的每列和每行,列与列之间用丨分隔,若某列或者某行没有信息则用/填充。
|
||||
2. 请不要遗漏任何一个文字,同时不要打乱行与行之间的顺序,也不要打乱列与列之间的顺序,严格按照文字的位置信息来恢复表格信息。
|
||||
示例输出:
|
||||
表格标题:
|
||||
|序号|名称|数量|单位|单价(元)|总价(元)|技术参数|备注|
|
||||
|形象展示区|
|
||||
|1|公园主E题雕塑|1|套|/|/|根据江夏体育与文化元素定制|/|
|
||||
|
||||
..........
|
||||
'''
|
||||
base_prompt += f"\n\n文件内容:\n{json.dumps(ocr_result_pages, ensure_ascii=False, indent=4)}"
|
||||
model_res = doubao_model(base_prompt)
|
||||
return model_res
|
||||
|
||||
# 判断pdf中是否有图片, 并输出含有图片的页面列表
|
||||
def has_images(pdf_path):
    """
    Find the pages of a PDF that contain embedded images.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        dict: Maps each 1-based page number that has at least one image to
        the list returned by ``page.get_images(full=True)`` for that page.
        The dict is empty when no page contains an image.
    """
    pdf_document = fitz.open(pdf_path)
    pages_with_imgs = {}
    try:
        for page_num in range(pdf_document.page_count):
            images = pdf_document.load_page(page_num).get_images(full=True)
            if images:
                # Keys are 1-based so they match human-readable page numbers.
                pages_with_imgs[page_num + 1] = images
    finally:
        # Always release the document handle, even if a page fails to load.
        pdf_document.close()
    return pages_with_imgs
|
||||
|
||||
# 调用通用表格识别对图片中的表格进行提取,返回json形式的表格结构
|
||||
def table_ocr_extract(image_path):
    """Run the generic table OCR on one image file.

    Args:
        image_path: Path of the image handed to ``CommonOcr``.

    Returns:
        The value returned by ``CommonOcr.recognize()``.
    """
    return CommonOcr(img_path=image_path).recognize()
|
||||
|
||||
# 提取pdf中某一页的所有图片
|
||||
def extract_images_from_page(pdf_path, image_list, page_num):
    """Extract the raw bytes of the given images from a PDF.

    Args:
        pdf_path: Path of the PDF file.
        image_list: Image descriptors whose first item is the image xref
            (the entries produced by ``page.get_images(full=True)``).
        page_num: Zero-based page index; stored as ``page_num + 1`` in each
            result entry.

    Returns:
        list[dict]: One dict per extracted image with keys ``data`` (bytes),
        ``ext`` (file extension, ``'png'`` when not reported) and
        ``page_num``. On any error the message is printed and the images
        extracted so far are returned.
    """
    images = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        for img in image_list:
            xref = img[0]
            base_image = doc.extract_image(xref)
            images.append({
                'data': base_image['image'],
                'ext': base_image.get('ext', 'png'),
                'page_num': page_num + 1,
            })
    except Exception as e:
        # Best-effort extraction: report and return what was collected.
        print(f"提取图片时出错: {e}")
    finally:
        # The original left the document open; release it explicitly.
        if doc is not None:
            doc.close()
    return images
|
||||
def pdf_image2txt(file_path, img_pdf_list):
    """Extract PDF text and append OCR-recovered table text for image pages.

    For every page the text is extracted with PyPDF2 and cleaned with
    ``clean_page_content``. If the 1-based page number is a key of
    ``img_pdf_list``, each image of that page is written to a temporary
    file, sent through ``table_ocr_extract`` and, when the OCR reports
    success, rebuilt into table text with ``extract_img_table_text`` and
    appended to the page text. The concatenated result is written to
    ``extract.txt`` next to the input file.

    Args:
        file_path: Path of the PDF file.
        img_pdf_list: Mapping of 1-based page number to that page's image
            list (the shape returned by ``has_images``).

    Returns:
        str: Path of the ``extract.txt`` output file (returned even when
        writing failed; the failure is only printed).
    """
    common_header = extract_common_header(file_path)
    page_texts = []
    # The original also opened the file with fitz here but never used or
    # closed that handle; the leaked open has been removed.
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
            if (page_num + 1) in img_pdf_list:
                print(f"第 {page_num + 1} 页含有图片,开始提取图片并OCR")
                images = extract_images_from_page(file_path, img_pdf_list[page_num + 1], page_num)
                for img in images:
                    # Track the path separately so cleanup never touches an
                    # unbound name when temp-file creation itself fails.
                    temp_path = None
                    try:
                        with tempfile.NamedTemporaryFile(delete=False, suffix='.' + img['ext']) as temp_image:
                            temp_path = temp_image.name
                            temp_image.write(img['data'])
                            temp_image.flush()
                            # Run the OCR on the temporary image file.
                            ocr_result = json.loads(table_ocr_extract(temp_path))
                            # Success requires code 200 and at least one page.
                            if ocr_result['code'] == 200 and len(ocr_result['result']['pages']) > 0:
                                print("提取成功,图片数据已提取。")
                                ocr_result_pages = ocr_result['result']['pages']
                                table_text = extract_img_table_text(ocr_result_pages)
                                if table_text.strip():
                                    cleaned_text += "\n" + table_text
                            else:
                                print("提取失败或没有页面数据。")
                    except Exception as e:
                        print(f"OCR处理失败: {e}")
                    finally:
                        if temp_path is not None:
                            try:
                                os.remove(temp_path)
                            except OSError as e:
                                print(f"删除临时文件失败: {e}")
            page_texts.append(cleaned_text)
    result = "".join(page_texts)

    directory = os.path.dirname(os.path.abspath(file_path))
    output_path = os.path.join(directory, 'extract.txt')
    # Persist the combined text to extract.txt.
    try:
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(result)
        print(f"提取内容已保存到: {output_path}")
    except IOError as e:
        print(f"写入文件时发生错误: {e}")
    return output_path
|
@ -1,7 +1,5 @@
|
||||
import os
|
||||
|
||||
import requests
|
||||
import json
|
||||
|
||||
def get_file_content(filePath):
|
||||
with open(filePath, 'rb') as fp:
|
||||
@ -27,7 +25,6 @@ class CommonOcr(object):
|
||||
except Exception as e:
|
||||
return str(e)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
png_path = "test.png"
|
||||
ocr = CommonOcr(img_path=png_path) # 将路径传入构造函数
|
@ -4,7 +4,7 @@ import re
|
||||
|
||||
from PyPDF2 import PdfWriter, PdfReader
|
||||
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
from flask_app.general.通用功能函数 import process_string_list
|
||||
|
||||
|
||||
|
@ -2,10 +2,10 @@
|
||||
import json
|
||||
import os.path
|
||||
import re
|
||||
from flask_app.general.json_utils import extract_content_from_json, clean_json_string # 可以选择性地导入特定的函数
|
||||
from flask_app.工程标.提取打勾符号 import read_pdf_and_judge_main
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.通义千问long import qianwen_long,upload_file
|
||||
from flask_app.general.json_utils import extract_content_from_json # 可以选择性地导入特定的函数
|
||||
from flask_app.old_version.提取打勾符号_old import read_pdf_and_judge_main
|
||||
from flask_app.general.llm.多线程提问 import multi_threading
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_long,upload_file
|
||||
#调用qianwen-ask之后,组织提示词问百炼。
|
||||
|
||||
def construct_judge_questions(json_data):
|
@ -3,7 +3,7 @@ import json
|
||||
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.商务技术评分提取 import combine_technical_and_business
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
|
||||
def combine_evaluation_standards(evaluation_method):
|
||||
# 商务标、技术标评分项:千问
|
||||
|
@ -1,10 +1,10 @@
|
||||
import json
|
||||
|
||||
from flask_app.general.json_utils import clean_json_string, add_outer_key
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||
from flask_app.工程标.判断是否分包等 import judge_whether_main, read_questions_from_judge
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.old_version.判断是否分包等_old import judge_whether_main, read_questions_from_judge
|
||||
from flask_app.general.llm.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file
|
||||
from flask_app.general.通用功能函数 import judge_consortium_bidding
|
||||
|
||||
def aggregate_basic_info_engineering(baseinfo_list):
|
||||
|
@ -3,7 +3,7 @@ import re
|
||||
import json
|
||||
import time
|
||||
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.llm.多线程提问 import multi_threading
|
||||
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main
|
||||
|
@ -1,6 +1,5 @@
|
||||
import re
|
||||
import PyPDF2
|
||||
import json
|
||||
|
||||
def extract_key_value_pairs(text):
|
||||
# 更新正则表达式来包括对"团"的处理和行尾斜线
|
||||
@ -53,7 +52,6 @@ def extract_key_value_pairs(text):
|
||||
results[key] = "无" # 为键赋值"无"
|
||||
return results
|
||||
|
||||
|
||||
def read_pdf_and_judge_main(file_path, output_txt_path):
|
||||
with open(file_path, 'rb') as file:
|
||||
reader = PyPDF2.PdfReader(file)
|
||||
@ -80,9 +78,6 @@ def read_pdf_and_judge_main(file_path, output_txt_path):
|
||||
|
||||
print(f"da_gou signal: Data extraction complete and saved to '{output_txt_path}'.")
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 示例调用
|
||||
file_path ="C:\\Users\Administrator\\Desktop\\fsdownload\\ee2d8828-bae0-465a-9171-7b2dd7453251\\ztbfile_tobidders_notice_table.pdf"
|
@ -4,8 +4,8 @@ import re
|
||||
import regex
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.general.doubao import generate_full_user_query
|
||||
from flask_app.general.通义千问long import qianwen_plus
|
||||
from flask_app.general.llm.doubao import generate_full_user_query
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_plus
|
||||
from flask_app.general.通用功能函数 import process_string_list
|
||||
from collections import OrderedDict
|
||||
from docx import Document
|
||||
|
@ -5,7 +5,7 @@ import time
|
||||
import re
|
||||
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
|
@ -4,7 +4,7 @@ import time
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.商务技术评分提取 import combine_technical_and_business, \
|
||||
process_data_based_on_key, reorganize_data
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
|
||||
def combine_evaluation_standards(truncate_file):
|
||||
# 定义默认的评审结果字典
|
||||
|
@ -4,7 +4,7 @@ from flask_app.old_version.提取json工程标版_old import convert_clause_to_j
|
||||
from flask_app.general.json_utils import extract_content_from_json
|
||||
from flask_app.old_version.形式响应评审old import process_reviews
|
||||
from flask_app.old_version.资格评审old_old import process_qualification
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
|
||||
|
@ -3,8 +3,8 @@
|
||||
import json
|
||||
import re
|
||||
from flask_app.general.json_utils import clean_json_string, combine_json_results, add_keys_to_json
|
||||
from flask_app.general.多线程提问 import multi_threading, read_questions_from_file
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.llm.多线程提问 import multi_threading, read_questions_from_file
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file
|
||||
|
||||
|
||||
def merge_dictionaries_under_common_key(dicts, common_key):
|
||||
|
@ -4,7 +4,7 @@ import json
|
||||
from flask import Blueprint, Response, g
|
||||
import os
|
||||
from flask_app.general.format_change import download_file
|
||||
from flask_app.routes.偏离表main import get_tech_and_business_deviation
|
||||
from flask_app.routes.偏离表数据解析main import get_tech_and_business_deviation
|
||||
from flask_app.routes.utils import generate_deviation_response, validate_and_setup_logger, create_response, sse_format, \
|
||||
log_error_unique_id
|
||||
from flask_app.ConnectionLimiter import require_connection_limit
|
||||
|
@ -5,8 +5,8 @@ from functools import wraps
|
||||
|
||||
from flask import request, jsonify, current_app, g
|
||||
|
||||
from flask_app.general.清除file_id import read_file_ids, delete_file_by_ids
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.清除file_id import read_file_ids, delete_file_by_ids
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
from flask_app.logger_setup import create_logger
|
||||
|
||||
|
||||
|
@ -7,7 +7,7 @@ from copy import deepcopy
|
||||
from flask_app.general.format_change import docx2pdf,doc2docx
|
||||
from flask_app.general.json_utils import clean_json_string, rename_outer_key
|
||||
from flask_app.general.merge_pdfs import merge_pdfs
|
||||
from flask_app.general.通义千问long import qianwen_plus
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_plus
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
from flask_app.general.截取pdf_main import truncate_pdf_multiple
|
||||
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
|
||||
@ -60,6 +60,7 @@ def prepare_for_zige_info(zige_review):
|
||||
return "", "", ""
|
||||
def extract_zige_deviation_table(zige_info, fuhe_info, zigefuhe_info):
|
||||
prompt_template1 = """
|
||||
文本内容:{full_text}
|
||||
任务:给出一份文本,根据文本提取资格性检查的具体评审标准。
|
||||
输出要求:
|
||||
1.以json格式返回结果,不要输出其他内容。
|
||||
@ -67,11 +68,9 @@ def extract_zige_deviation_table(zige_info, fuhe_info, zigefuhe_info):
|
||||
要求与指南:
|
||||
1. 评审标准是具体的内容,不要返回诸如'本项目的特定资格要求:'这种标题性质且不能体现具体评审标准的内容。
|
||||
2. 若文本中存在相同或相似的表述,仅需取其中一个作为键值中的一条即可。
|
||||
|
||||
文本内容:{full_text}
|
||||
"""
|
||||
|
||||
prompt_template2 = """
|
||||
文本内容:{full_text}
|
||||
任务:给出一份文本,根据文本提取符合性检查的具体评审标准。
|
||||
输出要求:
|
||||
1.以json格式返回结果,不要输出其他内容。
|
||||
@ -96,10 +95,9 @@ def extract_zige_deviation_table(zige_info, fuhe_info, zigefuhe_info):
|
||||
...
|
||||
]
|
||||
}}
|
||||
|
||||
文本内容:{full_text}
|
||||
"""
|
||||
prompt_template3 = """
|
||||
文本内容:{full_text}
|
||||
任务:给出一份文本,根据文本提取资格性检查和符合性检查的具体评审标准。
|
||||
输出要求:
|
||||
1.以json格式返回结果,不要输出其他内容。
|
||||
@ -107,10 +105,7 @@ def extract_zige_deviation_table(zige_info, fuhe_info, zigefuhe_info):
|
||||
要求与指南:
|
||||
1. 评审标准应该是具体的内容,不要返回诸如'本项目的特定符合性要求:'这种标题性质且不能体现具体评审标准的内容。
|
||||
2. 若文本中存在相同或相似的表述,仅需取其中一个作为键值中的一条即可。
|
||||
|
||||
文本内容:{full_text}
|
||||
"""
|
||||
|
||||
def get_model_response(query):
|
||||
return qianwen_plus(query)
|
||||
|
||||
@ -276,7 +271,9 @@ def extract_business_deviation(busi_requirements_dict):
|
||||
renamed_requirements = rename_outer_keys(updated_requirements)
|
||||
business_requirements_string = json.dumps(renamed_requirements, ensure_ascii=False, indent=4)
|
||||
# print(business_requirements_string)
|
||||
prompt_template1 = """以下文本是项目采购需求的商务要求部分,请帮我将信息重新组织,键名为'商务要求',键值为字符串列表,其中每个字符串为一条商务要求,保留三角▲、五角星★(若有),但是去除开头的序号(若有)。
|
||||
prompt_template1 = """
|
||||
文本内容:{full_text}
|
||||
以上文本是项目采购需求的商务要求部分,请帮我将信息重新组织,键名为'商务要求',键值为字符串列表,其中每个字符串为一条商务要求,保留三角▲、五角星★(若有),但是去除开头的序号(若有)。
|
||||
**角色**
|
||||
你是一个专业的招投标业务专家,擅长从招标文件中总结商务要求的部分,并逐条列出,作为编写商务要求偏离表的前置准备。
|
||||
|
||||
@ -333,11 +330,11 @@ def extract_business_deviation(busi_requirements_dict):
|
||||
"因投标人自身原因造成漏报、少报皆由其自行承担责任,采购人不再补偿。"
|
||||
]
|
||||
}}
|
||||
|
||||
文本内容:{full_text}
|
||||
"""
|
||||
user_query1 = prompt_template1.format(full_text=business_requirements_string)
|
||||
prompt_template2 = """以下文本是项目采购需求的商务要求部分。请从中提取以★、▲或其他特殊符号开头的要求项,它们一般是重要的商务要求,需要额外响应。返回结果应仅包含一个键名“重要商务要求”,其键值为字符串列表,每个字符串对应一个以★、▲或特殊符号开头的要求项,但是去除开头的序号(若有)。
|
||||
prompt_template2 = """
|
||||
文本内容:{full_text}
|
||||
以上文本是项目采购需求的商务要求部分。请从中提取以★、▲或其他特殊符号开头的要求项,它们一般是重要的商务要求,需要额外响应。返回结果应仅包含一个键名“重要商务要求”,其键值为字符串列表,每个字符串对应一个以★、▲或特殊符号开头的要求项,但是去除开头的序号(若有)。
|
||||
|
||||
**要求与指南**:
|
||||
1. 每个以★、▲或其他特殊符号开头的要求项应作为单独的字符串。
|
||||
@ -391,8 +388,6 @@ def extract_business_deviation(busi_requirements_dict):
|
||||
"★交货地点:采购人指定地点"
|
||||
]
|
||||
}}
|
||||
|
||||
文本内容:{full_text}
|
||||
"""
|
||||
user_query2 = prompt_template2.format(full_text=business_requirements_string)
|
||||
queries = [user_query1, user_query2]
|
||||
@ -416,10 +411,8 @@ def extract_business_deviation(busi_requirements_dict):
|
||||
business_req_deviation = results[0] if results[0] is not None else default_return[0]
|
||||
business_star_req_deviation = results[1] if results[1] is not None else default_return[1]
|
||||
business_star_req_deviation=rename_outer_key(business_star_req_deviation,"商务要求带星")
|
||||
|
||||
return business_req_deviation, business_star_req_deviation
|
||||
|
||||
|
||||
def get_tech_star_deviation(tech_requirements_dict):
|
||||
def get_tech_star_deviation_directly(tech_dict):
|
||||
"""
|
||||
@ -474,7 +467,9 @@ def get_tech_star_deviation(tech_requirements_dict):
|
||||
if not tech_dict:
|
||||
return {}
|
||||
tech_string = json.dumps(tech_dict, ensure_ascii=False, indent=4)
|
||||
prompt_template = """以下输入文本包含采购标的的技术参数要求或采购要求。请从每个键对应的字符串列表中提取带有星★或三角▲或其他特殊符号开头的的要求项。返回结果仅为符合要求的 JSON 格式对象,每个键名保持不变,键值为包含该键名下的带星或带三角或以其他特殊符合开头的要求项的字符串列表。
|
||||
prompt_template = """
|
||||
文本内容:{full_text}
|
||||
以上输入文本包含采购标的的技术参数要求或采购要求。请从每个键对应的字符串列表中提取带有星★或三角▲或其他特殊符号开头的的要求项。返回结果仅为符合要求的 JSON 格式对象,每个键名保持不变,键值为包含该键名下的带星或带三角或以其他特殊符合开头的要求项的字符串列表。
|
||||
要求与指南:
|
||||
1. 仅保留符合条件的键值对:如果某键名下没有任何以星号(★)、三角符号(▲)或其他特殊符号开头的要求项,则该键名及其对应内容不包含在输出结果中。
|
||||
2. 逐条提取:每个以星号(★)、三角符号(▲)或其他特殊符号开头的要求项,均作为独立的字符串保存在对应键的值列表中。
|
||||
@ -520,8 +515,6 @@ def get_tech_star_deviation(tech_requirements_dict):
|
||||
}}
|
||||
### 对应的输出如下:
|
||||
{{}}
|
||||
|
||||
输入文本内容:{full_text}
|
||||
"""
|
||||
user_query = prompt_template.format(full_text=tech_string)
|
||||
# 调用模型接口,假设qianwen_plus是已定义的模型调用函数
|
||||
@ -538,7 +531,9 @@ def get_tech_star_deviation(tech_requirements_dict):
|
||||
return get_tech_star_deviation_model(tech_requirements_dict)
|
||||
|
||||
def get_proof_materials(all_data_info):
|
||||
prompt_template = """以下文本是从招标文件中摘取的资格审查、采购需求、商务条款、技术评分相关内容。请根据这些内容,提取并列出投标人需要提交的证明材料。
|
||||
prompt_template = """
|
||||
文本内容:{full_text}
|
||||
以上文本是从招标文件中摘取的资格审查、采购需求、商务条款、技术评分相关内容。请根据这些内容,提取并列出投标人需要提交的证明材料。
|
||||
格式要求:
|
||||
请以 JSON 格式返回结果:
|
||||
- 键名为 '证明材料'。
|
||||
@ -560,8 +555,6 @@ def get_proof_materials(all_data_info):
|
||||
"发射器:外壳需有正规厂家世标认证"
|
||||
]
|
||||
}}
|
||||
|
||||
输入文本:{full_text}
|
||||
"""
|
||||
user_query=prompt_template.format(full_text=all_data_info)
|
||||
# print(user_query)
|
@ -1,10 +1,7 @@
|
||||
import time
|
||||
import multiprocessing
|
||||
from concurrent.futures import ThreadPoolExecutor, TimeoutError
|
||||
from queue import Queue
|
||||
from PyPDF2 import PdfReader # 确保已安装 PyPDF2: pip install PyPDF2
|
||||
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
|
||||
|
||||
def judge_zbfile_exec(file_path):
|
||||
|
@ -5,8 +5,8 @@ import time
|
||||
|
||||
from flask_app.general.format_change import docx2pdf
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.llm.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file
|
||||
from flask_app.general.通用功能函数 import get_global_logger,aggregate_basic_info
|
||||
from flask_app.general.截取pdf_main import truncate_pdf_multiple
|
||||
from flask_app.general.post_processing import inner_post_processing
|
||||
|
@ -271,7 +271,6 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
# 无法判断用户上传的是否为乱码文件,可以考虑并行调用大模型,如果为乱码文件直接return None
|
||||
# 国道107 在提取成json文件时,有'湖北众恒永业工程项目管理有限公司广水分公司编'干扰,尝试清除
|
||||
|
||||
#截取json文件有些问题:C:\Users\Administrator\Desktop\新建文件夹 (3)\test keywords和special...
|
||||
if __name__ == "__main__":
|
||||
# 配置日志器
|
||||
unique_id = "uuidzyzy11"
|
||||
|
@ -1,13 +1,13 @@
|
||||
# flask_app/start_up.py
|
||||
|
||||
from flask import Flask, request, g
|
||||
from flask import Flask, g
|
||||
from flask_app.ConnectionLimiter import ConnectionLimiter
|
||||
from flask_app.logger_setup import create_logger_main
|
||||
from flask_app.routes.get_deviation import get_deviation_bp
|
||||
from flask_app.routes.little_zbparse import little_zbparse_bp
|
||||
from flask_app.routes.upload import upload_bp
|
||||
from flask_app.routes.test_zbparse import test_zbparse_bp
|
||||
from flask_app.general.清除file_id import delete_file_by_ids,read_file_ids
|
||||
from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids
|
||||
from flask_app.routes.judge_zbfile import judge_zbfile_bp
|
||||
class FlaskAppWithLimiter(Flask):
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
@ -64,5 +64,3 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import concurrent.futures
|
||||
|
||||
from flask_app.general.doubao import doubao_model
|
||||
from flask_app.general.llm.doubao import doubao_model
|
||||
|
||||
|
||||
# 多线程压力测试
|
||||
|
@ -1,5 +1,5 @@
|
||||
import concurrent.futures
|
||||
from flask_app.general.通义千问long import qianwen_long, upload_file
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_long, upload_file
|
||||
|
||||
|
||||
def multi_threaded_calls(file_id, user_query, num_threads=1):
|
||||
|
@ -1,662 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import regex
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.general.doubao import generate_full_user_query
|
||||
from flask_app.general.insert_del_pagemark import insert_mark
|
||||
from flask_app.general.通义千问long import qianwen_plus
|
||||
from flask_app.general.通用功能函数 import process_string_list
|
||||
from collections import OrderedDict
|
||||
from docx import Document
|
||||
|
||||
|
||||
def clean_dict_datas(extracted_contents, keywords, excludes):
    """Normalise regex-extracted paragraphs into two index-keyed dicts.

    Entries whose list holds exactly one paragraph are trimmed to the text
    up to the end of the first sentence containing ``keywords`` and go to the
    first dict (kept only when longer than 8 characters after removing
    spaces). Entries with several paragraphs are joined with newlines and go
    to the second dict. Both dicts share one running index so the original
    order can be restored by the caller.

    Args:
        extracted_contents: Mapping of key -> list of paragraph strings.
            It is not modified (the original overwrote ``text_list[0]``).
        keywords: Regex pattern (string) searched inside single paragraphs.
        excludes: Iterable of substrings; a single paragraph containing any
            of them is dropped.

    Returns:
        tuple[dict, dict]: ``(all_text1, all_text2)`` as described above.
    """
    all_text1 = {}  # results for single-paragraph entries
    all_text2 = {}  # results for multi-paragraph entries
    # Split right after a sentence-ending punctuation mark.
    split_pattern = r'(?<=[。!?\!\?])'
    # Leading numbering such as 1 | (1) | 1. | 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
    clean_pattern = (r'^\s*(?:[((]\s*\d+\s*[)))]|'
                     r'[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]+|'
                     r'[一二三四五六七八九十]+、|'
                     r'[A-Za-z][))\.、.]?\s*)')
    idx = 0  # index shared by both result dicts
    for text_list in extracted_contents.values():
        if not text_list:
            # An empty list previously raised IndexError; there is nothing to keep.
            continue
        if len(text_list) == 1:
            data = text_list[0]
            if any(exclude in data for exclude in excludes):
                continue
            data = re.sub(clean_pattern, '', data).strip()
            keyword_match = re.search(keywords, data)
            cleaned_text = data  # default: no keyword or no sentence found
            if keyword_match:
                start_pos = keyword_match.start()
                # Keep everything before the keyword plus the first sentence
                # that starts at the keyword (punctuation retained).
                sentences = re.split(split_pattern, data[start_pos:], 1)
                if sentences and sentences[0]:
                    cleaned_text = data[:start_pos] + sentences[0]
            # NOTE(review): the original chained two identical-looking
            # replace(' ', '') calls; the second may have targeted a
            # different whitespace character lost in transit — confirm.
            cleaned_text_no_spaces = cleaned_text.replace(' ', '')
            if len(cleaned_text_no_spaces) > 8:
                all_text1[idx] = cleaned_text_no_spaces
                idx += 1
        else:
            # Only the first paragraph loses its numbering; build a new list
            # instead of writing back into the caller's data.
            first = re.sub(clean_pattern, '', text_list[0]).strip()
            joined_text = "\n".join([first, *text_list[1:]])
            all_text2[idx] = joined_text.replace(' ', '')
            idx += 1

    return all_text1, all_text2
|
||||
|
||||
#处理跨页的段落
|
||||
def preprocess_paragraphs(elements):
    """Flatten document elements into text, merging paragraphs split by page marks.

    Args:
        elements: Sequence whose items are either plain strings (copied to
            the output unchanged) or objects exposing a ``.text`` attribute.
            Objects without ``.text`` are skipped.

    Returns:
        list[str]: Processed paragraph texts. Page markers of the form
        ``$$index_mark_N$$`` are removed; when the paragraph before a marker
        is longer than 30 characters, does not end with sentence punctuation
        and the paragraph after it is not a list item, the two are merged.
        Short (< 8 chars) Chinese-numbered headings are dropped.
    """
    processed = []
    index = 0
    flag = False  # True right after a numeric-header paragraph was appended

    # Short headings such as "一、" or "(一)" that are dropped when < 8 chars.
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')

    # Paragraphs that start like a list item / section number.
    list_item_pattern = re.compile(
        r'^\s*('
        r'[(\(]\d+[)\)]|'  # (1)
        r'[A-Za-z]\.\s*|'  # A. or b.
        r'[一二三四五六七八九十]+、|'  # 一、二、三、
        r'第[一二三四五六七八九十百零]+[章节部分节]|'  # 第x章 / 第x部分 / 第x节
        r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\)]?|'  # A1.2 etc.
        r'\d+(?:\.\d+)+[\s\.、.)\)]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|'  # 1.1, 1.1.1
        r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|'  # digits + space, not a unit word
        r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])'  # digits + 、 or .
        r')'
    )

    # Paragraphs that begin with a numeric section number.
    pattern_numeric_header = re.compile(
        r'^(?<![a-zA-Z((])(\d+(?:\.\d+)+)\s*(.*)'  # e.g. '12.1 content'
    )
    pattern_numeric_header_fallback = re.compile(
        r'^(\d+\.)\s*(.+)$'  # e.g. '12. content'
    )

    def has_long_spaces(text, max_space_count=5):
        # A long whitespace run suggests a table or fill-in blank; such
        # paragraphs are never merged.
        return any(len(space) > max_space_count for space in re.findall(r'\s+', text))

    # Page marker inserted upstream.
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    def find_prev_text(current_index):
        # Nearest preceding non-empty, non-marker paragraph; gives up when a
        # plain-string element is reached.
        for i in range(current_index - 1, -1, -1):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue
            if text and not pattern_marker.search(text):
                return text, i
        return '', -1

    def find_next_text(current_index):
        # Nearest following paragraph that is non-empty, not a marker and not
        # a short Chinese-numbered heading; gives up at a plain string.
        for i in range(current_index + 1, len(elements)):
            if isinstance(elements[i], str):
                return '', -1
            try:
                text = elements[i].text.strip()
            except AttributeError:
                continue
            if not text or pattern_marker.search(text):
                continue
            if (pattern_numbered.match(text) or pattern_parentheses.match(text)) and len(text) < 8:
                continue
            return text, i
        return '', -1

    while index < len(elements):
        if isinstance(elements[index], str):
            processed.append(elements[index])
            index += 1
            continue
        try:
            current_text = elements[index].text.strip()
        except AttributeError:
            index += 1
            continue

        # --- page marker: try to merge the paragraphs around it ---
        if pattern_marker.search(current_text):
            prev_text, prev_index = find_prev_text(index)
            next_text, next_index = find_next_text(index)

            if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text):
                # A trailing comma is deliberately not treated as a sentence end.
                if not prev_text.endswith(('。', '!', '?')):
                    if not list_item_pattern.match(next_text) and len(prev_text) > 30:
                        merged_text = prev_text + ' ' + next_text
                        # Replace the already-emitted previous paragraph.
                        if processed and processed[-1] == prev_text:
                            processed.pop()
                        processed.append(merged_text)
                        # Resume after the paragraph that was merged in.
                        index = next_index + 1
                        continue

            # No merge: skip the marker plus any following blanks/markers.
            skip_index = index + 1
            while skip_index < len(elements):
                if isinstance(elements[skip_index], str):
                    break
                try:
                    skip_text = elements[skip_index].text.strip()
                except AttributeError:
                    skip_index += 1
                    continue
                if skip_text == '' or pattern_marker.search(skip_text):
                    skip_index += 1
                else:
                    break
            index = skip_index
            continue

        # --- drop short Chinese-numbered headings ---
        if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8:
            index += 1
            continue

        # --- numeric-header paragraphs are emitted as-is ---
        match = pattern_numeric_header.match(current_text)
        if not match:
            match = pattern_numeric_header_fallback.match(current_text)

        if match:
            processed.append(current_text)
            flag = True
            index += 1
            continue

        if flag:
            if not list_item_pattern.match(current_text):
                if processed:
                    next_non_empty_text, _ = find_next_text(index)
                    is_next_numbered = False
                    if next_non_empty_text:
                        is_next_numbered = bool(
                            pattern_numeric_header.match(next_non_empty_text) or
                            pattern_numeric_header_fallback.match(next_non_empty_text)
                        )
                    if is_next_numbered and len(processed[-1]) > 30:
                        # Continuation of the header paragraph: append to it
                        # only when the next paragraph is numbered again and
                        # the previous one is longer than 30 characters.
                        processed[-1] = processed[-1] + ' ' + current_text
                    else:
                        processed.append(current_text)
            else:
                # List items following a header are kept as their own entry.
                processed.append(current_text)
            flag = False
            index += 1
            continue

        processed.append(current_text)
        index += 1

    return processed
|
||||
|
||||
def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keywords):
    """Collect paragraphs that match keywords, plus their following section.

    A paragraph matching any pattern in ``keywords`` becomes a key of the
    result. If it also matches a pattern in ``follow_up_keywords``, the
    paragraphs after it are collected under that key until a paragraph that
    looks like the next section number is reached. A table enclosed by
    ``[$$table_start$$]`` / ``[$$table_over$$]`` that appears while
    collecting is appended in full.

    Args:
        processed_paragraphs: List of paragraph strings.
        keywords: Regex pattern or list of regex patterns.
        follow_up_keywords: Iterable of regex patterns.

    Returns:
        OrderedDict: Matching paragraph text -> list of collected paragraphs
        (the matching paragraph itself is the first item).
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None

    def match_keywords(text, patterns):
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def extract_from_text(text, current_index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            # A table met while collecting is appended entirely.
            if text == '[$$table_start$$]':
                current_index += 1
                # Bounded scan: the original indexed past the end of the list
                # (IndexError) when the closing marker was missing.
                while (current_index < len(processed_paragraphs)
                       and processed_paragraphs[current_index] != '[$$table_over$$]'):
                    extracted_paragraphs[active_key].append(processed_paragraphs[current_index])
                    current_index += 1
                return current_index
            if current_section_pattern and re.match(current_section_pattern, text):
                # Next section reached: stop collecting, then fall through so
                # this paragraph can itself start a new match.
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
                return current_index

        if match_keywords(text, keywords):
            active_key = text
            extracted_paragraphs[active_key] = [text]
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text)
                if section_number:
                    # The paragraph is numbered: collect until a sibling,
                    # parent-level or next top-level number appears.
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')
                    parts = current_section_number.split('.')
                    # Same depth as the current number; the negative
                    # lookahead rejects deeper levels.
                    pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+' + r'(?!\s*[..]\s*\d+)'
                    matched_patterns = [pattern]

                    # Following parent sections; up to 5 ahead to tolerate
                    # skipped numbers in sloppy documents.
                    if len(parts) > 1:
                        for i in range(1, 6):
                            parent_section_parts = parts[:-1].copy()
                            parent_section_parts[-1] = str(int(parent_section_parts[-1]) + i)
                            parent_pattern = r'^' + r'\s*[..]\s*'.join(parent_section_parts) + r'(?!\s*[..]\s*\d+)'
                            matched_patterns.append(parent_pattern)

                    # "<digits> 、" style numbering.
                    digit_comma_pattern = r'^\d+\s*、'
                    matched_patterns.append(digit_comma_pattern)

                    # Following top-level sections.
                    current_top_level_num = int(current_section_number.split('.')[0])
                    for i in range(1, 6):
                        next_top_level_num = current_top_level_num + i
                        next_top_level_pattern = r'^' + str(next_top_level_num) + r'\s*[..]'
                        if next_top_level_pattern not in matched_patterns:
                            matched_patterns.append(next_top_level_pattern)

                    combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
                    current_section_pattern = re.compile(combined_pattern)
                else:
                    # Unnumbered heading: take the numbering style of the
                    # first following non-empty paragraph and collect the
                    # consecutive paragraphs that share it.
                    found_next_number = False
                    current_section_pattern = None

                    while current_index < len(processed_paragraphs) - 1:
                        current_index += 1
                        next_text = processed_paragraphs[current_index].strip()
                        if not next_text:
                            continue
                        if not found_next_number:
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)',
                                                           next_text)
                            if next_section_number:
                                found_next_number = True
                                if next_section_number.group(1):
                                    section_parts = next_section_number.group(1).split('.')
                                    dynamic_pattern = r'^' + r'[..]'.join(
                                        [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                elif next_section_number.group(2):
                                    dynamic_pattern = r'^[\(\(]\s*\d+\s*[\)\)]'
                                elif next_section_number.group(3):
                                    dynamic_pattern = r'^\d+\s*、'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)
                        else:
                            continue_collecting = False
                            active_key = None
                            break

        return current_index

    index = 0
    while index < len(processed_paragraphs):
        # extract_from_text may advance the index past consumed paragraphs.
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1
    return extracted_paragraphs
|
||||
|
||||
# 分割表格中单元格文本
|
||||
def split_cell_text(text):
    """Split the text of one table cell into sentence-like fragments.

    Bracketed spans are first swapped for ``<BRACKET_n>`` placeholders so that
    nothing inside a pair of brackets can trigger a split, then the text is
    split at zero-width positions (after sentence-ending punctuation and in
    front of numbering tokens), and finally the placeholders are restored.

    Args:
        text: Raw cell text.

    Returns:
        list[str]: The non-blank fragments, in their original order.
    """
    # Full-width variants are written as \uXXXX escapes so the patterns match
    # both ASCII and full-width punctuation regardless of how this file is
    # re-encoded or normalised.
    dot = r'[.\uFF0E]'
    # Unit / counter words: a number followed by one of these is a quantity
    # ("3 个", "2024 年"), not a clause number, so no split happens there.
    units = r'[号条款节章项例页段部步点年月日时分秒个元千万台份]'

    # Innermost bracket pairs only (the body may not contain another bracket).
    bracket_pattern = re.compile(r'[(\uFF08][^(\uFF08)\uFF09]+[)\uFF09]')

    # 1. Replace bracketed content with numbered placeholders.
    bracket_contents = []

    def replace_bracket_content(match):
        bracket_contents.append(match.group(0))
        return f"<BRACKET_{len(bracket_contents) - 1}>"

    item_with_placeholders = bracket_pattern.sub(replace_bracket_content, text)

    # 2. Split while keeping sentences intact. Every alternative is zero-width,
    #    so no characters are consumed. Variable-width look-behind requires the
    #    third-party ``regex`` module (the stdlib ``re`` rejects it).
    split_points = '|'.join([
        # After a Chinese full stop, exclamation mark or question mark.
        r'(?<=[。!?\uFF01\uFF1F])',
        # Before a multi-level number such as "1.2.3", unless a unit word follows.
        r'(?<![A-Za-z]\s*)(?<!\d' + dot + r'?)(?=\d+(?:' + dot + r'\d+)+(?!\s*' + units + r'))',
        # Before "<digits><space>" that is not preceded by an operator, dot,
        # letter or digit and not followed by a unit word (e.g. "1 xx").
        r'(?<![+\-×÷*/.\uFF0EA-Za-z]\s*|\d)(?=\d+\s(?!\s*' + units + r'))',
        # Before "<digits>、" or "<digits>." when no digit or unit word follows
        # the separator (e.g. "1.").
        r'(?<![+\-×÷*/.\uFF0EA-Za-z]\s*|\d)(?=\d+[、.\uFF0E](?!\d|(?:\s*' + units + r')))',
        # Before a single letter followed by a dot (e.g. "A."), excluding
        # dotted words such as "www.baidu.com".
        r'(?<![A-Za-z])(?=[A-Za-z]' + dot + r'\s*(?![A-Za-z]))',
        # Before letters followed by a number or multi-level number.
        r'(?=[A-Za-z]+\s*\d+\s*(?:' + dot + r'\s*\d+)*)',
        # Before a Chinese numeral followed by "、" at the start of a line.
        r'(?<=^|\n)(?=[一二三四五六七八九十]+、)',
    ])
    split_sentences = regex.split(split_points, item_with_placeholders)

    # 3. Restore the bracketed content. A placeholder-looking token that was
    #    already present in the input (index out of range) is left untouched
    #    instead of raising IndexError.
    def restore_bracket_content(match):
        position = int(match.group(1))
        if position < len(bracket_contents):
            return bracket_contents[position]
        return match.group(0)

    split_sentences = [
        re.sub(r"<BRACKET_(\d+)>", restore_bracket_content, sentence)
        for sentence in split_sentences
    ]

    # 4. Drop fragments that are None, empty or whitespace-only.
    return [sentence for sentence in split_sentences if sentence and sentence.strip()]
|
||||
|
||||
# 文件预处理----按文件顺序提取文本和表格,并合并跨页表格
|
||||
def extract_file_elements(file_path):
    """Extract paragraphs and table text from a .docx file in document order.

    Body-level paragraphs are appended as paragraph objects; table cells are
    appended as text fragments (see ``split_cell_text``) between the markers
    ``'[$$table_start$$]'`` and ``'[$$table_over$$]'``. A table that follows
    another table with only blank / ``$$index_mark_N$$`` paragraphs in between
    is treated as the continuation of the same (page-split) table.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        list: Mixed list of paragraph objects and strings.
    """
    doc = Document(file_path)
    doc_elements = doc.element.body
    doc_paragraphs = doc.paragraphs
    doc_tables = doc.tables
    pre_table_head = None      # first header cell of the table currently open
    table_combine = False      # True while a continuation table is being merged
    paragraph_index = 0
    tables_index = 0
    doc_contents = []

    table_start_marker = '[$$table_start$$]'
    # Sentence terminators, ASCII and full-width forms; consistent with the
    # punctuation split_cell_text splits on.
    sentence_endings = ('。', '!', '?', ';', '\uff01', '\uff1f', '\uff1b')
    pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$')

    for element in doc_elements:
        if element.tag.endswith('}p'):
            if pre_table_head:
                text = doc_paragraphs[paragraph_index].text
                # Blank paragraphs and page markers directly after a table do
                # not close the table; they are simply skipped.
                if text == '' or pattern_marker.search(text):
                    paragraph_index += 1
                    continue
                # Real text after a table: the table is finished.
                doc_contents.append('[$$table_over$$]')
                table_combine = False
                pre_table_head = None
            doc_contents.append(doc_paragraphs[paragraph_index])
            paragraph_index += 1

        elif element.tag.endswith('}tbl'):
            table = doc_tables[tables_index]
            table_content = []
            for row_idx, row in enumerate(table.rows):
                if row_idx == 0:
                    if pre_table_head:
                        # A table is already open, so this one continues it;
                        # a repeated header row is dropped.
                        table_combine = True
                        if pre_table_head == row.cells[0].text:
                            continue
                    else:
                        # First table of a group: remember its header and
                        # emit the start marker; the header row is not kept.
                        pre_table_head = row.cells[0].text
                        doc_contents.append(table_start_marker)
                        continue
                for cell in row.cells:
                    cell_text = cell.text.strip()
                    # Only cells longer than 8 characters are kept.
                    if len(cell_text) > 8:
                        table_content += split_cell_text(cell_text)

            # Merge across the page break: when the previous fragment does not
            # end a sentence, the first fragment of this table continues it.
            # Guarded so that an empty continuation table cannot raise
            # IndexError and the start marker itself is never extended.
            if table_combine and table_content:
                previous = doc_contents[-1]
                if (isinstance(previous, str)
                        and previous != table_start_marker
                        and not previous.endswith(sentence_endings)):
                    doc_contents[-1] = previous + ' ' + table_content.pop(0)
            doc_contents.extend(table_content)
            tables_index += 1

    return doc_contents
|
||||
|
||||
def handle_query(file_path, user_query, output_file, result_key, keywords):
    """Collect keyword-matching passages from a document and let the LLM filter them.

    The candidate passages are numbered and written to ``output_file``, the
    model is asked which numbers are relevant, and the selected passages are
    merged with the passages that need no model confirmation.

    Args:
        file_path: Path of the .docx file to analyse.
        user_query: Prompt template handed to ``generate_full_user_query``.
        output_file: Text file that receives the numbered candidate passages.
        result_key: Key under which the result list is returned.
        keywords: Regex source string selecting candidate passages.

    Returns:
        dict: ``{result_key: [passage, ...]}`` ordered by original position;
        ``{result_key: []}`` when nothing was found or any step failed.
    """
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证:", "我方"]
        follow_up_keywords = [
            r'情\s*形\s*之\s*一',
            r'情\s*况\s*之\s*一',
            r'下\s*列(?!\s*公式)',  # negative look-ahead: skip "下列公式"
            r'以\s*下(?!\s*公式)',  # negative look-ahead: skip "以下公式"
            r'其\s*他.*?情\s*形\s*[:\uFF1A]',
            r'包\s*括'
        ]

        doc_contents = extract_file_elements(file_path)
        processed_paragraphs = preprocess_paragraphs(doc_contents)
        extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
        # Both results are dicts keyed by a sortable position index.
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)

        all_text1_items = sorted(all_texts1.items(), key=lambda x: x[0])

        selected_contents = {}
        final_list = []
        # NOTE(review): when all_texts1 is empty, all_texts2 is not returned
        # either. The source this was restored from had lost its indentation,
        # so confirm that this scoping is the intended one.
        if all_text1_items:
            # De-duplicate on the first 25 characters. The numbers shown to
            # the model are positions in this de-duplicated list, so the same
            # list must be used to map the model's answer back.
            numbered_items = []
            seen_contents = set()
            for global_idx, content in all_text1_items:
                key = content[:25]
                if key in seen_contents:
                    continue
                seen_contents.add(key)
                numbered_items.append((global_idx, content))

            with open(output_file, 'w', encoding='utf-8') as file:
                for counter, (_, content) in enumerate(numbered_items, start=1):
                    file.write(f"{counter}. {content}\n")
                    file.write("..............." + '\n')

            # Build the full prompt from the file just written and query the model.
            user_query = generate_full_user_query(output_file, user_query)
            model_ans = qianwen_plus(user_query)
            num_list = process_string_list(model_ans)  # numbers chosen by the model
            print(result_key + "选中的序号:" + str(num_list))

            for index in num_list:
                # The prompt allows the model to answer with text instead of a
                # number; such entries cannot be mapped back and are skipped
                # rather than aborting the whole query with a TypeError.
                if isinstance(index, bool) or not isinstance(index, int):
                    continue
                if 1 <= index <= len(numbered_items):
                    original_global_idx, content = numbered_items[index - 1]
                    selected_contents[original_global_idx] = content

            # Selected candidates first, then all_texts2 (which wins on a
            # position clash), ordered by original position.
            merged_dict = dict(selected_contents)
            merged_dict.update(all_texts2)
            final_list = [txt for _, txt in sorted(merged_dict.items(), key=lambda x: x[0])]

        return {result_key: final_list}
    except Exception as e:
        # Best-effort boundary: a failing query must not break its siblings.
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: []}
|
||||
|
||||
def combine_find_invalid(invalid_docpath, output_dir):
    """Run every configured "invalid / rejected bid" query against a document.

    Each query is a tuple ``(keyword_regex, prompt_template, temp_file,
    result_key)`` and is executed by ``handle_query`` in a worker thread.

    Args:
        invalid_docpath: Path of the .docx file to analyse.
        output_dir: Directory for the temporary prompt files (created if missing).

    Returns:
        dict: ``{"无效标与废标项": {result_key: [passage, ...], ...}}``.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Two further query definitions (result keys "废标项" and "不得存在的情形",
    # temp files temp2.txt / temp3.txt) were present only as commented-out
    # code and have been dropped here; recover them from version control if
    # they need to be re-enabled.
    queries = [
        (
            r'否\s*决|'
            r'无\s*效\s*投\s*标|'
            r'无\s*效\s*文\s*件|'
            r'(?:文\s*件|投\s*标|响\s*应)\s*[\u4e00-\u9fa5]?\s*(?:无|失)\s*效|'
            r'无\s*效\s*响\s*应|'
            r'无\s*效\s*报\s*价|'
            r'无\s*效\s*标|'
            r'视\s*为\s*无\s*效|'
            r'被\s*拒\s*绝|'
            r'将\s*拒\s*绝|'
            r'予\s*以\s*拒\s*绝',
            """以下是从招标文件中摘取的内容,文本中序号分明,各信息之间以...............分割。
任务目标:
从文本中筛选所有描述否决投标,拒绝投标,投标、响应无效或类似表述的情况,并返回对应的序号。
要求与指南:
文本中可能存在无关的信息,请准确筛选符合条件的信息,并将符合条件的信息的序号返回。
输出格式:
以 [x, x, x] 的形式返回,x 为符合条件的信息的序号,为自然数。
如果文本中没有符合条件的信息,请返回 []。
特殊情况:
如果某序号的内容明显分为几部分且一部分内容符合筛选条件,但其他部分明显是无关内容,请返回符合部分的字符串内容代替序号。
示例输出,仅供格式参考:
[1,3,4,6]
文本内容:{full_text}
""",
            os.path.join(output_dir, "temp1.txt"),
            "否决和无效投标情形"
        ),
    ]
    results = []

    # Run the queries in parallel; futures are kept in submission order so the
    # combined result preserves the order of ``queries``.
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(handle_query, invalid_docpath, user_query, output_file, result_key, keywords)
            futures.append((future, result_key))
            time.sleep(0.5)  # stagger submissions by half a second

        for future, result_key in futures:
            try:
                result = future.result()
            except Exception as e:
                print(f"线程处理 {result_key} 时出错: {e}")
                # Same shape as handle_query's own failure result (a list).
                result = {result_key: []}
            results.append(result)

    combined_dict = {}
    for d in results:
        combined_dict.update(d)

    print("无效标与废标done...")
    return {"无效标与废标项": combined_dict}
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke run against a local sample. The .docx is expected to exist
    # already; in the original script the steps that produce it from a PDF
    # (insert_mark followed by pdf2docx) were commented out.
    output_dir = r"C:\Users\Administrator\Desktop\货物\test5"
    invalid_added_docx = r'C:\Users\Administrator\Desktop\货物\test3\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
@ -1,42 +0,0 @@
|
||||
import PyPDF2
|
||||
import re
|
||||
|
||||
def extract_contents_with_pages(pdf_path, keyword):
    """Find the page number printed at the end of the first line mentioning ``keyword``.

    Pages are scanned in order. A line matches when it contains ``keyword``
    (case-insensitive) and ends with a run of digits, optionally followed by
    whitespace; lines that contain the keyword but no trailing number are
    skipped.

    Args:
        pdf_path: Path of the PDF file to scan.
        keyword: Text to look for.

    Returns:
        int | None: The trailing number of the first matching line, or
        ``None`` when no line matches.
    """
    trailing_number = re.compile(r"\d+(?=\s*$)")
    needle = keyword.lower()
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                continue
            for line in text.split('\n'):
                if needle not in line.lower():
                    continue
                match = trailing_number.search(line)
                if match:
                    return int(match.group(0))
    return None
|
||||
|
||||
def split_pdf(pdf_path, start_page, output_path):
    """Write the pages from ``start_page`` to the end of the PDF to ``output_path``.

    Args:
        pdf_path: Path of the source PDF.
        start_page: 1-based number of the first page to keep; anything that
            ``int()`` accepts. Values below 1 are treated as 1.
        output_path: Path of the PDF to create.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()
        end_page = len(reader.pages)
        # Convert the 1-based page number to a 0-based index. Clamp at 0 so a
        # start page below 1 cannot turn into a negative index, which would
        # silently pull in pages counted from the end of the document.
        first_index = max(int(start_page) - 1, 0)
        for i in range(first_index, end_page):
            writer.add_page(reader.pages[i])
        # Written while the source file is still open.
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)
|
||||
|
||||
# Usage example. Guarded so that importing this module no longer reads and
# writes the hard-coded sample paths as a side effect.
if __name__ == "__main__":
    pdf_path = "D:\\项目\\工程招标\\zb1.pdf"
    output_path = "D:\\项目\\工程招标\\tb_format.pdf"
    keyword = "投标文件格式"  # change to the keyword you want to look for
    page_number = extract_contents_with_pages(pdf_path, keyword)
    print(page_number)
    if page_number is not None:
        split_pdf(pdf_path, page_number, output_path)
    else:
        print("未找到含有关键字的页码")
|
@ -5,9 +5,9 @@ import concurrent.futures
|
||||
from flask_app.general.json_utils import clean_json_string, add_outer_key
|
||||
from flask_app.general.通用功能函数 import process_judge_questions, aggregate_basic_info
|
||||
from flask_app.general.投标人须知正文提取指定内容 import extract_from_notice
|
||||
from flask_app.工程标.判断是否分包等 import merge_json_to_list
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.old_version.判断是否分包等_old import merge_json_to_list
|
||||
from flask_app.general.llm.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file
|
||||
|
||||
def update_baseinfo_lists(baseinfo_list1, baseinfo_list2):
|
||||
# 创建一个字典,用于存储 baseinfo_list1 中的所有键值对
|
||||
|
@ -4,34 +4,12 @@ import re
|
||||
import json
|
||||
import time
|
||||
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.llm.多线程提问 import multi_threading
|
||||
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file
|
||||
from flask_app.general.merge_pdfs import merge_pdfs
|
||||
prompt = """
|
||||
# 角色
|
||||
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。
|
||||
|
||||
## 技能
|
||||
### 技能 1:文档解析与摘要
|
||||
- 深入理解并分析${document1}的内容,提取关键信息。
|
||||
- 根据需求生成简洁明了的摘要,保持原文核心意义不变。
|
||||
|
||||
### 技能 2:信息检索与关联
|
||||
- 在${document1}中高效检索特定信息或关键词。
|
||||
- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。
|
||||
|
||||
## 限制
|
||||
- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。
|
||||
- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。
|
||||
- 确保所有生成或改编的内容逻辑连贯,无误导性信息。
|
||||
|
||||
请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。
|
||||
"""
|
||||
|
||||
|
||||
def update_json_data(original_data, updates1, updates2,second_response_list):
|
||||
"""
|
||||
根据提供的更新字典覆盖原始JSON数据中对应的键值,支持点分隔的键来表示嵌套结构。
|
||||
|
@ -7,7 +7,7 @@ from flask_app.general.json_utils import extract_content_from_json, clean_json_s
|
||||
from flask_app.general.table_content_extraction import extract_tables_main
|
||||
from flask_app.工程标.形式响应评审 import process_reviews
|
||||
from flask_app.工程标.资格评审 import process_qualification
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from flask_app.货物标.资格审查main import combine_qualification_review
|
||||
from flask_app.general.merge_pdfs import merge_pdfs
|
||||
|
@ -3,8 +3,8 @@
|
||||
import json
|
||||
import re
|
||||
from flask_app.general.json_utils import clean_json_string, combine_json_results, add_keys_to_json
|
||||
from flask_app.general.多线程提问 import multi_threading, read_questions_from_file
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.多线程提问 import multi_threading, read_questions_from_file
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
|
||||
|
||||
# 这个函数的主要用途是将多个相关的字典(都包含 'common_key' 键)合并成一个更大的、综合的字典,所有相关信息都集中在 'common_key' 键下
|
||||
|
@ -4,15 +4,14 @@ import re
|
||||
import fitz
|
||||
from PyPDF2 import PdfReader
|
||||
import textwrap
|
||||
from flask_app.general.doubao import read_txt_to_string
|
||||
from flask_app.general.llm.doubao import read_txt_to_string
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
|
||||
from flask_app.general.llm.model_continue_query import process_continue_answers
|
||||
from flask_app.general.截取pdf通用函数 import create_get_text_function
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long_stream, qianwen_plus
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long_stream, qianwen_plus
|
||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||||
from flask_app.general.format_change import docx2pdf, pdf2docx
|
||||
import concurrent.futures
|
||||
from flask_app.general.doubao import doubao_model
|
||||
|
||||
|
||||
# 正则表达式判断原文中是否有商务、服务、其他要求
|
||||
@ -157,22 +156,6 @@ def find_exists(truncate_file, required_keys):
|
||||
# 最终返回清理后的要求列表
|
||||
return clean_requirements
|
||||
|
||||
|
||||
def generate_queries(truncate_file, required_keys):
    """Build one LLM question per requirement key found in the document.

    The keys come from ``find_exists(truncate_file, required_keys)``. Each
    question asks for one key and explicitly tells the model not to answer
    the other keys.

    Args:
        truncate_file: Document passed through to ``find_exists``.
        required_keys: Candidate requirement names passed through to ``find_exists``.

    Returns:
        list[str]: One question per key, in the order returned by ``find_exists``.
    """
    key_list = find_exists(truncate_file, required_keys)
    user_query_template = "这是一份货物标中采购要求部分的内容,请告诉我\"{}\"是什么,请以json格式返回结果,外层键名是\"{}\",内层键值对中的键名是原文中的标题或者是你对相关子要求的总结,而键值需要完全与原文保持一致,不可擅自总结删减,注意你无需回答采购清单中具体设备的技术参数要求,仅需从正文部分开始提取,"
    queries = []
    for key in key_list:
        parts = [user_query_template.format(key, key)]
        other_keys = [k for k in key_list if k != key]
        if other_keys:
            # Name the sibling keys so the model does not mix their content in.
            parts.append("也不需要回答\"{}\"中的内容,".format("\"和\"".join(other_keys)))
        parts.append("若相关要求不存在,在键值中填'未知'。")
        queries.append("".join(parts))
    return queries
|
||||
|
||||
|
||||
def generate_template(required_keys,full_text, type=1):
|
||||
# 定义每个键对应的示例内容
|
||||
example_content1 = {
|
||||
@ -318,7 +301,7 @@ def generate_template(required_keys,full_text, type=1):
|
||||
{tech_json_example2_str}
|
||||
"""
|
||||
if full_text:
|
||||
user_query_template += f"\n\n文件内容:{full_text}"
|
||||
user_query_template = f"文件内容:{full_text}\n" + user_query_template
|
||||
return user_query_template
|
||||
|
||||
def get_business_requirements(procurement_path, processed_filepath, model_type):
|
||||
|
@ -4,10 +4,10 @@ import threading
|
||||
import time
|
||||
import concurrent.futures
|
||||
from flask_app.general.json_utils import clean_json_string, add_outer_key
|
||||
from flask_app.general.通用功能函数 import process_judge_questions, aggregate_basic_info, get_deviation_requirements
|
||||
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.工程标.判断是否分包等 import merge_json_to_list
|
||||
from flask_app.general.通用功能函数 import process_judge_questions, aggregate_basic_info
|
||||
from flask_app.general.llm.多线程提问 import read_questions_from_file, multi_threading
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file
|
||||
from flask_app.old_version.判断是否分包等_old import merge_json_to_list
|
||||
from flask_app.general.投标人须知正文提取指定内容 import extract_from_notice
|
||||
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
|
||||
|
||||
|
@ -1,19 +1,18 @@
|
||||
# -*- encoding:utf-8 -*-
|
||||
import concurrent.futures
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
|
||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||
from flask_app.general.llm.model_continue_query import process_continue_answers
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
from flask_app.general.多线程提问 import multi_threading
|
||||
from flask_app.general.通义千问long import qianwen_long, upload_file, qianwen_plus
|
||||
from flask_app.general.json_utils import clean_json_string, combine_json_results
|
||||
from flask_app.general.doubao import doubao_model, generate_full_user_query, pdf2txt, read_txt_to_string
|
||||
from flask_app.货物标.技术参数要求提取后处理函数 import all_postprocess
|
||||
from flask_app.general.llm.多线程提问 import multi_threading
|
||||
from flask_app.general.llm.通义千问long_plus import qianwen_long, upload_file, qianwen_plus
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
from flask_app.general.llm.doubao import read_txt_to_string
|
||||
from flask_app.货物标.技术参数要求提取后处理函数 import main_postprocess
|
||||
|
||||
|
||||
def truncate_system_keys(data):
|
||||
"""
|
||||
@ -52,6 +51,8 @@ def truncate_system_keys(data):
|
||||
else:
|
||||
# 对于其他类型的数据,保持不变
|
||||
return data
|
||||
|
||||
|
||||
def generate_key_paths(data):
|
||||
"""
|
||||
处理输入的字典,生成 key_paths, grouped_paths 和 good_list,并根据条件修改原始字典。
|
||||
@ -171,6 +172,8 @@ def generate_key_paths(data):
|
||||
grouped_paths = [{path: grouped_counts[path]} for path in collected_grouped_paths]
|
||||
|
||||
return key_paths, grouped_paths, good_list, data_copy
|
||||
|
||||
|
||||
def rename_keys(data):
|
||||
"""
|
||||
对整个数据结构进行重命名处理。
|
||||
@ -227,10 +230,12 @@ def rename_keys(data):
|
||||
# 对整个数据结构进行递归重命名
|
||||
return rename_keys_recursive(data)
|
||||
|
||||
|
||||
def combine_and_update_results(original_data, updates):
|
||||
"""
|
||||
先规范化original和updates中的字典,防止空格的情况导致匹配不上无法更新
|
||||
"""
|
||||
|
||||
def normalize_key(key):
|
||||
"""
|
||||
规范化键名:
|
||||
@ -292,6 +297,7 @@ def combine_and_update_results(original_data, updates):
|
||||
|
||||
return original_data
|
||||
|
||||
|
||||
def generate_prompt(judge_res, full_text=None):
|
||||
"""
|
||||
获取需要采购的货物名称
|
||||
@ -384,10 +390,11 @@ def generate_prompt(judge_res, full_text=None):
|
||||
'''
|
||||
if '否' not in judge_res and full_text:
|
||||
# 添加文件内容部分
|
||||
base_prompt += f"\n文件内容:\n{full_text}\n"
|
||||
base_prompt = f"文件内容:{full_text}\n" + base_prompt
|
||||
base_prompt += "\n注意事项:\n1.严格按照上述要求执行,确保输出准确性和规范性。\n"
|
||||
return base_prompt
|
||||
|
||||
|
||||
def preprocess_data(data):
|
||||
"""
|
||||
动态识别并将值为非空列表且列表中每个项为单键字典或字符串的键转换为嵌套字典结构。
|
||||
@ -399,6 +406,7 @@ def preprocess_data(data):
|
||||
返回:
|
||||
dict: 预处理后的数据字典。
|
||||
"""
|
||||
|
||||
def is_single_key_dict_list(lst):
|
||||
"""
|
||||
判断一个列表是否为非空,且每个项都是单键字典。
|
||||
@ -505,15 +513,17 @@ def preprocess_data(data):
|
||||
recursive_preprocess(data)
|
||||
return data
|
||||
|
||||
|
||||
def get_technical_requirements(invalid_path, processed_filepath, model_type=1):
|
||||
judge_res = ""
|
||||
file_id = ""
|
||||
full_text = read_txt_to_string(processed_filepath)
|
||||
if model_type == 1:
|
||||
first_query_template = """该文件是否说明了采购需求,即需要采购哪些内容(包括货物、设备、系统、功能模块等)?如果有,请回答'是',否则,回答'否'
|
||||
first_query_template = """
|
||||
该文件是否说明了采购需求,即需要采购哪些内容(包括货物、设备、系统、功能模块等)?如果有,请回答'是',否则,回答'否'
|
||||
{}
|
||||
"""
|
||||
judge_query = first_query_template.format(f"文件内容:{full_text}")
|
||||
judge_query = f"文件内容:{full_text}\n" + first_query_template
|
||||
# judge_res = doubao_model(judge_query)
|
||||
judge_res = qianwen_plus(judge_query)
|
||||
if '否' in judge_res or model_type == 2:
|
||||
@ -537,12 +547,13 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
return {"采购需求": {}}
|
||||
preprocessed_data = preprocess_data(cleaned_res) # 确保最内层为[]
|
||||
processed_data = truncate_system_keys(preprocessed_data) # 限制深度
|
||||
key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data) # 提取需要采购的货物清单 key_list:交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' ,输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
|
||||
key_paths, grouped_paths, good_list, data_copy = generate_key_paths(
|
||||
processed_data) # 提取需要采购的货物清单 key_list:交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' ,输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
|
||||
# if len(good_list)>100 and model_type==1: #并发特别高(len(good_list)),tokens会比较贵,以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens long:0.0005/ktokens 差不多价格,暂不考虑
|
||||
# model_type=2
|
||||
# file_id=upload_file(processed_filepath)
|
||||
modified_data = rename_keys(data_copy)
|
||||
user_query_template = """请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果,键名为\"{}\",键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★或其他特殊符号(若有)和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。
|
||||
user_query_template = """请根据招标文件中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果,键名为\"{}\",键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★或其他特殊符号(若有)和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。
|
||||
**重要限制**:
|
||||
- **仅提取技术参数或采购要求,不包括任何商务要求**。商务要求通常涉及供应商资格、报价条款、交货时间、质保等内容,是整体的要求;而技术参数或采购要求则具体描述产品的技术规格、功能、性能指标等。
|
||||
- **商务要求的关键词示例**(仅供参考,不限于此):报价、交货、合同、资质、认证、服务、保修期等。如果内容包含上述关键词,请仔细甄别是否属于商务要求。
|
||||
@ -574,10 +585,11 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
"协议:routes 接口开放:具备;▲支持标准 ONVIF 协议与第三方厂家设备进行互联;支持 GB/T28181;应提供 SDK"
|
||||
]
|
||||
}}
|
||||
|
||||
{}
|
||||
"""
|
||||
user_query_template_two="""请根据货物标中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。由于该货物存在 {} 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为\"{}-1\";键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★或其他特殊符号(若有)和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。
|
||||
user_query_template_two = """请根据招标文件中采购要求部分的内容,告诉我\"{}\"的技术参数或采购要求是什么。由于该货物存在 {} 种不同的采购要求或技术参数,请逐一列出,并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号,编号为从 1 开始的自然数,依次递增,即第一个键名为\"{}-1\";键值为一个列表,列表中包含若干描述\"{}\"的技术参数或采购要求或功能说明的字符串,请按原文内容回答,保留三角▲、五角★或其他特殊符号(若有)和序号(若有),不可擅自增删内容,尤其是不可擅自添加序号。
|
||||
**重要限制**:
|
||||
- **仅提取技术参数或采购要求,不包括任何商务要求**。商务要求通常涉及供应商资格、报价条款、交货时间、质保等内容,是整体的要求;而技术参数或采购要求则具体描述产品的技术规格、功能、性能指标等。
|
||||
- **商务要求的关键词示例**(仅供参考,不限于此):报价、交货、合同、资质、认证、服务、保修期等。如果内容包含上述关键词,请仔细甄别是否属于商务要求。
|
||||
|
||||
要求与指南:
|
||||
1. 你的键值应该全面,不要遗漏。
|
||||
@ -613,18 +625,15 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
"支持夜视", "支持云存储"
|
||||
]
|
||||
}}
|
||||
|
||||
{}
|
||||
"""
|
||||
queries = []
|
||||
for key in key_paths:
|
||||
# 将键中的 '.' 替换为 '下的'
|
||||
modified_key = key.replace('.', '下的')
|
||||
# 使用修改后的键填充第一个占位符,原始键填充第二个占位符
|
||||
new_query = user_query_template.format(modified_key, key, modified_key) # 转豆包后取消注释
|
||||
if model_type == 1:
|
||||
new_query = user_query_template.format(modified_key, key, modified_key,f"文件内容:{full_text}") #转豆包后取消注释
|
||||
else:
|
||||
new_query = user_query_template.format(modified_key, key, modified_key,"")
|
||||
new_query = f"文件内容:{full_text}\n" + new_query
|
||||
queries.append(new_query)
|
||||
|
||||
# 处理 grouped_paths 中的项,应用 user_query_template_two
|
||||
@ -632,12 +641,9 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
for grouped_key, grouped_key_cnt in grouped_dict.items():
|
||||
# 将键中的 '.' 替换为 '下的'
|
||||
modified_grouped_key = grouped_key.replace('.', '下的')
|
||||
new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key, modified_grouped_key)
|
||||
if model_type == 1:
|
||||
new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key,
|
||||
modified_grouped_key, f"文件内容:{full_text}")
|
||||
else:
|
||||
new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key,
|
||||
modified_grouped_key, "")
|
||||
new_query=f"文件内容:{full_text}\n" + new_query
|
||||
queries.append(new_query)
|
||||
if model_type == 1:
|
||||
results = multi_threading(queries, "", "", 3, True) # 豆包
|
||||
@ -666,11 +672,12 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
|
||||
"""根据所有键是否已添加处理技术要求"""
|
||||
# 更新原始采购需求字典
|
||||
final_res = combine_and_update_results(modified_data, temp_final)
|
||||
ffinal_res=all_postprocess(final_res)
|
||||
ffinal_res = main_postprocess(final_res)
|
||||
ffinal_res["货物列表"] = good_list
|
||||
# 输出最终的 JSON 字符串
|
||||
return {"采购需求": ffinal_res}
|
||||
|
||||
|
||||
def test_all_files_in_folder(input_folder, output_folder):
|
||||
# 确保输出文件夹存在
|
||||
if not os.path.exists(output_folder):
|
||||
@ -693,6 +700,8 @@ def test_all_files_in_folder(input_folder, output_folder):
|
||||
print(f"结果已保存到: {output_file_path}")
|
||||
except Exception as e:
|
||||
print(f"处理文件 {file_path} 时出错: {e}")
|
||||
|
||||
|
||||
# 如果采购需求为空 考虑再调用一次大模型 qianwen-stream
|
||||
if __name__ == "__main__":
|
||||
start_time = time.time()
|
||||
|
@ -1,9 +1,11 @@
|
||||
import json
|
||||
import re
|
||||
from collections import defaultdict
|
||||
#传输技术参数需求的时候后处理 12.27版本,对重复的键名,若键值一样,不添加后缀-a -b..
|
||||
#按货物名提取技术参数,将货物名前的星等特殊符号(如'★交换机')带到具体的参数要求中
|
||||
|
||||
def postprocess_technical_table(data, good_list, special_keys=None, parent_key=''):
|
||||
"""
|
||||
传输技术参数需求的时候后处理,输入的data为经过main_postprocess处理的采购需求数据
|
||||
"""
|
||||
def get_suffix(n):
|
||||
"""
|
||||
根据数字n返回对应的字母后缀。
|
||||
@ -18,7 +20,12 @@ def postprocess_technical_table(data, good_list, special_keys=None, parent_key='
|
||||
def count_matching_keys(data, patterns, special_keys, key_value_map=None):
|
||||
"""
|
||||
递归统计匹配键的出现次数及其对应的唯一值,仅统计值为列表的键。
|
||||
不包括 special_keys 中的键。
|
||||
对于每个键,会先去除键名中的空格,如果这个清理后的键不在 special_keys 列表中且满足至少一个patterns,就把对应的值(列表转换为元组)添加到 key_value_map 字典中。
|
||||
注意:同一个键对应的相同值(转换成元组后)只会保存一次,保证唯一性。
|
||||
返回结构:{
|
||||
'key1': [tuple_value1, tuple_value2, ...],
|
||||
'key2': [tuple_value1, ...],
|
||||
}
|
||||
"""
|
||||
if key_value_map is None:
|
||||
key_value_map = defaultdict(list)
|
||||
@ -29,7 +36,7 @@ def postprocess_technical_table(data, good_list, special_keys=None, parent_key='
|
||||
if isinstance(value, list):
|
||||
if clean_key not in special_keys and any(pattern.match(clean_key) for pattern in patterns):
|
||||
value_tuple = tuple(value)
|
||||
if value_tuple not in key_value_map[clean_key]:
|
||||
if value_tuple not in key_value_map[clean_key]: #每个键下的值若一样,则只保留一个
|
||||
key_value_map[clean_key].append(value_tuple)
|
||||
elif isinstance(value, dict):
|
||||
count_matching_keys(value, patterns, special_keys, key_value_map)
|
||||
@ -42,7 +49,8 @@ def postprocess_technical_table(data, good_list, special_keys=None, parent_key='
|
||||
|
||||
def assign_suffixes(key_value_map):
|
||||
"""
|
||||
为每个键的每个唯一值分配后缀。
|
||||
如果该键只有一个唯一值,则不添加后缀-a -b..
|
||||
如果有多个唯一值,则给第一个值不添加后缀,后续的值按照顺序分别添加 -a、-b、-c
|
||||
返回一个字典,键为原键名,值为另一个字典,键为值元组,值为对应的后缀(如果需要)。
|
||||
"""
|
||||
suffix_assignment = defaultdict(dict)
|
||||
@ -74,7 +82,7 @@ def postprocess_technical_table(data, good_list, special_keys=None, parent_key='
|
||||
elif any(pattern.match(clean_key) for pattern in patterns):
|
||||
# 处理普通匹配键
|
||||
# 检查是否以特殊符号开头
|
||||
if clean_key.startswith(('▲', '★','●','■','◆','☆','△','◇','○','□','#')):
|
||||
if clean_key.startswith(('▲', '★','●','■','◆','☆','△','◇','○','□','#')): #货物名以特殊符号开头->移至键值(技术参数)开头
|
||||
#提取符号并去除符号:
|
||||
symbol = clean_key[0]
|
||||
stripped_key = clean_key[1:]
|
||||
@ -131,23 +139,8 @@ def postprocess_technical_table(data, good_list, special_keys=None, parent_key='
|
||||
|
||||
return filtered_data
|
||||
|
||||
|
||||
|
||||
def postprocess(data):
    """Collapse placeholder-only dicts into lists of their keys.

    A nested dict whose values are all one of ``'/'``, ``'未知'`` or ``{}`` is
    replaced by the list of its keys (an empty dict becomes an empty list).
    Any other nested dict is processed recursively; non-dict values are kept
    unchanged.

    Args:
        data: Dictionary to process.

    Returns:
        dict: A new dictionary with the same top-level keys.
    """
    placeholders = ('/', '未知', {})

    def convert_dict(value):
        if all(v in placeholders for v in value.values()):
            return list(value.keys())
        # Otherwise keep the dict and descend into nested dicts only.
        return {
            k: convert_dict(v) if isinstance(v, dict) else v
            for k, v in value.items()
        }

    return {
        key: convert_dict(val) if isinstance(val, dict) else val
        for key, val in data.items()
    }
|
||||
|
||||
|
||||
def all_postprocess(data):
|
||||
def main_postprocess(data):
|
||||
#解析采购要求时候的后处理,用于前端网页展示。
|
||||
def recursive_process(item):
|
||||
pattern = re.compile(r'(.+)-\d+$')
|
||||
|
||||
@ -164,10 +157,9 @@ def all_postprocess(data):
|
||||
return item
|
||||
|
||||
temp = restructure_data(data)
|
||||
processed_data = recursive_process(temp)
|
||||
processed_data = recursive_process(temp) #重构数据以标准化嵌套层级至三层,便于前端展示。
|
||||
return processed_data
|
||||
|
||||
|
||||
def restructure_data(data):
|
||||
"""
|
||||
重构数据以标准化嵌套层级至三层。
|
||||
@ -223,15 +215,6 @@ def restructure_data(data):
|
||||
else:
|
||||
return structured_data
|
||||
|
||||
|
||||
# Collect every prefix of a string that ends with a colon.
def get_prefixes(s):
    """Return all prefixes of ``s`` that end with a colon.

    Each colon found at index ``i`` yields the prefix ``s[:i + 1]``; the
    prefixes are returned in order of increasing length.

    Args:
        s: The string to scan.

    Returns:
        list: The colon-terminated prefixes, empty when ``s`` has no colon.
    """
    # NOTE(review): the previous membership list named ':' twice; the second
    # entry was presumably the full-width colon (U+FF1A), so both forms are
    # accepted here -- confirm against the repository.
    colons = (':', '\uff1a')
    return [s[:i + 1] for i, ch in enumerate(s) if ch in colons]
|
||||
|
||||
# 定义删除公共前缀的函数
|
||||
def remove_common_prefixes(string_list, min_occurrence=3):
|
||||
"""
|
||||
@ -244,6 +227,14 @@ def remove_common_prefixes(string_list, min_occurrence=3):
|
||||
Returns:
|
||||
list: 删除公共前缀后的字符串列表。
|
||||
"""
|
||||
# Collect every prefix of a string that ends with a colon.
def get_prefixes(s):
    """Return all prefixes of ``s`` that end with a colon.

    Each colon found at index ``i`` yields the prefix ``s[:i + 1]``; the
    prefixes are returned in order of increasing length.

    Args:
        s: The string to scan.

    Returns:
        list: The colon-terminated prefixes, empty when ``s`` has no colon.
    """
    # NOTE(review): the previous membership list named ':' twice; the second
    # entry was presumably the full-width colon (U+FF1A), so both forms are
    # accepted here -- confirm against the repository.
    colons = (':', '\uff1a')
    return [s[:i + 1] for i, ch in enumerate(s) if ch in colons]
|
||||
|
||||
if not string_list:
|
||||
return string_list
|
||||
|
||||
@ -305,7 +296,7 @@ if __name__ == "__main__":
|
||||
],
|
||||
}
|
||||
# 处理数据
|
||||
result = all_postprocess(sample_data)
|
||||
result = main_postprocess(sample_data)
|
||||
# 输出处理结果
|
||||
print(json.dumps(result,ensure_ascii=False,indent=4))
|
||||
|
||||
|
@ -3,11 +3,8 @@ import json
|
||||
import os
|
||||
import time
|
||||
from flask_app.general.format_change import get_pdf_page_count
|
||||
from flask_app.general.doubao import pdf2txt
|
||||
from flask_app.general.file2markdown import convert_file_to_markdown
|
||||
from flask_app.general.format_change import pdf2docx
|
||||
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
|
||||
from flask_app.general.通义千问long import upload_file
|
||||
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements
|
||||
|
||||
|
||||
|
@ -3,7 +3,7 @@ import json
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from flask_app.general.通义千问long import upload_file, qianwen_long
|
||||
from flask_app.general.llm.通义千问long_plus import upload_file, qianwen_long
|
||||
from flask_app.general.json_utils import clean_json_string
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user