无效标废标提取优化版本
This commit is contained in:
parent
b8dbc9602b
commit
2226d27a3c
@ -130,8 +130,8 @@ def process_all_part1_pdfs(folder_path, output_folder):
|
|||||||
extract_tables_main(file_path, subfolder_path)
|
extract_tables_main(file_path, subfolder_path)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件111_tobidders_notice_part1.docx'
|
path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp" # 前附表json文件
|
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件
|
||||||
res=extract_tables_main(path, output_folder)
|
res=extract_tables_main(path, output_folder)
|
||||||
#
|
#
|
||||||
# folder_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4'
|
# folder_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4'
|
||||||
|
@ -133,7 +133,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
if any(exclude in data for exclude in excludes):
|
if any(exclude in data for exclude in excludes):
|
||||||
continue # 如果包含任何排除字符串,跳过这个数据
|
continue # 如果包含任何排除字符串,跳过这个数据
|
||||||
# 去掉开头的序号,包括字母+数字的格式 以及括号+数字
|
# 去掉开头的序号,包括字母+数字的格式 以及括号+数字
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
data = re.sub(pattern, '', data).strip()
|
data = re.sub(pattern, '', data).strip()
|
||||||
keyword_match = re.search(keywords, data)
|
keyword_match = re.search(keywords, data)
|
||||||
if keyword_match:
|
if keyword_match:
|
||||||
@ -158,7 +158,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
# print(text_list)
|
# print(text_list)
|
||||||
new_text_list=preprocess_text_list(text_list)
|
new_text_list=preprocess_text_list(text_list)
|
||||||
# print(new_text_list)
|
# print(new_text_list)
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号
|
data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号
|
||||||
# 将修改后的第一个元素和剩余的元素连接起来
|
# 将修改后的第一个元素和剩余的元素连接起来
|
||||||
new_text_list[0] = data # 更新列表中的第一个元素
|
new_text_list[0] = data # 更新列表中的第一个元素
|
||||||
@ -183,7 +183,8 @@ def find_sentences_with_keywords(data, keywords, follow_up_keywords):
|
|||||||
sentences2.extend(result2)
|
sentences2.extend(result2)
|
||||||
elif isinstance(data, str):
|
elif isinstance(data, str):
|
||||||
# 分割句子,保证句子完整性(按标点符号和序号分割)
|
# 分割句子,保证句子完整性(按标点符号和序号分割)
|
||||||
split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割
|
# split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data) # 扩展匹配序号分割
|
||||||
|
split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data)
|
||||||
i = 0
|
i = 0
|
||||||
while i < len(split_sentences):
|
while i < len(split_sentences):
|
||||||
sentence = split_sentences[i].strip()
|
sentence = split_sentences[i].strip()
|
||||||
@ -205,12 +206,14 @@ def find_sentences_with_keywords(data, keywords, follow_up_keywords):
|
|||||||
full_text = ' '.join(split_sentences[start_index:end_index]).strip()
|
full_text = ' '.join(split_sentences[start_index:end_index]).strip()
|
||||||
else:
|
else:
|
||||||
full_text = ' '.join(split_sentences[start_index:]).strip()
|
full_text = ' '.join(split_sentences[start_index:]).strip()
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
# pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
||||||
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
data=re.sub(pattern,'',full_text)
|
data=re.sub(pattern,'',full_text)
|
||||||
sentences2.append(data) # 存储有后续关键词的情况
|
sentences2.append(data) # 存储有后续关键词的情况
|
||||||
i = end_index if found_next_section else len(split_sentences)
|
i = end_index if found_next_section else len(split_sentences)
|
||||||
else:
|
else:
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
# pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
||||||
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
data = re.sub(pattern, '', sentence).replace('\n','').strip()
|
data = re.sub(pattern, '', sentence).replace('\n','').strip()
|
||||||
sentences1.append(data) # 存储没有后续关键词的情况
|
sentences1.append(data) # 存储没有后续关键词的情况
|
||||||
i += 1
|
i += 1
|
||||||
@ -274,36 +277,36 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
|
|||||||
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
|
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
|
||||||
all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) #提取表格数据(json_data)
|
all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) #提取表格数据(json_data)
|
||||||
qianwen_txt = all_texts1 + all_tables1
|
qianwen_txt = all_texts1 + all_tables1
|
||||||
# Proceed only if there is content to write
|
selected_contents = set() # 使用 set 去重
|
||||||
|
|
||||||
if qianwen_txt:
|
if qianwen_txt:
|
||||||
with open(output_file, 'w', encoding='utf-8') as file:
|
with open(output_file, 'w', encoding='utf-8') as file:
|
||||||
# 初始化一个计数器
|
|
||||||
counter = 1
|
counter = 1
|
||||||
for content in qianwen_txt:
|
for content in qianwen_txt:
|
||||||
file.write("..............."+'\n')
|
file.write("..............." + '\n')
|
||||||
# 写入内容前加上序号,后面接一个点和空格,然后是内容
|
|
||||||
file.write(f"{counter}. {content}\n")
|
file.write(f"{counter}. {content}\n")
|
||||||
# 更新计数器,每次循环递增
|
|
||||||
counter += 1
|
counter += 1
|
||||||
|
|
||||||
file_id = upload_file(output_file)
|
file_id = upload_file(output_file)
|
||||||
qianwen_ans = qianwen_long(file_id, user_query)
|
qianwen_ans = qianwen_long(file_id, user_query)
|
||||||
selected_contents = []
|
|
||||||
num_list = process_string_list(qianwen_ans)
|
num_list = process_string_list(qianwen_ans)
|
||||||
print(num_list)
|
print(num_list)
|
||||||
|
|
||||||
for index in num_list:
|
for index in num_list:
|
||||||
if index - 1 < len(qianwen_txt):
|
if index - 1 < len(qianwen_txt):
|
||||||
content = qianwen_txt[index - 1] # 转换序号为索引(假设序号从1开始)
|
content = qianwen_txt[index - 1]
|
||||||
selected_contents.append(content)
|
selected_contents.add(content)
|
||||||
selected_contents += all_texts2
|
|
||||||
selected_contents += all_tables2
|
# 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容
|
||||||
# 创建一个字典来保存结果
|
selected_contents.update(all_texts2)
|
||||||
res = {result_key: selected_contents}
|
selected_contents.update(all_tables2)
|
||||||
# 将结果转换为JSON字符串
|
|
||||||
# os.remove(output_file) # Remove the file after use
|
# 如果 selected_contents 不为空,则返回结果,否则返回空字符串
|
||||||
# print(f"Deleted temporary file: {output_file}")
|
if selected_contents:
|
||||||
|
res = {result_key: list(selected_contents)}
|
||||||
else:
|
else:
|
||||||
res = {result_key: ""} # Set the response to empty if no contents were extracted
|
res = {result_key: ""}
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
|
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
|
||||||
|
@ -36,7 +36,41 @@ def read_docx_tables(file_path):
|
|||||||
print(f"Row {row_idx + 1}: {row_data}")
|
print(f"Row {row_idx + 1}: {row_data}")
|
||||||
print("\n" + "-" * 40 + "\n") # 打印分隔线
|
print("\n" + "-" * 40 + "\n") # 打印分隔线
|
||||||
|
|
||||||
|
def read_tables_from_docx(file_path, min_length=6):
    """Collect the text of every sufficiently long table cell in a .docx file.

    Args:
        file_path: Path of the document handed to ``Document``.
        min_length: A cell is kept only when its stripped text is strictly
            longer than this many characters. Defaults to 6, the threshold
            that used to be hard-coded here.

    Returns:
        A list of stripped cell texts in table, row, cell order. An empty
        list is returned when the document cannot be opened or when it
        contains no tables.
    """
    # Best-effort open: any failure is reported and treated as "no data".
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"Error opening file: {e}")
        return []

    if not doc.tables:
        print("No tables found in the document.")
        return []

    cell_contents = []
    for table in doc.tables:
        for row in table.rows:
            # NOTE(review): python-docx may yield a merged cell once per grid
            # column it spans, so repeated texts are possible here - confirm
            # whether callers want de-duplication.
            for cell in row.cells:
                cell_text = cell.text.strip()
                # Keep only cells longer than the threshold; presumably this
                # drops serial numbers and short headers - TODO confirm.
                if len(cell_text) > min_length:
                    cell_contents.append(cell_text)

    return cell_contents
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件(1)_tobidders_notice_part1.docx"
|
file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
|
||||||
|
# output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件
|
||||||
# read_docx(file_path)
|
# read_docx(file_path)
|
||||||
read_docx_tables(file_path)
|
read_docx_tables(file_path)
|
||||||
|
list=read_tables_from_docx(file_path)
|
||||||
|
for i in list:
|
||||||
|
print(i)
|
||||||
|
print("--------------")
|
@ -3,7 +3,6 @@ import json
|
|||||||
import os.path
|
import os.path
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
from flask_app.main.json_utils import combine_json_results, nest_json_under_key
|
|
||||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
|
from flask_app.main.禁止投标情形 import find_forbidden, process_string_list
|
||||||
@ -147,8 +146,8 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
# 检查是否包含任何需要排除的字符串
|
# 检查是否包含任何需要排除的字符串
|
||||||
if any(exclude in data for exclude in excludes):
|
if any(exclude in data for exclude in excludes):
|
||||||
continue # 如果包含任何排除字符串,跳过这个数据
|
continue # 如果包含任何排除字符串,跳过这个数据
|
||||||
# 去掉开头的序号,包括字母+数字的格式 以及括号+数字
|
# 去掉开头的序号,eg:1 | (1) |(2) | 1. | 2.(全角点)| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
data = re.sub(pattern, '', data).strip()
|
data = re.sub(pattern, '', data).strip()
|
||||||
keyword_match = re.search(keywords, data)
|
keyword_match = re.search(keywords, data)
|
||||||
if keyword_match:
|
if keyword_match:
|
||||||
@ -172,7 +171,8 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
else:
|
else:
|
||||||
new_text_list=preprocess_text_list(text_list)
|
new_text_list=preprocess_text_list(text_list)
|
||||||
#用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。
|
#用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
|
|
||||||
data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号
|
data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号
|
||||||
# 将修改后的第一个元素和剩余的元素连接起来
|
# 将修改后的第一个元素和剩余的元素连接起来
|
||||||
new_text_list[0] = data # 更新列表中的第一个元素
|
new_text_list[0] = data # 更新列表中的第一个元素
|
||||||
@ -181,10 +181,11 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
|
|
||||||
return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果
|
return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果
|
||||||
|
|
||||||
def read_docx_last_column(file_path):
|
#只读取前附表中的最后一列(省钱,但容易漏内容)
|
||||||
|
def read_docx_last_column(truncate_file):
|
||||||
# 尝试打开文档
|
# 尝试打开文档
|
||||||
try:
|
try:
|
||||||
doc = Document(file_path)
|
doc = Document(truncate_file)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error opening file: {e}")
|
print(f"Error opening file: {e}")
|
||||||
return []
|
return []
|
||||||
@ -207,15 +208,51 @@ def read_docx_last_column(file_path):
|
|||||||
|
|
||||||
return last_column_values
|
return last_column_values
|
||||||
|
|
||||||
#TODO:采购拦标价:人民币 380000.00 元供应商首次报价或最后报价超出本项目公布拦标价的,按照无效报价处
|
#完整读取文件中所有表格(适合pdf转docx价格便宜的情况,优先推荐,内容完整)
|
||||||
|
def read_tables_from_docx(file_path, min_length=6):
    """Collect the text of every sufficiently long table cell in a .docx file.

    Args:
        file_path: Path of the document handed to ``Document``.
        min_length: A cell is kept only when its stripped text is strictly
            longer than this many characters. Defaults to 6, the threshold
            that used to be hard-coded here.

    Returns:
        A list of stripped cell texts in table, row, cell order. An empty
        list is returned when the document cannot be opened or when it
        contains no tables.
    """
    # Best-effort open: any failure is reported and treated as "no data".
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"Error opening file: {e}")
        return []

    if not doc.tables:
        print("No tables found in the document.")
        return []

    cell_contents = []
    for table in doc.tables:
        for row in table.rows:
            # NOTE(review): python-docx may yield a merged cell once per grid
            # column it spans, so repeated texts are possible here - confirm
            # whether callers want de-duplication.
            for cell in row.cells:
                cell_text = cell.text.strip()
                # Keep only cells longer than the threshold; presumably this
                # drops serial numbers and short headers - TODO confirm.
                if len(cell_text) > min_length:
                    cell_contents.append(cell_text)

    return cell_contents
|
||||||
|
|
||||||
def extract_table_with_keywords(data, keywords, follow_up_keywords):
|
def extract_table_with_keywords(data, keywords, follow_up_keywords):
|
||||||
"""遍历列表中的每个元素,查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。"""
|
"""遍历列表中的每个元素,查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。"""
|
||||||
sentences1 = [] # 保存没有后续关键词的情况
|
sentences1 = [] # 保存没有后续关键词的情况
|
||||||
sentences2 = [] # 保存有后续关键词的情况
|
sentences2 = [] # 保存有后续关键词的情况
|
||||||
|
|
||||||
|
# 检查是否包含 '无效报价' 的关键词
|
||||||
|
check_invalid_bidding = '无\s*效\s*报\s*价' in keywords
|
||||||
# 遍历列表中的每个字符串元素
|
# 遍历列表中的每个字符串元素
|
||||||
for item in data:
|
for item in data:
|
||||||
# 分割句子,保证句子完整性(按标点符号和序号分割)
|
# Only when `keywords` contains the '无效报价' pattern: keep any item mentioning "无效报价" whole, without sentence splitting
|
||||||
|
if check_invalid_bidding and re.search(r'无\s*效\s*报\s*价', item, re.IGNORECASE):
|
||||||
|
sentences1.append(item.strip())
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 分割句子,保证句子完整性(按标点符号和序号分割)eg:(?=\d+\.\d+):匹配诸如 1.1、2.2 之类的序号,并在序号前进行分割。
|
||||||
split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', item)
|
split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', item)
|
||||||
|
|
||||||
i = 0
|
i = 0
|
||||||
@ -239,19 +276,17 @@ def extract_table_with_keywords(data, keywords, follow_up_keywords):
|
|||||||
full_text = ' '.join(split_sentences[start_index:end_index]).strip()
|
full_text = ' '.join(split_sentences[start_index:end_index]).strip()
|
||||||
else:
|
else:
|
||||||
full_text = ' '.join(split_sentences[start_index:]).strip()
|
full_text = ' '.join(split_sentences[start_index:]).strip()
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
# pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
||||||
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
full_text = re.sub(pattern, '', full_text)
|
full_text = re.sub(pattern, '', full_text)
|
||||||
sentences2.append(full_text) # 存储有后续关键词的情况
|
sentences2.append(full_text) # 存储有后续关键词的情况
|
||||||
i = end_index if found_next_section else len(split_sentences)
|
i = end_index if found_next_section else len(split_sentences)
|
||||||
else:
|
else:
|
||||||
# 没有后续关键词的情况
|
# 没有后续关键词的情况
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
# pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
||||||
#TODO:会删除什么范围的万
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
|
||||||
cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').strip()
|
cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').strip()
|
||||||
# 删除句子中的 "万" 或 "元"
|
|
||||||
cleaned_sentence = re.sub(r'[万元]', '', cleaned_sentence).strip()
|
|
||||||
sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况
|
sentences1.append(cleaned_sentence) # 存储没有后续关键词的情况
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
else:
|
else:
|
||||||
i += 1
|
i += 1
|
||||||
@ -303,47 +338,47 @@ def handle_query(file_path, user_query, output_file, result_key, keywords,trunca
|
|||||||
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:","我方"]
|
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:","我方"]
|
||||||
follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
|
follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
|
||||||
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果
|
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果
|
||||||
# print(extracted_contents)
|
|
||||||
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
|
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
|
||||||
table_data_list=read_docx_last_column(truncate_file) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息'
|
table_data_list=read_docx_last_column(truncate_file) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息'
|
||||||
|
# table_data_list=read_tables_from_docx(file_path)
|
||||||
all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords,follow_up_keywords)
|
all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords,follow_up_keywords)
|
||||||
qianwen_txt = all_texts1 + all_tables1
|
qianwen_txt = all_texts1 + all_tables1
|
||||||
# Proceed only if there is content to write
|
# Proceed only if there is content to write
|
||||||
|
selected_contents = set() # 使用 set 去重
|
||||||
|
|
||||||
if qianwen_txt:
|
if qianwen_txt:
|
||||||
with open(output_file, 'w', encoding='utf-8') as file:
|
with open(output_file, 'w', encoding='utf-8') as file:
|
||||||
# 初始化一个计数器
|
|
||||||
counter = 1
|
counter = 1
|
||||||
for content in qianwen_txt:
|
for content in qianwen_txt:
|
||||||
file.write("..............."+'\n')
|
file.write("..............." + '\n')
|
||||||
# 写入内容前加上序号,后面接一个点和空格,然后是内容
|
|
||||||
file.write(f"{counter}. {content}\n")
|
file.write(f"{counter}. {content}\n")
|
||||||
# 更新计数器,每次循环递增
|
|
||||||
counter += 1
|
counter += 1
|
||||||
|
|
||||||
file_id = upload_file(output_file)
|
file_id = upload_file(output_file)
|
||||||
qianwen_ans = qianwen_long(file_id, user_query)
|
qianwen_ans = qianwen_long(file_id, user_query)
|
||||||
selected_contents = set() # 使用 set 去重
|
|
||||||
num_list = process_string_list(qianwen_ans)
|
num_list = process_string_list(qianwen_ans)
|
||||||
print(num_list)
|
print(num_list)
|
||||||
|
|
||||||
for index in num_list:
|
for index in num_list:
|
||||||
if index - 1 < len(qianwen_txt):
|
if index - 1 < len(qianwen_txt):
|
||||||
content = qianwen_txt[index - 1] # 转换序号为索引(假设序号从1开始)
|
content = qianwen_txt[index - 1]
|
||||||
selected_contents.add(content)
|
selected_contents.add(content)
|
||||||
# 将 all_texts2 和 all_tables2 中的内容也添加到 set 中
|
|
||||||
selected_contents.update(all_texts2)
|
# 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容
|
||||||
selected_contents.update(all_tables2)
|
selected_contents.update(all_texts2)
|
||||||
# 将 set 转换为 list 来返回结果
|
selected_contents.update(all_tables2)
|
||||||
|
|
||||||
|
# 如果 selected_contents 不为空,则返回结果,否则返回空字符串
|
||||||
|
if selected_contents:
|
||||||
res = {result_key: list(selected_contents)}
|
res = {result_key: list(selected_contents)}
|
||||||
# 将结果转换为JSON字符串
|
|
||||||
# os.remove(output_file) # Remove the file after use
|
|
||||||
# print(f"Deleted temporary file: {output_file}")
|
|
||||||
else:
|
else:
|
||||||
res = {result_key: ""} # Set the response to empty if no contents were extracted
|
res = {result_key: ""}
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def combine_find_invalid(file_path, output_dir,truncate_file):
|
def combine_find_invalid(file_path, output_dir,truncate_file):
|
||||||
queries = [
|
queries = [
|
||||||
(r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|无\s*效\s*响\s*应|无\s*效\s*报\s*价|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
|
(r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
|
||||||
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
|
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
|
||||||
os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
|
os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
|
||||||
(r'废\s*标',
|
(r'废\s*标',
|
||||||
@ -381,11 +416,11 @@ def combine_find_invalid(file_path, output_dir,truncate_file):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
# truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
|
# truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
|
||||||
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件(1)_tobidders_notice_part1.docx"
|
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件(实高电子显示屏)_tobidders_notice_part1.docx"
|
||||||
clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause磋商文件(1)_tobidders_notice_part2.json"
|
clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause招标文件(实高电子显示屏)_tobidders_notice_part2.json"
|
||||||
output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid"
|
output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid"
|
||||||
# doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
|
# doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
|
||||||
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx'
|
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件(实高电子显示屏).docx'
|
||||||
results = combine_find_invalid(doc_path, output_dir,truncate_file)
|
results = combine_find_invalid(doc_path, output_dir,truncate_file)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print("Elapsed time:", str(end_time - start_time))
|
print("Elapsed time:", str(end_time - start_time))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user