9.3
This commit is contained in:
parent
1d0211ce72
commit
ee16d5e4b4
50
flask_app/main/docx截取docx.py
Normal file
50
flask_app/main/docx截取docx.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
from docx import Document
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def copy_docx(source_path):
    """Copy one section of a .docx file into a sibling ``*_invalid`` document.

    The section starts at the LAST paragraph matching ``begin_pattern`` and
    runs up to and including the first following paragraph that matches
    ``end_pattern`` (or to the end of the document when none matches).
    Only body paragraphs and their runs are copied; for each run the text,
    bold/italic/underline flags, font colour and font size are carried over.

    If no paragraph matches ``begin_pattern`` an empty document is still
    written, as before.

    Args:
        source_path: Path of the source .docx file.

    Returns:
        The path of the written file: same folder and extension as the
        source, with ``_invalid`` appended to the base name.
    """
    doc = Document(source_path)  # open the source document

    # Build the destination path next to the source file.
    output_folder = os.path.dirname(source_path)
    file_name_without_ext, file_ext = os.path.splitext(
        os.path.basename(source_path)
    )
    destination_path = os.path.join(
        output_folder, file_name_without_ext + "_invalid" + file_ext
    )

    # Section boundary patterns.
    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
    end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|:清标报告|:清标报告')

    # Read the paragraph list once and reuse it for both passes.
    paragraphs = doc.paragraphs

    # Locate the last paragraph that matches begin_pattern.
    last_begin_index = -1
    for i, paragraph in enumerate(paragraphs):
        if begin_pattern.search(paragraph.text):
            last_begin_index = i

    new_doc = Document()  # create the output document

    # Copy from the last begin match until end_pattern is seen.
    if last_begin_index != -1:
        for paragraph in paragraphs[last_begin_index:]:
            new_para = new_doc.add_paragraph(style=paragraph.style)
            for run in paragraph.runs:
                new_run = new_para.add_run(run.text)
                new_run.bold = run.bold
                new_run.italic = run.italic
                new_run.underline = run.underline
                # Only write a colour when the source run actually has an
                # explicit RGB value; the colour object itself is not a
                # meaningful truth test.
                color = run.font.color
                rgb = color.rgb if color is not None else None
                if rgb is not None:
                    new_run.font.color.rgb = rgb
                new_run.font.size = run.font.size

            # The paragraph that matches end_pattern is copied, then we stop.
            if end_pattern.search(paragraph.text):
                break

    new_doc.save(destination_path)  # write the output document
    return destination_path
|
||||||
|
|
||||||
|
|
||||||
|
# 调用函数
|
||||||
|
if __name__ == '__main__':
    # Manual run against a local sample document.
    copy_docx("C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest13.docx")
|
@ -58,7 +58,7 @@ def docx2pdf(local_path_in):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# 替换为你的文件路径和API URL
|
# 替换为你的文件路径和API URL
|
||||||
local_path_in="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest16_invalid.docx"
|
local_path_in="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\test111.pdf"
|
||||||
# pdf2docx(local_path_in)
|
# pdf2docx(local_path_in)
|
||||||
downloaded_file=docx2pdf(local_path_in)
|
downloaded_file=docx2pdf(local_path_in)
|
||||||
print(downloaded_file)
|
print(downloaded_file)
|
||||||
|
@ -1,18 +1,35 @@
|
|||||||
import re
|
import re
|
||||||
|
import ast
|
||||||
|
|
||||||
# 正则表达式
|
def process_string_list(string_list):
    """Extract the first ``[...]`` list found in a free-text answer.

    The text between the first ``[`` and the next ``]`` on the same line is
    split into items. Empty items (e.g. from a trailing comma) are dropped.

    * If every remaining item is a decimal number, a list of ``int`` is
      returned. For this numeric case both the ASCII comma and the
      full-width comma ``,`` are accepted as separators.
    * Otherwise the items are returned as stripped strings, split on the
      ASCII comma only (a full-width comma may be part of a sentence).

    Items are taken verbatim; no Python literal parsing is involved, so
    quotes or backslashes inside an item cannot make the whole list fail.

    Args:
        string_list: Text that may contain a bracketed list.

    Returns:
        A list of ints, a list of strings, or ``[]`` when no bracketed
        list is present or it is empty.
    """
    # Match the content inside the first pair of square brackets.
    match = re.search(r'\[(.*?)\]', string_list)
    if not match:
        # Nothing bracketed in the text.
        return []

    content_inside_brackets = match.group(1)

    # Numeric list: tolerate full-width commas and stray empty items.
    numeric_items = [
        item.strip()
        for item in re.split(r'[,,]', content_inside_brackets)
        if item.strip()
    ]
    if all(item.isdecimal() for item in numeric_items):
        # Also covers the empty case: all([]) is True and yields [].
        return [int(item) for item in numeric_items]

    # Not all numbers: treat every item as a plain string.
    return [
        item.strip()
        for item in content_inside_brackets.split(',')
        if item.strip()
    ]
|
||||||
|
|
||||||
# 查找所有匹配
|
# 测试代码
|
||||||
matches = pattern.findall(text)
|
test_string = "[1,2,哈哈]"
|
||||||
for match in matches:
|
result = process_string_list(test_string)
|
||||||
print(match)
|
print(result) # 现在应该输出: [1, 2, 4, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19, 20, 21, 22]
|
||||||
|
@ -9,8 +9,9 @@ def clean_page_numbers(text):
|
|||||||
# 删除结尾的页码
|
# 删除结尾的页码
|
||||||
cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
|
cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
|
||||||
# 删除形如 /129 的页码
|
# 删除形如 /129 的页码
|
||||||
cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)
|
cleaned_text = re.sub(r'\s*\/\s*\d+\\s*', '', cleaned_text)
|
||||||
return cleaned_text
|
return cleaned_text
|
||||||
|
|
||||||
def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix):
|
def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix):
|
||||||
# 打开PDF文件
|
# 打开PDF文件
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
@ -38,7 +39,6 @@ def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phra
|
|||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
# 确保找到了起始和结束页面
|
# 确保找到了起始和结束页面
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
@ -111,6 +111,14 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
]
|
]
|
||||||
output_suffix = "tobidders_notice"
|
output_suffix = "tobidders_notice"
|
||||||
elif selection==4:
|
elif selection==4:
|
||||||
|
# 配置用于 "资格审查条件" 的正则表达式模式和短语
|
||||||
|
appendix_pattern = r'^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]'
|
||||||
|
pattern = re.compile(appendix_pattern)
|
||||||
|
begin_page = 5
|
||||||
|
end_phrases = [r'评标办法正文', r'评标办法', appendix_pattern]
|
||||||
|
output_suffix = "qualification"
|
||||||
|
elif selection == 5:
|
||||||
|
# 配置用于 "无效标" 的正则表达式模式和短语
|
||||||
pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
|
pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
|
||||||
begin_page = 0
|
begin_page = 0
|
||||||
end_phrases = [
|
end_phrases = [
|
||||||
@ -118,13 +126,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
r':清标报告',# 添加了新的匹配项
|
r':清标报告',# 添加了新的匹配项
|
||||||
r':清标报告'
|
r':清标报告'
|
||||||
]
|
]
|
||||||
output_suffix="invalid"
|
output_suffix = "invalid"
|
||||||
elif selection==5:
|
|
||||||
appendix_pattern = r'^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]'
|
|
||||||
pattern = re.compile(appendix_pattern)
|
|
||||||
begin_page=5
|
|
||||||
end_phrases = [r'评标办法正文', r'评标办法',appendix_pattern]
|
|
||||||
output_suffix="qualification"
|
|
||||||
else:
|
else:
|
||||||
print("无效的选择")
|
print("无效的选择")
|
||||||
return None
|
return None
|
||||||
@ -134,16 +136,15 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
|
|
||||||
def truncate_pdf_multiple(input_path, output_folder):
    """Run ``truncate_pdf_main`` for selections 1 through 4 and collect the results.

    Args:
        input_path: Passed through to ``truncate_pdf_main``.
        output_folder: Passed through to ``truncate_pdf_main``.

    Returns:
        A flat list of everything the individual calls returned, in
        selection order. Calls that return ``None`` or an empty result
        contribute nothing.
    """
    truncate_files = []
    for selection in range(1, 5):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # truncate_pdf_main can return None (it does so for an unrecognised
        # selection); extending with None would raise TypeError.
        if files:
            truncate_files.extend(files)
    return truncate_files
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest5.pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
|
||||||
truncate_pdf_multiple(input_path,output_folder)
|
# truncate_pdf_multiple(input_path,output_folder)
|
||||||
# selection = 5 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-招标公告-合同条款前
|
selection = 5 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件
|
||||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
# print("生成的文件:", generated_files)
|
# print("生成的文件:", generated_files)
|
||||||
|
|
||||||
|
@ -33,7 +33,8 @@ def extract_section(text, start_keyword, end_phrases):
|
|||||||
|
|
||||||
end_index = len(text)
|
end_index = len(text)
|
||||||
for phrase in end_phrases:
|
for phrase in end_phrases:
|
||||||
match = re.search(phrase, text[start_index:])
|
# Use multiline mode with `re.MULTILINE`
|
||||||
|
match = re.search(phrase, text[start_index:], re.MULTILINE) #Hello, world!\nWelcome to OpenAI. 在多行字符串多,要 re.MULTILINE以匹配每一行的开头,否则只会匹配字符串的开头。
|
||||||
if match:
|
if match:
|
||||||
end_index = start_index + match.start()
|
end_index = start_index + match.start()
|
||||||
break
|
break
|
||||||
@ -118,10 +119,8 @@ def convert_to_json(file_path, start_word, end_phrases):
|
|||||||
text = extract_text_from_pdf(file_path)
|
text = extract_text_from_pdf(file_path)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unsupported file format")
|
raise ValueError("Unsupported file format")
|
||||||
|
|
||||||
# 提取从 start_word 开始到 end_phrases 结束的内容
|
# 提取从 start_word 开始到 end_phrases 结束的内容
|
||||||
text = extract_section(text, start_word, end_phrases)
|
text = extract_section(text, start_word, end_phrases)
|
||||||
|
|
||||||
parsed_data = parse_text_by_heading(text)
|
parsed_data = parse_text_by_heading(text)
|
||||||
return parsed_data
|
return parsed_data
|
||||||
|
|
||||||
@ -138,13 +137,13 @@ def convert_clause_to_json(input_path,output_folder):
|
|||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\48e650ff-70eb-48df-874c-66c8abbcd89d\\ztbfile_tobidders_notice.pdf'
|
file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66\\ztbfile_tobidders_notice.pdf'
|
||||||
start_word = "投标人须知正文"
|
start_word = "投标人须知正文"
|
||||||
end_phrases = [
|
end_phrases = [
|
||||||
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
|
||||||
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
|
||||||
]
|
]
|
||||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
|
output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66'
|
||||||
try:
|
try:
|
||||||
output_path = convert_clause_to_json(file_path,output_folder)
|
output_path = convert_clause_to_json(file_path,output_folder)
|
||||||
print(f"Final JSON result saved to: {output_path}")
|
print(f"Final JSON result saved to: {output_path}")
|
||||||
|
File diff suppressed because one or more lines are too long
@ -7,6 +7,7 @@ from flask_app.main.json_utils import combine_json_results, nest_json_under_key
|
|||||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from flask_app.main.禁止投标情形 import find_forbidden
|
from flask_app.main.禁止投标情形 import find_forbidden
|
||||||
|
from 禁止投标情形 import process_string_list
|
||||||
|
|
||||||
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
|
#如果当前段落有序号,则向下匹配直接遇到相同的序号样式
|
||||||
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
|
#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。
|
||||||
@ -85,7 +86,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
if any(exclude in data for exclude in excludes):
|
if any(exclude in data for exclude in excludes):
|
||||||
continue # 如果包含任何排除字符串,跳过这个数据
|
continue # 如果包含任何排除字符串,跳过这个数据
|
||||||
# 去掉开头的序号,包括字母+数字的格式 以及括号+数字
|
# 去掉开头的序号,包括字母+数字的格式 以及括号+数字
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
||||||
data = re.sub(pattern, '', data).strip()
|
data = re.sub(pattern, '', data).strip()
|
||||||
keyword_match = re.search(keywords, data)
|
keyword_match = re.search(keywords, data)
|
||||||
if keyword_match:
|
if keyword_match:
|
||||||
@ -106,14 +107,14 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
|
|||||||
all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表
|
all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表
|
||||||
|
|
||||||
else:
|
else:
|
||||||
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
|
pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
|
||||||
data = re.sub(pattern, '', text_list[0]).strip()
|
data = re.sub(pattern, '', text_list[0]).strip()
|
||||||
# 将修改后的第一个元素和剩余的元素连接起来
|
# 将修改后的第一个元素和剩余的元素连接起来
|
||||||
text_list[0] = data # 更新列表中的第一个元素
|
text_list[0] = data # 更新列表中的第一个元素
|
||||||
joined_text = "\n".join(text_list) # 如果列表中有多个元素,则连接它们
|
joined_text = "\n".join(text_list) # 如果列表中有多个元素,则连接它们
|
||||||
all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中
|
all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中
|
||||||
|
|
||||||
return all_texts1,all_texts2
|
return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果
|
||||||
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
|
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
|
||||||
"""递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。"""
|
"""递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。"""
|
||||||
sentences1 = [] # 保存没有后续关键词的情况
|
sentences1 = [] # 保存没有后续关键词的情况
|
||||||
@ -236,8 +237,9 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
|
|||||||
print("starting qianwen-long...")
|
print("starting qianwen-long...")
|
||||||
qianwen_ans = qianwen_long(file_id, user_query)
|
qianwen_ans = qianwen_long(file_id, user_query)
|
||||||
selected_contents = []
|
selected_contents = []
|
||||||
num_list = json.loads(qianwen_ans)
|
num_list = process_string_list(qianwen_ans)
|
||||||
print(num_list)
|
print(num_list)
|
||||||
|
|
||||||
for index in num_list:
|
for index in num_list:
|
||||||
if index - 1 < len(qianwen_txt):
|
if index - 1 < len(qianwen_txt):
|
||||||
content = qianwen_txt[index - 1] # 转换序号为索引(假设序号从1开始)
|
content = qianwen_txt[index - 1] # 转换序号为索引(假设序号从1开始)
|
||||||
@ -253,14 +255,14 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
|
|||||||
res = {result_key: ""} # Set the response to empty if no contents were extracted
|
res = {result_key: ""} # Set the response to empty if no contents were extracted
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate4):
|
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
|
||||||
print("starting无效标与废标...")
|
print("starting无效标与废标...")
|
||||||
queries = [
|
queries = [
|
||||||
(r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
|
(r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
|
||||||
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。",
|
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
|
||||||
os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
|
os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
|
||||||
(r'废\s*标',
|
(r'废\s*标',
|
||||||
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。",
|
"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
|
||||||
os.path.join(output_dir, "temp2.txt"), "废标项")
|
os.path.join(output_dir, "temp2.txt"), "废标项")
|
||||||
]
|
]
|
||||||
results = []
|
results = []
|
||||||
@ -277,7 +279,9 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
|
|||||||
for future in futures:
|
for future in futures:
|
||||||
results.append(future.result())
|
results.append(future.result())
|
||||||
|
|
||||||
forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate4)
|
#禁止投标
|
||||||
|
print("starting不得存在的情形...")
|
||||||
|
forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
|
||||||
results.append(forbidden_res)
|
results.append(forbidden_res)
|
||||||
|
|
||||||
combined_dict = {}
|
combined_dict = {}
|
||||||
@ -288,15 +292,15 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
|
|||||||
return nest_json_under_key(combined_dict, "无效标与废标项")
|
return nest_json_under_key(combined_dict, "无效标与废标项")
|
||||||
|
|
||||||
|
|
||||||
#TODO:1.运行时间约80s,如果成为短板需要优化多线程 2.没有提取评标办法前附表中的表格 3.提取表格时根据中文的句号分割 4.qianwen-long存在bug
|
#TODO:1.运行时间约80s,如果成为短板需要优化多线程
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
|
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
|
||||||
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
|
clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
|
||||||
truncate4="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
|
truncate3="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
|
||||||
output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
|
output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output"
|
||||||
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'
|
doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest11_invalid.docx'
|
||||||
results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate4)
|
results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate3)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print("Elapsed time:", str(end_time - start_time))
|
print("Elapsed time:", str(end_time - start_time))
|
||||||
print("Results:", results)
|
print("Results:", results)
|
||||||
|
@ -105,10 +105,16 @@ def process_string_list(string_list):
|
|||||||
# 获取匹配的内容,即方括号内的部分
|
# 获取匹配的内容,即方括号内的部分
|
||||||
content_inside_brackets = match.group(1)
|
content_inside_brackets = match.group(1)
|
||||||
if content_inside_brackets: # 检查内容是否为空
|
if content_inside_brackets: # 检查内容是否为空
|
||||||
# 将每个项目用引号包裹,并确保适当的空格和逗号处理
|
# 检查内容是否是数字列表
|
||||||
formatted_list = '[' + ', '.join(f"'{item.strip()}'" for item in content_inside_brackets.split(',') if item.strip()) + ']'
|
if all(item.strip().isdigit() for item in content_inside_brackets.split(',')):
|
||||||
|
# 如果是数字,不用加引号,直接保留数字
|
||||||
|
formatted_list = '[' + ', '.join(item.strip() for item in content_inside_brackets.split(',') if item.strip()) + ']'
|
||||||
|
else:
|
||||||
|
# 如果不全是数字,按字符串处理
|
||||||
|
formatted_list = '[' + ', '.join(f"'{item.strip()}'" for item in content_inside_brackets.split(',') if item.strip()) + ']'
|
||||||
else:
|
else:
|
||||||
return [] # 直接返回空列表如果内容为空
|
return [] # 直接返回空列表如果内容为空
|
||||||
|
|
||||||
# 使用 ast.literal_eval 来解析格式化后的字符串
|
# 使用 ast.literal_eval 来解析格式化后的字符串
|
||||||
try:
|
try:
|
||||||
actual_list = ast.literal_eval(formatted_list)
|
actual_list = ast.literal_eval(formatted_list)
|
||||||
@ -123,18 +129,17 @@ def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须
|
|||||||
# output_filename="merged.pdf"
|
# output_filename="merged.pdf"
|
||||||
# paths=[truncate1,truncate4]
|
# paths=[truncate1,truncate4]
|
||||||
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
|
# merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。
|
||||||
|
|
||||||
file_id=upload_file(truncate4)
|
file_id=upload_file(truncate4)
|
||||||
#user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
|
# user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
|
||||||
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
|
user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
|
||||||
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
|
qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
|
||||||
print(qianwen_forbidden_str)
|
actual_list=process_string_list(qianwen_forbidden_str) #提取出字符串列表 ["xxx","xx"]
|
||||||
actual_list=process_string_list(qianwen_forbidden_str)
|
|
||||||
print(actual_list)
|
|
||||||
|
|
||||||
includes = ["不得存在", "禁止投标"]
|
includes = ["不得存在", "禁止投标"]
|
||||||
forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes)
|
forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes)
|
||||||
processed_results = extract_unique_items_from_texts(forbidden_results)
|
processed_results = extract_unique_items_from_texts(forbidden_results)
|
||||||
print(processed_results)
|
# print(processed_results)
|
||||||
merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
|
merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
|
||||||
forbidden_dict={'不得存在的其他情形':merged_forbidden_list}
|
forbidden_dict={'不得存在的其他情形':merged_forbidden_list}
|
||||||
|
|
||||||
@ -142,9 +147,9 @@ def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
|
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
|
||||||
clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
|
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
|
||||||
truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
|
truncate4 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_qualification.pdf"
|
||||||
output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
|
output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
|
||||||
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'
|
doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.docx'
|
||||||
find_forbidden(truncate_json_path,clause_path,truncate4)
|
find_forbidden(truncate_json_path,clause_path,truncate4)
|
@ -8,7 +8,7 @@ from flask_app.main.资格评审 import process_qualification
|
|||||||
from flask_app.main.通义千问long import upload_file, qianwen_long
|
from flask_app.main.通义千问long import upload_file, qianwen_long
|
||||||
|
|
||||||
|
|
||||||
def combine_review_standards(truncate1,truncate4,knowledge_name,truncate0_jsonpath,clause_path): #评标办法前附表
|
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path): #评标办法前附表
|
||||||
# 形式评审、响应评审:千问
|
# 形式评审、响应评审:千问
|
||||||
print("starting形式响应评审...")
|
print("starting形式响应评审...")
|
||||||
file_id=upload_file(truncate1) #评标办法前附表
|
file_id=upload_file(truncate1) #评标办法前附表
|
||||||
@ -16,7 +16,7 @@ def combine_review_standards(truncate1,truncate4,knowledge_name,truncate0_jsonpa
|
|||||||
results = qianwen_long(file_id, user_query_1)
|
results = qianwen_long(file_id, user_query_1)
|
||||||
original_dict_data = extract_content_from_json(results)
|
original_dict_data = extract_content_from_json(results)
|
||||||
qualification_review = original_dict_data.pop('资格评审标准', '默认值或None')
|
qualification_review = original_dict_data.pop('资格评审标准', '默认值或None')
|
||||||
final_qualify_json=process_qualification(qualification_review,truncate4,knowledge_name)
|
final_qualify_json=process_qualification(qualification_review,truncate3,knowledge_name)
|
||||||
form_response_dict=process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_path)
|
form_response_dict=process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_path)
|
||||||
print("形式响应评审done")
|
print("形式响应评审done")
|
||||||
form_response_dict.update(final_qualify_json)
|
form_response_dict.update(final_qualify_json)
|
||||||
|
0
flask_app/main/转化格式/pydocx_p2d.py
Normal file
0
flask_app/main/转化格式/pydocx_p2d.py
Normal file
31
flask_app/货物标/test.py
Normal file
31
flask_app/货物标/test.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from docx import Document
|
||||||
|
from docxcompose.composer import Composer
|
||||||
|
|
||||||
|
|
||||||
|
def combine_docx(master, sub, index):
    """Append the document at *sub* to the document at *master*.

    When *master* already exists, a page break is added to it and *sub* is
    appended after that. When *master* does not exist, *sub* is simply saved
    under the *master* path.

    Args:
        master: Path of the target .docx file (may not exist yet).
        sub: Path of the .docx file to merge in; must be an existing file.
        index: Currently unused; kept so existing callers keep working.
            NOTE(review): the caller's comment suggests an insert position
            was intended -- confirm before wiring it up.

    Returns:
        ``True`` after the merged document has been saved, ``False`` when
        *sub* is not an existing file or either path lacks a ``.docx``
        extension (compared case-insensitively).
    """
    # The file to merge must exist and be a regular file, not a directory.
    if not os.path.isfile(sub):
        return False

    # Both paths must be .docx; the master itself is allowed to be missing.
    if not all(path.lower().endswith('.docx') for path in (master, sub)):
        return False

    if os.path.exists(master):
        doc_master = Document(master)
        doc_master.add_page_break()
        cp = Composer(doc_master)
        cp.append(Document(sub))
    else:
        # No master yet: the sub document becomes the master.
        doc_master = Document(sub)

    doc_master.save(master)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual run: merge the sample sub-document into the sample master.
    combine_docx(
        'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\test.docx',
        'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile.docx',
        2,  # intended insert position (second element)
    )
|
Loading…
x
Reference in New Issue
Block a user