This commit is contained in:
zy123 2024-09-03 09:36:18 +08:00
parent 1d0211ce72
commit ee16d5e4b4
11 changed files with 194 additions and 76 deletions

View File

@@ -0,0 +1,50 @@
from docx import Document
import re
import os

def copy_docx(source_path):
    doc = Document(source_path)  # open the source document
    output_folder = os.path.dirname(source_path)
    # Take the original file name and append a suffix
    original_file_name = os.path.basename(source_path)
    file_name_without_ext, file_ext = os.path.splitext(original_file_name)
    modified_file_name = file_name_without_ext + "_invalid" + file_ext
    destination_path = os.path.join(output_folder, modified_file_name)
    new_doc = Document()  # create the new document
    # Regular expressions marking the start and end of the section to copy
    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
    end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|:清标报告|:清标报告')
    # Find the position of the last paragraph that matches begin_pattern
    last_begin_index = -1
    for i, paragraph in enumerate(doc.paragraphs):
        if begin_pattern.search(paragraph.text):
            last_begin_index = i
    # Copy from the last begin_pattern match until end_pattern is hit
    if last_begin_index != -1:
        for i, paragraph in enumerate(doc.paragraphs[last_begin_index:], start=last_begin_index):
            new_para = new_doc.add_paragraph(style=paragraph.style)
            for run in paragraph.runs:
                new_run = new_para.add_run(run.text)
                new_run.bold = run.bold
                new_run.italic = run.italic
                new_run.underline = run.underline
                if run.font.color and run.font.color.rgb is not None:  # rgb can be None even when a color object exists
                    new_run.font.color.rgb = run.font.color.rgb
                new_run.font.size = run.font.size
            if end_pattern.search(paragraph.text):
                break
    new_doc.save(destination_path)  # save the new document

# Invoke the function
if __name__ == '__main__':
    source_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest13.docx"
    copy_docx(source_path)
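A quick way to sanity-check the chapter-heading patterns used above (a minimal sketch; the sample headings are made up for illustration):

import re

begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|:清标报告|:清标报告')

# Hypothetical headings, only to show what the patterns do and do not match
assert begin_pattern.search("第一章 招标公告")
assert begin_pattern.search("第一卷")
assert end_pattern.search("第五章 合同条款及格式")
assert not end_pattern.search("第二章 投标人须知")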

View File

@@ -58,7 +58,7 @@ def docx2pdf(local_path_in):
if __name__ == '__main__':
    # Replace with your own file path and API URL
    local_path_in="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest16_invalid.docx"
    local_path_in="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\test111.pdf"
    # pdf2docx(local_path_in)
    downloaded_file=docx2pdf(local_path_in)
    print(downloaded_file)

View File

@@ -1,18 +1,35 @@
import re
import ast
# Regular expression
pattern = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:项目|服务|商务).*?要求')
def process_string_list(string_list):
    # Use a regular expression to grab the content inside the square brackets
    match = re.search(r'\[(.*?)\]', string_list)
    if match:
        # The matched content, i.e. the part inside the brackets
        content_inside_brackets = match.group(1)
        if content_inside_brackets:  # check whether the content is empty
            # Check whether the content is a list of numbers
            if all(item.strip().isdigit() for item in content_inside_brackets.split(',')):
                # All numeric: keep the numbers without quoting them
                formatted_list = '[' + ', '.join(item.strip() for item in content_inside_brackets.split(',') if item.strip()) + ']'
            else:
                # Not all numeric: treat the items as strings
                formatted_list = '[' + ', '.join(f"'{item.strip()}'" for item in content_inside_brackets.split(',') if item.strip()) + ']'
        else:
            return []  # return an empty list directly if the content is empty
# Sample text for testing
text = """
第一章项目技术服务及商务要求
第二章 服务细节要求
第三章 商务处理要求
第四章 项目安排要求
第五章 安全要求
"""
        # Use ast.literal_eval to parse the formatted string
        try:
            actual_list = ast.literal_eval(formatted_list)
            return actual_list
        except SyntaxError as e:
            print(f"Error parsing list: {e}")
            return []
    else:
        # Nothing matched: return an empty list
        return []
# Find all matches
matches = pattern.findall(text)
for match in matches:
    print(match)
# Test code
test_string = "[1,2,哈哈]"
result = process_string_list(test_string)
print(result)  # expected output: ['1', '2', '哈哈']

View File

@@ -9,8 +9,9 @@ def clean_page_numbers(text):
    # Strip the page number at the end
    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
    # Strip page numbers of the form /129
    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)
    cleaned_text = re.sub(r'\s*\/\s*\d+\\s*', '', cleaned_text)
    return cleaned_text

def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix):
    # Open the PDF file
    pdf_document = PdfReader(pdf_path)
@@ -38,7 +39,6 @@ def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phra
            end_page = i
            break
    # Make sure both the start and end pages were found
    if start_page is None or end_page is None:
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
@@ -111,6 +111,14 @@ def truncate_pdf_main(input_path, output_folder, selection):
        ]
        output_suffix = "tobidders_notice"
    elif selection==4:
        # Regex pattern and end phrases for the "qualification review" section
        appendix_pattern = r'^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]'
        pattern = re.compile(appendix_pattern)
        begin_page = 5
        end_phrases = [r'评标办法正文', r'评标办法', appendix_pattern]
        output_suffix = "qualification"
    elif selection == 5:
        # Regex pattern and end phrases for the "invalid bid" section
        pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:')
        begin_page = 0
        end_phrases = [
@@ -119,12 +127,6 @@ def truncate_pdf_main(input_path, output_folder, selection):
            r':清标报告'
        ]
        output_suffix = "invalid"
    elif selection==5:
        appendix_pattern = r'^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]'
        pattern = re.compile(appendix_pattern)
        begin_page=5
        end_phrases = [r'评标办法正文', r'评标办法',appendix_pattern]
        output_suffix="qualification"
    else:
        print("无效的选择")
        return None
@@ -134,16 +136,15 @@ def truncate_pdf_main(input_path, output_folder, selection):
def truncate_pdf_multiple(input_path, output_folder):
    truncate_files = []
    for selection in range(1, 6):
    for selection in range(1, 5):
        files = truncate_pdf_main(input_path, output_folder, selection)
        truncate_files.extend(files)
    return truncate_files

if __name__ == "__main__":
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest5.pdf"
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test"
    truncate_pdf_multiple(input_path,output_folder)
    # selection = 5  # e.g. 1 - Instructions-to-Bidders data sheet, 2 - evaluation method, 3 - Instructions-to-Bidders main text, 4 - tender announcement up to the contract clauses
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # truncate_pdf_multiple(input_path,output_folder)
    selection = 5  # e.g. 1 - Instructions-to-Bidders data sheet, 2 - evaluation method, 3 - Instructions-to-Bidders main text, 4 - qualification review conditions
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("生成的文件:", generated_files)

View File

@@ -33,7 +33,8 @@ def extract_section(text, start_keyword, end_phrases):
    end_index = len(text)
    for phrase in end_phrases:
        match = re.search(phrase, text[start_index:])
        # Use multiline mode with `re.MULTILINE`
        match = re.search(phrase, text[start_index:], re.MULTILINE)  # in a multi-line string such as "Hello, world!\nWelcome to OpenAI.", re.MULTILINE is needed so that ^ anchors at the start of every line rather than only the start of the whole string
        if match:
            end_index = start_index + match.start()
            break
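A small illustration of why re.MULTILINE matters here (a minimal sketch; the sample text is made up, the pattern is one of the end phrases used below):

import re

sample = "投标人须知正文\n第三章 评标办法\n..."
# Without re.MULTILINE, ^ only anchors at the very start of the string
print(re.search(r'^第[一二三四五六七八九十]+章\s+评标办法', sample))                # None
# With re.MULTILINE, ^ also anchors at the start of every line
print(re.search(r'^第[一二三四五六七八九十]+章\s+评标办法', sample, re.MULTILINE))  # matches "第三章 评标办法"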
@@ -118,10 +119,8 @@ def convert_to_json(file_path, start_word, end_phrases):
        text = extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file format")
    # Extract the content from start_word up to the first of the end_phrases
    text = extract_section(text, start_word, end_phrases)
    parsed_data = parse_text_by_heading(text)
    return parsed_data
@@ -138,13 +137,13 @@ def convert_clause_to_json(input_path,output_folder):
    return output_path

if __name__ == "__main__":
    file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\48e650ff-70eb-48df-874c-66c8abbcd89d\\ztbfile_tobidders_notice.pdf'
    file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66\\ztbfile_tobidders_notice.pdf'
    start_word = "投标人须知正文"
    end_phrases = [
        r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:',
        r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:',
    ]
    output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp6\\713022ff-27d9-43e3-9cc9-2752effbfd66'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)
        print(f"Final JSON result saved to: {output_path}")

File diff suppressed because one or more lines are too long

View File

@@ -7,6 +7,7 @@ from flask_app.main.json_utils import combine_json_results, nest_json_under_key
from flask_app.main.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor
from flask_app.main.禁止投标情形 import find_forbidden
from 禁止投标情形 import process_string_list

# If the current paragraph has a numbering prefix, keep matching downwards until a paragraph with the same numbering style is reached.
# If the current paragraph has no numbering prefix, look downwards for numbered paragraphs and collect all consecutive items of the same kind.
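One possible way to decide whether two paragraphs share "the same numbering style" (a minimal sketch; the helper name and regexes are illustrative only, not the code used below):

import re

# Illustrative helper: classify a paragraph's leading numbering,
# e.g. "1.2 ..." -> ('dotted', 2), "(3) ..." -> ('paren', 1), no prefix -> None.
def numbering_style(text):
    m = re.match(r'^\s*(\d+(?:\.\d+)*)[\s.、]', text)
    if m:
        return ('dotted', m.group(1).count('.') + 1)
    if re.match(r'^\s*[(]\d+[)]', text):
        return ('paren', 1)
    return None

print(numbering_style("1.1 投标文件的组成"))  # ('dotted', 2)
print(numbering_style("(3) 其他情形"))        # ('paren', 1)
print(numbering_style("投标人应当..."))        # None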
@@ -85,7 +86,7 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
            if any(exclude in data for exclude in excludes):
                continue  # skip this entry if it contains any of the excluded strings
            # Strip the leading numbering, including letter+digit formats and parenthesised digits
            pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
            pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
            data = re.sub(pattern, '', data).strip()
            keyword_match = re.search(keywords, data)
            if keyword_match:
@@ -106,14 +107,14 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达
                all_texts1.append(cleaned_text)  # append the processed text to the result list
            else:
                pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)'
                pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
                data = re.sub(pattern, '', text_list[0]).strip()
                # Join the modified first element with the remaining elements
                text_list[0] = data  # update the first element of the list
                joined_text = "\n".join(text_list)  # join the elements if the list has more than one
                all_texts2.append(joined_text)  # append each list's content to all_texts2
    return all_texts1,all_texts2
    return all_texts1,all_texts2  # all_texts1 still needs the LLM; all_texts2 is returned as-is
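The widened numbering pattern above now tolerates spaces around the dots; a minimal check (the sample prefixes are made up):

import re

pattern = r'^\s*([(]\d+[)]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)'
# Made-up examples of leading numbering that should be stripped
for s in ["1.1 投标保证金", "3 . 2 废标条款", "(5)串通投标", "A2、资格条件"]:
    print(re.sub(pattern, '', s).strip())
# -> 投标保证金 / 废标条款 / 串通投标 / 资格条件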
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
    """Recursively find sentences containing the keywords and store them in two lists, depending on whether follow-up keywords are present."""
    sentences1 = []  # holds the cases without follow-up keywords
@@ -236,8 +237,9 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
    print("starting qianwen-long...")
    qianwen_ans = qianwen_long(file_id, user_query)
    selected_contents = []
    num_list = json.loads(qianwen_ans)
    num_list = process_string_list(qianwen_ans)
    print(num_list)
    for index in num_list:
        if index - 1 < len(qianwen_txt):
            content = qianwen_txt[index - 1]  # convert the number into an index, assuming numbering starts at 1
@@ -253,14 +255,14 @@ def handle_query(file_path, user_query, output_file, result_key, keywords, trunc
        res = {result_key: ""}  # Set the response to empty if no contents were extracted
    return res

def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate4):
def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate3):
    print("starting无效标与废标...")
    queries = [
        (r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号",
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号,若情况不存在,返回[]",
         os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"),
        (r'废\s*标',
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号",
         "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果x为符合的信息的序号,若情况不存在,返回[]",
         os.path.join(output_dir, "temp2.txt"), "废标项")
    ]
    results = []
@@ -277,7 +279,9 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
        for future in futures:
            results.append(future.result())
    forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate4)
    # Prohibited-bidding situations
    print("starting不得存在的情形...")
    forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
    results.append(forbidden_res)
    combined_dict = {}
@@ -288,15 +292,15 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
    return nest_json_under_key(combined_dict, "无效标与废标项")

#TODO: 1. Runtime is about 80s; optimise the threading if it becomes the bottleneck  2. Tables in the data sheet preceding the evaluation method are not extracted  3. Table extraction splits on the Chinese full stop  4. qianwen-long has a bug
#TODO: 1. Runtime is about 80s; optimise the threading if it becomes the bottleneck
if __name__ == '__main__':
    start_time = time.time()
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate4="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'
    results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate4)
    truncate3="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output\\zbtest11_invalid.docx'
    results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate3)
    end_time = time.time()
    print("Elapsed time:", str(end_time - start_time))
    print("Results:", results)

View File

@@ -105,10 +105,16 @@ def process_string_list(string_list):
        # The matched content, i.e. the part inside the brackets
        content_inside_brackets = match.group(1)
        if content_inside_brackets:  # check whether the content is empty
            # Wrap each item in quotes and normalise spaces and commas
            # Check whether the content is a list of numbers
            if all(item.strip().isdigit() for item in content_inside_brackets.split(',')):
                # All numeric: keep the numbers without quoting them
                formatted_list = '[' + ', '.join(item.strip() for item in content_inside_brackets.split(',') if item.strip()) + ']'
            else:
                # Not all numeric: treat the items as strings
                formatted_list = '[' + ', '.join(f"'{item.strip()}'" for item in content_inside_brackets.split(',') if item.strip()) + ']'
        else:
            return []  # return an empty list directly if the content is empty
        # Use ast.literal_eval to parse the formatted string
        try:
            actual_list = ast.literal_eval(formatted_list)
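With this change the helper keeps purely numeric answers as integers and quotes everything else before handing the string to ast.literal_eval. Expected behaviour (a minimal sketch; the sample inputs are made up):

# Assuming process_string_list as defined above
print(process_string_list("[1,2,3]"))        # [1, 2, 3]
print(process_string_list("[1,2,哈哈]"))      # ['1', '2', '哈哈']
print(process_string_list("[]"))             # []
print(process_string_list("无符合项"))         # []  (no brackets at all)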
@@ -123,18 +129,17 @@ def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须
    # output_filename="merged.pdf"
    # paths=[truncate1,truncate4]
    # merged_filepath=merge_pdfs(paths,output_filename)  # temporarily deprecated; the items in the scoring data sheet are already captured under '否决投标'
    file_id=upload_file(truncate4)
    # user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些请按json列表格式给我提供信息键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。"
    user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
    qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
    print(qianwen_forbidden_str)
    actual_list=process_string_list(qianwen_forbidden_str)
    print(actual_list)
    actual_list=process_string_list(qianwen_forbidden_str)  # extract the list of strings, e.g. ["xxx","xx"]
    includes = ["不得存在", "禁止投标"]
    forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes)
    processed_results = extract_unique_items_from_texts(forbidden_results)
    print(processed_results)
    # print(processed_results)
    merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
    forbidden_dict={'不得存在的其他情形':merged_forbidden_list}
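list(dict.fromkeys(...)) above merges the two lists while dropping duplicates and keeping the original order; a one-line illustration with made-up items:

a = ["围标串标", "提供虚假材料"]
b = ["提供虚假材料", "被列入失信名单"]
print(list(dict.fromkeys(a + b)))  # ['围标串标', '提供虚假材料', '被列入失信名单']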
@@ -142,9 +147,9 @@ def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须
if __name__ == '__main__':
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx'
    truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
    truncate4 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.docx'
    find_forbidden(truncate_json_path,clause_path,truncate4)

View File

@@ -8,7 +8,7 @@ from flask_app.main.资格评审 import process_qualification
from flask_app.main.通义千问long import upload_file, qianwen_long

def combine_review_standards(truncate1,truncate4,knowledge_name,truncate0_jsonpath,clause_path):  # data sheet preceding the evaluation method
def combine_review_standards(truncate1,truncate3,knowledge_name,truncate0_jsonpath,clause_path):  # data sheet preceding the evaluation method
    # Formal review and responsiveness review: Qianwen
    print("starting形式响应评审...")
    file_id=upload_file(truncate1)  # data sheet preceding the evaluation method
@@ -16,7 +16,7 @@ def combine_review_standards(truncate1,truncate4,knowledge_name,truncate0_jsonpa
    results = qianwen_long(file_id, user_query_1)
    original_dict_data = extract_content_from_json(results)
    qualification_review = original_dict_data.pop('资格评审标准', '默认值或None')
    final_qualify_json=process_qualification(qualification_review,truncate4,knowledge_name)
    final_qualify_json=process_qualification(qualification_review,truncate3,knowledge_name)
    form_response_dict=process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_path)
    print("形式响应评审done")
    form_response_dict.update(final_qualify_json)

View File

@@ -0,0 +1,31 @@
import os
from docx import Document
from docxcompose.composer import Composer

def combine_docx(master, sub, index):
    if not os.path.exists(sub):  # the file to be merged in must exist
        return False
    if not master.endswith('.docx') or not sub.endswith('.docx'):  # the master must be .docx; it may not exist yet
        return False
    if os.path.exists(master):
        doc_master = Document(master)
        doc_master.add_page_break()
        cp = Composer(doc_master)
        cp.append(Document(sub))
    else:
        # If master does not exist, sub simply becomes master
        doc_master = Document(sub)
    doc_master.save(master)
    return True

if __name__ == '__main__':
    master = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\test.docx'
    sub = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile.docx'
    index = 2  # e.g. assume you want to insert at the position of the second element
    combine_docx(master, sub, index)
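A possible way to chain the helper above to merge several documents into one (a minimal sketch; the file names below are hypothetical):

# Hypothetical list of parts to merge, in order
parts = [
    'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\part1.docx',
    'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\part2.docx',
]
master = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\merged.docx'
for i, part in enumerate(parts):
    combine_docx(master, part, i)  # the index argument is currently unused by combine_docx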