commit 79834efb5e
parent 010eff9405

    9.13
@@ -2,8 +2,6 @@ from docx import Document
 import re
 import os

-from flask import g
-

 def copy_docx(source_path):
     doc = Document(source_path)  # 打开源文档
@@ -45,7 +43,7 @@ def copy_docx(source_path):
             break

     new_doc.save(destination_path)  # 保存新文档
-    g.logger.info("docx截取docx成功!")
+    print("docx截取docx成功!")


 # 调用函数

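Nearly every hunk in this commit makes the same swap: request-scoped `g.logger.*` calls become plain `print`, and `from flask import g` is dropped from the helpers. The commit message does not say why, but a plausible motivation is that `flask.g` is a context-local proxy and only works inside an active application/request context; calling these helpers from a script, test, or plain worker raises RuntimeError. A minimal sketch of the failure mode, assuming (as the `@app.route` handler's `logger = g.logger` suggests) that `create_logger()` attaches a logger to `g` per request:

from flask import Flask, g
import logging

logging.basicConfig(level=logging.INFO)
app = Flask(__name__)

def helper():
    # Mirrors the helpers in this repo: logs through the request-bound proxy.
    g.logger.info("logging via flask.g works here")

try:
    helper()  # no active app/request context
except RuntimeError as e:
    print(f"expected failure: {e}")  # "Working outside of application context."

with app.app_context():
    g.logger = logging.getLogger("request")  # roughly what create_logger() does
    helper()  # succeeds inside the context

Switching to `print` removes that constraint at the cost of per-request log separation, which the main-processing module below compensates for with a module-level logger.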
@@ -1,7 +1,6 @@
 import requests
 import mimetypes

-from flask import g


 def download_file(url, local_filename):
@@ -29,13 +28,13 @@ def download_file(url, local_filename):
             else:
                 return full_filename,3
     except requests.HTTPError as e:
-        g.logger.error(f"download: HTTP Error: {e}")
+        print(f"download: HTTP Error: {e}")
         return None
     except requests.RequestException as e:
-        g.logger.error(f"download: Error downloading the file: {e}")
+        print(f"download: Error downloading the file: {e}")
         return None
     except Exception as e:
-        g.logger.error(f"download: An error occurred: {e}")
+        print(f"download: An error occurred: {e}")
        return None

 if __name__ == '__main__':

@@ -1,7 +1,6 @@
 import json
 import os
 import requests
-from flask import g

 from flask_app.main.download import download_file

@@ -21,14 +20,14 @@ def upload_file(file_path, url):

     # 检查响应状态码
     if response.status_code == 200:
-        g.logger.info("format_change 文件上传成功")
+        print("format_change 文件上传成功")
         receive_file_response = response.content.decode('utf-8')
         receive_file_json = json.loads(receive_file_response)
         receive_file_url = receive_file_json["data"]

     else:
-        g.logger.info(f"format_change 文件上传失败,状态码: {response.status_code}")
-        g.logger.info(f"format_change {response.text}")
+        print(f"format_change 文件上传失败,状态码: {response.status_code}")
+        print(f"format_change {response.text}")

     return receive_file_url

@@ -46,7 +45,7 @@ def pdf2docx(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in)  #输入输出在同一个文件夹
     local_path_out=os.path.join(folder,filename)  #输出文件名
     downloaded_filepath,file_type=download_file(receive_download_url, local_path_out)
-    g.logger.info("format_change p2d:have downloaded file to:",downloaded_filepath)
+    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
     return downloaded_filepath

 def docx2pdf(local_path_in):
@@ -55,7 +54,7 @@ def docx2pdf(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
     local_path_out = os.path.join(folder, filename)  # 输出文件名
     downloaded_filepath,file_type = download_file(receive_download_url, local_path_out)
-    g.logger.info("format_change d2p:have downloaded file to:", downloaded_filepath)
+    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
     return downloaded_filepath

 if __name__ == '__main__':

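Worth noting: the two removed lines in this file were buggy independently of the flask.g issue. `logger.info("msg:", downloaded_filepath)` passes the path as a %-formatting argument, but the message has no `%s` placeholder, so formatting fails inside the logging machinery and the path never appears in the log. The replacement f-strings fix that. A small sketch of the distinction (the path is a made-up value):

import logging

logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)
path = "/tmp/out.docx"  # hypothetical

log.info("downloaded file to: %s", path)  # lazy %-style args, the logging-idiomatic form
log.info(f"downloaded file to: {path}")   # eager f-string, equivalent to what print() now does
# log.info("downloaded file to:", path)   # broken: no %s placeholder, arg is never rendered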
@@ -1,9 +1,6 @@
 import json
 import re

-from flask import g
-
-
 def extract_content_from_json(json_data):
     """提取 { 和 } 之间的内容,并将其解析为字典"""
     if not json_data.strip():
@@ -14,10 +11,10 @@ def extract_content_from_json(json_data):
             json_data = match.group(0)
             return json.loads(json_data)  #返回字典
         except json.JSONDecodeError as e:
-            g.logger.info(f"json_utils: extract_content_from_json: JSON decode error: {e}")
+            print(f"json_utils: extract_content_from_json: JSON decode error: {e}")
             return {}
     else:
-        g.logger.info("json_utils: extract_content_from_json: No valid JSON content found.")
+        print("json_utils: extract_content_from_json: No valid JSON content found.")
         return {}

 def clean_json_string(json_string):
@@ -66,18 +63,18 @@ def add_keys_to_json(target_dict, source_dict):
         dict: 更新后的字典。
     """
     if not target_dict:
-        g.logger.error("json_utils: Error: Target dictionary is empty.")
+        print("json_utils: Error: Target dictionary is empty.")
         return {}

     if len(target_dict) != 1:
-        g.logger.error("json_utils: Error: Target dictionary must contain exactly one top-level key.")
+        print("json_utils: Error: Target dictionary must contain exactly one top-level key.")
         return target_dict

     # 获取唯一的外层键
     target_key, existing_dict = next(iter(target_dict.items()))

     if not isinstance(existing_dict, dict):
-        g.logger.error(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
+        print(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
         return target_dict

     # 合并字典
@@ -95,7 +92,7 @@ def rename_outer_key(original_data,new_key):

     # 提取原始数据中的唯一外层值(假设只有一个外层键)
     if not original_data or not isinstance(original_data, dict):
-        g.logger.error("json_utils: Error: Invalid input or input is not a dictionary.")  # 如果输入无效或不是字典,则返回空字典
+        print("json_utils: Error: Invalid input or input is not a dictionary.")  # 如果输入无效或不是字典,则返回空字典
         return {}

     # 使用 next(iter(...)) 提取第一个键的值

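The context lines above show the pattern `extract_content_from_json` uses to salvage JSON from an LLM reply: regex out the `{...}` span, then `json.loads` it with a safe fallback. A simplified standalone sketch of the same idea (function name and sample text are illustrative, not the project's exact implementation):

import json
import re

def extract_json(text: str) -> dict:
    # Grab the outermost {...} span (greedy, across newlines) and try to parse it.
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if not match:
        return {}
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return {}

print(extract_json('the model said: {"采购需求": {"服务器": "未知"}} thanks'))
# -> {'采购需求': {'服务器': '未知'}}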
@@ -1,6 +1,5 @@
 import logging
 import shutil
-import sys
 import time
 import uuid
 from datetime import datetime, timedelta
@@ -40,7 +39,8 @@ def before_request():
 def create_logger():
     unique_id = str(uuid.uuid4())
     g.unique_id = unique_id
-    output_folder = f"flask_app/static/output/{unique_id}"
+    # output_folder = f"flask_app/static/output/{unique_id}"
+    output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}"
     os.makedirs(output_folder, exist_ok=True)
     log_filename = "log.txt"
     log_path = os.path.join(output_folder, log_filename)
@@ -59,11 +59,12 @@ def create_logger():

 @app.route('/upload', methods=['POST'])
 def zbparse():
+    logger=g.logger
     file_url = validate_request()
     if isinstance(file_url, tuple):  # Check if the returned value is an error response
         return file_url
     try:
-        app.logger.info("starting parsing url:" + file_url)
+        logger.info("starting parsing url:" + file_url)
         final_json_path, output_folder = download_and_process_file(file_url)
         if not final_json_path:
             return jsonify({'error': 'File processing failed'}), 500
@@ -71,7 +72,7 @@ def zbparse():
         # remove_directory(output_folder)  # 然后删除文件夹
         return response  # 最后返回获取的响应
     except Exception as e:
-        app.logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
+        logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
         return jsonify({'error': str(e)}), 500


@@ -138,7 +139,8 @@ def validate_request():
 def download_and_process_file(file_url):
     logger = g.logger
     unique_id = g.unique_id
-    output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
+    # output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
+    output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}"
     filename = "ztbfile"
     downloaded_filename = os.path.join(output_folder, filename)

@@ -3,7 +3,6 @@ import os
 from docx import Document
 import json

-from flask import g


 def read_tables_from_docx(file_path):
@@ -89,13 +88,13 @@ def save_data_to_json(data, output_folder):
     """将数据保存到JSON文件中."""
     with open(output_filepath, 'w', encoding='utf-8') as file:
         json.dump(data, file, ensure_ascii=False, indent=4)
-    g.logger.info(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
+    print(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
     return output_filepath


 def extract_tables_main(path, output_folder):
     if not os.path.exists(path):
-        g.logger.error(f"table_content_extraction: The specified file does not exist: {path}")
+        print(f"table_content_extraction: The specified file does not exist: {path}")
         return ""
     # 读取文档表格数据
     table_data = read_tables_from_docx(path)

@@ -1,7 +1,5 @@
 import json

-from flask import g
-
 from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results
 from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
 from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
@@ -32,9 +30,6 @@ def combine_basic_info(baseinfo_list):
     # 根据检测到的键动态调整 key_groups
     dynamic_key_handling(key_groups, relevant_keys_detected)

-    # 打印 key_groups 的内容检查它们是否被正确更新
-    # g.logger.info("Updated key_groups after dynamic handling:")
-
     # 使用合并后的字典创建最终输出
     for group_name, keys in key_groups.items():
@@ -82,8 +77,7 @@ def judge_consortium_bidding(baseinfo_list):
 def project_basic_info(knowledge_name,truncate0,output_folder,clause_path):  #投标人须知前附表
     # 调用大模型回答项目基础信息
     baseinfo_list = []
-    # baseinfo_file_path='../static/提示词/前两章提问总结.txt'
-    baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'  # 替换为你的txt文件路径
+    baseinfo_file_path='flask_app/static/提示词/前两章提问总结.txt'
     questions = read_questions_from_file(baseinfo_file_path)
     res1 = multi_threading(questions, knowledge_name)
     for _, response in res1:  # _占位,代表ques;response[0]也是ques;response[1]是ans
@@ -91,13 +85,12 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path):  #投标人须知前附表
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 baseinfo_list.append(response[1])
             else:
-                g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}")
+            print(f"基础信息整合: Error processing response for query index {_}: {e}")
     # 判断是否分包、是否需要递交投标保证金等
     chosen_numbers, merged = judge_whether_main(truncate0,output_folder)
     baseinfo_list.append(merged)
-    # judge_file_path = '../static/提示词/是否相关问题.txt'
     judge_file_path ='flask_app/static/提示词/是否相关问题.txt'
     judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)

@@ -109,7 +102,7 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path):  #投标人须知前附表
     file_id=upload_file(truncate0)
     res2 = multi_threading(judge_questions, "",file_id,2)  #调用千问-long
     if not res2:
-        g.logger.error("基础信息整合: multi_threading errror!")
+        print("基础信息整合: multi_threading errror!")
     else:
         # 打印结果
         for question, response in res2:

@@ -6,7 +6,6 @@ import concurrent.futures
 import time

 from dashscope import Assistants, Messages, Runs, Threads
-from flask import g
 from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
 from flask_app.main.通义千问long import qianwen_long, upload_file
 prompt = """
@@ -118,10 +117,10 @@ def pure_assistant():

 def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type):
     if llm_type==1:
-        g.logger.info(f"rag_assistant! question:{question}")
+        print(f"rag_assistant! question:{question}")
         assistant = rag_assistant(knowledge_name)
     elif llm_type==2:
-        g.logger.info(f"qianwen_long! question:{question}")
+        print(f"qianwen_long! question:{question}")
         qianwen_res = qianwen_long(file_id,question)
         result_queue.put((ans_index,(question,qianwen_res)))
         return
@@ -131,7 +130,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type):
     result_queue.put((ans_index, (question, ans)))  # 在队列中添加索引 (question, ans)

 def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
-    g.logger.info("多线程提问:starting multi_threading...")
+    print("多线程提问:starting multi_threading...")
     result_queue = queue.Queue()

     # 使用 ThreadPoolExecutor 管理线程
@@ -150,7 +149,7 @@ def multi_threading(queries, knowledge_name="", file_id="",llm_type=1):
         try:
             future.result()  # 可以用来捕获异常或确认任务完成
         except Exception as exc:
-            g.logger.error(f"Query {index} generated an exception: {exc}")
+            print(f"Query {index} generated an exception: {exc}")

     # 从队列中获取所有结果并按索引排序
     results = [None] * len(queries)

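The context lines in this file show the ordering technique `multi_threading` relies on: each worker tags its answer with the submission index before putting it on a shared queue, and the caller rebuilds an ordered list afterwards. A self-contained sketch of just that mechanism (the `worker` body stands in for the real LLM dispatch, which routes to rag_assistant or qianwen_long by llm_type):

import queue
from concurrent.futures import ThreadPoolExecutor

def worker(q: queue.Queue, idx: int, question: str) -> None:
    answer = question.upper()         # stand-in for the LLM call
    q.put((idx, (question, answer)))  # tag the result with its index

def ask_all(questions):
    q = queue.Queue()
    with ThreadPoolExecutor(max_workers=4) as ex:
        for i, question in enumerate(questions):
            ex.submit(worker, q, i, question)
    # the with-block waits for all workers; drain and restore order
    results = [None] * len(questions)
    while not q.empty():
        i, pair = q.get()
        results[i] = pair
    return results

print(ask_all(["q1", "q2", "q3"]))  # results come back in submission order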
@@ -3,8 +3,6 @@ import re
 import json
 import time

-from flask import g
-
 from flask_app.main.多线程提问 import multi_threading
 from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
 from flask_app.main.json_utils import extract_content_from_json
@@ -189,9 +187,9 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause
                 temp = extract_content_from_json(response[1])
                 first_response_list.append(temp)
             else:
-                g.logger.error(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.")
+                print(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"形式响应评审:Error processing response for query index {_}: {e}")
+            print(f"形式响应评审:Error processing response for query index {_}: {e}")

     # Assume JSON file paths are defined or configured correctly
     # print(entries_with_numbers)  #[{'形式评审标准.多标段投标': '3.7.4(5)'}]

@@ -2,43 +2,47 @@ from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作

-from flask import g
+def clean_page_content(text, common_header):
+    # 首先删除抬头公共部分
+    if common_header:  # 确保有公共抬头才进行替换
+        for header_line in common_header.split('\n'):
+            if header_line.strip():  # 只处理非空行
+                # 替换首次出现的完整行
+                text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
+
+    # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
+    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码,仅当紧跟非数字字符时
+    text = re.sub(r'\s+\d+\s*$', '', text)  # 删除结尾的页码
+    text = re.sub(r'\s*\/\s*\d+\s*', '', text)  # 删除形如 /129 的页码
+
+    return text
 def extract_common_header(pdf_path):
     pdf_document = PdfReader(pdf_path)
     headers = []
-    num_pages_to_read = 3  # 预读页数
-    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
+    start_page = 4  # 从第5页开始读取,索引为4
+    num_pages_to_read = 3  # 连续读取3页
+
+    # 确保从第5页开始,且总页数足够
+    for i in range(start_page, min(start_page + num_pages_to_read, len(pdf_document.pages))):
         page = pdf_document.pages[i]
-        text = page.extract_text()
-        if text:  # 确保页面有文本内容
-            first_line = text.strip().split('\n')[0]
-            headers.append(first_line)
+        text = page.extract_text() or ""
+        if text:
+            # 只取每页的前三行
+            first_lines = text.strip().split('\n')[:3]
+            headers.append(first_lines)

     if len(headers) < 2:
         return ""  # 如果没有足够的页来比较,返回空字符串

-    # 使用set交集来找出公共部分
-    common_header = set(headers[0].split()).intersection(*[set(header.split()) for header in headers[1:]])
-    common_header = ' '.join(common_header)
-    return common_header
-def clean_page_content(text, common_header):
-    # 首先删除抬头公共部分
-    if common_header:  # 确保有公共抬头才进行替换
-        cleaned_text = text.replace(common_header, '', 1)  # 假设抬头出现在文本开头,只替换一次
-    else:
-        cleaned_text = text
-
-    # 删除页码
-    cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', cleaned_text)  # 删除开头的页码,仅当紧跟非数字字符时
-    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)  # 删除结尾的页码
-    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)  # 删除形如 /129 的页码
-
-    return cleaned_text
-
-
+    # 寻找每一行中的公共部分
+    common_headers = []
+    for lines in zip(*headers):
+        # 在每一行中寻找公共单词
+        common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
+        if common_line:
+            common_headers.append(' '.join(common_line))
+
+    return '\n'.join(common_headers)
 def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
@@ -62,16 +66,16 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
         with open(output_pdf_path, 'wb') as f:
             output_doc.write(f)

-        g.logger.info(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
+        print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
     else:
-        g.logger.error("提供的页码范围无效。")
+        print("提供的页码范围无效。")
     return output_pdf_path


 def extract_pages_twice(pdf_path, output_folder, output_suffix):
     common_header = extract_common_header(pdf_path)
     last_begin_index = 0
-    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
+    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书')
     pdf_document = PdfReader(pdf_path)
     for i, page in enumerate(pdf_document.pages):
         text = page.extract_text()
@@ -108,7 +112,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
                 break  # 找到结束页后退出循环

     if start_page is None or end_page is None:
-        g.logger.error(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
+        print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
         return ""
     else:
         return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@@ -157,7 +161,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
         if output_suffix == "qualification" or output_suffix =="invalid":
             return extract_pages_twice(pdf_path, output_folder, output_suffix)
         else:
-            g.logger.error(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
+            print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
             return ""
     else:
         return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@@ -185,7 +189,7 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
     if output_pdf_path and os.path.isfile(output_pdf_path):
         return [output_pdf_path]  # 以列表形式返回,以保持一致性
     else:
-        g.logger.error("提供的路径既不是文件夹也不是PDF文件。")
+        print("提供的路径既不是文件夹也不是PDF文件。")
         return []


@@ -224,7 +228,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
         output_suffix = "qualification"
     elif selection == 5:
         # 配置用于 "招标公告" 的正则表达式模式和短语
-        begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
+        begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书')
         begin_page = 0
         end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE)
         output_suffix = "notice"
@@ -235,7 +239,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
         end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE)
         output_suffix = "invalid"
     else:
-        g.logger.error("无效的选择:请选择1-6")
+        print("无效的选择:请选择1-6")
         return None

     # Process the selected input

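The rewritten `extract_common_header` is the substantive change in this file: instead of intersecting the word sets of each page's first line only, it reads three consecutive pages starting at page 5 (index 4, presumably past cover pages), keeps the first three lines of each, and intersects word sets line by line, so a multi-line running header survives while page-specific text drops out. A standalone sketch of the idea with invented sample pages; note that set intersection discards word order, a limitation the committed code shares:

pages = [
    "某某项目招标文件 第一章\n招标公告\n第1页正文",
    "某某项目招标文件 第二章\n投标人须知\n第2页正文",
    "某某项目招标文件 第三章\n评标办法\n第3页正文",
]
headers = [p.strip().split('\n')[:3] for p in pages]  # first three lines per page
common = []
for lines in zip(*headers):
    shared = set(lines[0].split()).intersection(*(set(l.split()) for l in lines[1:]))
    if shared:
        common.append(' '.join(shared))
print(common)  # ['某某项目招标文件'] — the repeated header survives, varying lines drop out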
@@ -4,8 +4,6 @@ import fitz
 import re
 import os

-from flask import g
-

 def extract_text_from_docx(file_path):
     doc = docx.Document(file_path)
@@ -129,7 +127,7 @@ def convert_to_json(file_path, start_word, end_phrases):

 def convert_clause_to_json(input_path,output_folder,type=1):
     if not os.path.exists(input_path):
-        g.logger.error(f"The specified file does not exist: {input_path}")
+        print(f"The specified file does not exist: {input_path}")
         return ""
     if type==1:
         start_word = "投标人须知正文"

@@ -4,8 +4,6 @@ import logging
 import os
 import time

-from flask import g
-
 from flask_app.main.截取pdf import truncate_pdf_multiple
 from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
@@ -20,16 +18,17 @@ from flask_app.main.商务标技术标整合 import combine_evaluation_standards
 from flask_app.main.format_change import pdf2docx, docx2pdf
 from flask_app.main.docx截取docx import copy_docx

-# def get_global_logger(unique_id):
-#     if unique_id is None:
-#         return logging.getLogger()  # 获取默认的日志器
-#     logger = logging.getLogger(unique_id)
-#     return logger
+def get_global_logger(unique_id):
+    if unique_id is None:
+        return logging.getLogger()  # 获取默认的日志器
+    logger = logging.getLogger(unique_id)
+    return logger

+logger=None

 # 可能有问题:pdf转docx导致打勾符号消失
 def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
-    g.logger.info("starting 文件预处理...")
+    logger.info("starting 文件预处理...")
     # 根据文件类型处理文件路径
     if file_type == 1:  # docx
         docx_path = downloaded_file_path
@@ -39,7 +38,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
         docx_path = pdf2docx(pdf_path)  # 将pdf转换为docx以供上传到知识库
     else:
         # 如果文件类型不是预期中的1或2,记录错误并返回None
-        g.logger.error("Unsupported file type provided. Preprocessing halted.")
+        logger.error("Unsupported file type provided. Preprocessing halted.")
         return None

     # 上传知识库
@@ -59,7 +58,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
     truncate1 = truncate_files[1]  #评标办法前附表
     truncate3 = truncate_files[3]  #资格审查表
     clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # 投标人须知正文条款pdf->json
-    g.logger.info("文件预处理done")
+    logger.info("文件预处理done")

     return {
         'input_file_path':downloaded_file_path,
@@ -94,59 +93,59 @@ def post_processing(data,includes):
     return result
 # 基本信息
 def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # 投标人须知前附表
-    g.logger.info("starting基础信息...")
+    logger.info("starting基础信息...")
     basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path)
-    g.logger.info("基础信息done")
+    logger.info("基础信息done")
     return basic_res


 # 形式、响应、资格评审
 def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder):
-    g.logger.info("starting资格审查...")
+    logger.info("starting资格审查...")
     review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath,
                                                     clause_path,input_file,output_folder)
-    g.logger.info("资格审查done")
+    logger.info("资格审查done")
     return review_standards_res


 # 评分细则
 def fetch_evaluation_standards(truncate1):  # 评标办法前附表
-    g.logger.info("starting商务标技术标...")
+    logger.info("starting商务标技术标...")
     evaluation_standards_res = combine_evaluation_standards(truncate1)
-    g.logger.info("商务标技术标done")
+    logger.info("商务标技术标done")
     return evaluation_standards_res


 # 无效、废标项解析
 def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
     # 废标项要求:千问
-    g.logger.info("starting无效标与废标...")
+    logger.info("starting无效标与废标...")
     find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
-    g.logger.info("无效标与废标done...")
+    logger.info("无效标与废标done...")
     return find_invalid_res


 # 投标文件要求
 def fetch_bidding_documents_requirements(clause_path):
-    g.logger.info("starting投标文件要求...")
+    logger.info("starting投标文件要求...")
     fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
     qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求")
-    g.logger.info("投标文件要求done...")
+    logger.info("投标文件要求done...")
     return qualify_nested_res


 # 开评定标流程
 def fetch_bid_opening(clause_path):
-    g.logger.info("starting开评定标流程...")
+    logger.info("starting开评定标流程...")
     fetch_bid_opening_json = extract_from_notice(clause_path, 2)
     qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程")
-    g.logger.info("开评定标流程done...")
+    logger.info("开评定标流程done...")
     return qualify_nested_res


 def main_processing(output_folder, downloaded_file_path, file_type, unique_id):  # file_type=1->docx file_type=2->pdf
-    # global global_logger
-    # global_logger = get_global_logger(unique_id)
+    global logger
+    logger = get_global_logger(unique_id)
     # Preprocess files and get necessary data paths and knowledge index
     processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
     if not processed_data:
@@ -180,7 +179,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
             result = futures[key].result()
             comprehensive_responses.append(result)
         except Exception as exc:
-            g.logger.error(f"Error processing {key}: {exc}")
+            logger.error(f"Error processing {key}: {exc}")
     # 合并 JSON 结果
     combined_final_result = combine_json_results(comprehensive_responses)
     includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
@@ -190,7 +189,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
     final_result_path = os.path.join(output_folder, "final_result.json")
     with open(final_result_path, 'w', encoding='utf-8') as file:
         json.dump(modified_json, file, ensure_ascii=False, indent=2)
-    g.logger.info("final_result.json has been saved")
+    logger.info("final_result.json has been saved")
     deleteKnowledge(processed_data['knowledge_index'])
     return final_result_path

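In this module the commit does not fall back to `print`; it restores `get_global_logger` and rebinds a module-level `logger` inside `main_processing` via `global logger`. The pattern in isolation (the caveat in the comments is an inference, not something the commit addresses):

import logging
import uuid

def get_global_logger(unique_id):
    if unique_id is None:
        return logging.getLogger()       # root logger as a fallback
    return logging.getLogger(unique_id)  # one named logger per task id

logger = None  # module-level handle, rebound per request

def main_processing(unique_id):
    global logger
    logger = get_global_logger(unique_id)
    logger.info("processing %s", unique_id)

logging.basicConfig(level=logging.INFO)
main_processing(str(uuid.uuid4()))

# Caveat: a module-level global is shared across threads, so two concurrent
# requests would overwrite each other's logger; since this app also uses a
# ThreadPoolExecutor, that trade-off may be worth verifying.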
@@ -27,5 +27,5 @@ def extract_text_by_page(file_path):
             print(f"Page {page_num + 1} is empty or text could not be extracted.")
     return result
 if __name__ == '__main__':
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\ztb_20240903100337\\ztb_20240903100337_14-20.pdf"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1)(1).pdf"
     extract_text_by_page(file_path)

@@ -2,9 +2,6 @@ import re
 import PyPDF2
 import json

-from flask import g
-
-
 def extract_key_value_pairs(text):
     # 更新正则表达式来包括对"团"的处理和行尾斜线
     pattern = r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
@@ -75,7 +72,7 @@ def read_pdf_and_judge_main(file_path, output_json_path):
     with open(output_json_path, "w", encoding="utf-8") as json_file:
         json.dump(all_data, json_file, ensure_ascii=False, indent=4)

-    g.logger.info(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")
+    print(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")


 if __name__ == "__main__":

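The regex shown in the context lines handles a PDF-extraction quirk: checked-box glyphs often come out as control characters (\x01, \x02) or lookalike characters (☑, 团), so the character class matches any of them between a clause number/label and the ticked option. A quick demonstration on an invented sample line:

import re

pattern = r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
sample = "3.1 是否接受联合体投标 ☑是 □否"  # made-up extracted text
m = re.search(pattern, sample)
if m:
    # the unchecked box □ is neither a word char nor CJK, so group 2 stops there
    print(m.group(1).strip(), "->", m.group(2).strip())  # 是否接受联合体投标 -> 是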
@@ -3,9 +3,6 @@ import json
 import os.path
 import time
 import re
-
-from flask import g
-
 from flask_app.main.json_utils import combine_json_results, nest_json_under_key
 from flask_app.main.通义千问long import upload_file, qianwen_long
 from concurrent.futures import ThreadPoolExecutor
@@ -332,7 +329,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
         results.append(future.result())

     #禁止投标
-    g.logger.info("starting不得存在的情形...")
+    print("starting不得存在的情形...")
     forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
     results.append(forbidden_res)

@@ -340,7 +337,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t
     for d in results:
         combined_dict.update(d)

-    g.logger.info("无效标与废标done...")
+    print("无效标与废标done...")
     return nest_json_under_key(combined_dict, "无效标与废标项")

@@ -1,7 +1,5 @@
 import os
 import uuid

-from flask import g
-
 from llama_index.readers.dashscope.base import DashScopeParse
 from llama_index.readers.dashscope.utils import ResultType
 from llama_index.indices.managed.dashscope import DashScopeCloudIndex
@@ -16,7 +14,7 @@ def addfileToKnowledge(filepath,knowledge_name):
         knowledge_name,
         verbose=True,
     )
-    g.logger.info("knowledge created successfully!!!")
+    print("knowledge created successfully!!!")
     # index = DashScopeCloudIndex(knowledge_name)
     # index._insert(documents)
     # return index, documents

@@ -4,7 +4,6 @@ import os
 import re

 from PyPDF2 import PdfWriter, PdfReader
-from flask import g

 from flask_app.main.通义千问long import upload_file, qianwen_long

@@ -55,9 +54,9 @@ def extract_and_format_from_paths(json_paths, includes, excludes):
             # 将当前文件的结果添加到总结果列表
             all_formatted_results.extend(formatted_results)
         except FileNotFoundError:
-            g.logger.error(f"禁止投标情形: Error: The file '{path}' does not exist.")
+            print(f"禁止投标情形: Error: The file '{path}' does not exist.")
         except json.JSONDecodeError:
-            g.logger.error(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
+            print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")

     return all_formatted_results

@@ -126,9 +125,9 @@ def merge_pdfs(paths, output_filename):
     if output_path:
         with open(output_path, 'wb') as out:
             pdf_writer.write(out)
-        g.logger.info(f"禁止投标情形: Merged PDF saved to {output_path}")
+        print(f"禁止投标情形: Merged PDF saved to {output_path}")
     else:
-        g.logger.error("禁止投标情形: No files to merge.")
+        print("禁止投标情形: No files to merge.")
     return output_path

 def process_string_list(string_list):
@@ -153,7 +152,7 @@ def process_string_list(string_list):
         actual_list = ast.literal_eval(formatted_list)
         return actual_list
     except SyntaxError as e:
-        g.logger.error(f"禁止投标情形: Error parsing list: {e}")
+        print(f"禁止投标情形: Error parsing list: {e}")
         return []
     else:
         # 如果没有匹配到内容,返回空列表

@@ -14,5 +14,5 @@ def read_docx(file_path):


 if __name__ == "__main__":
-    file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
+    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx"
     read_docx(file_path)

@@ -2,9 +2,6 @@
 #资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典
 import json
 import re
-
-from flask import g
-
 from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key
 from flask_app.main.多线程提问 import multi_threading,read_questions_from_file
 from flask_app.main.通义千问long import upload_file
@@ -19,7 +16,7 @@ def merge_dictionaries_under_common_key(dicts, common_key):
             # 使用字典解包来合并字典
             merged_dict[common_key].update(d[common_key])
         else:
-            g.logger.error(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
+            print(f"资格评审: Warning: Dictionary does not contain the key {common_key}")

     return merged_dict
 def generate_qual_question(matching_keys_list):  #这里假设资质、信誉与人员要求 要不都有、要不都没
@@ -73,15 +70,14 @@ def get_consortium_dict(knowledge_name):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 qualify_list.append(response[1])
             else:
-                g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
+            print(f"资格评审: Error processing response for query index {_}: {e}")
     consortium_dict = combine_json_results(qualify_list)
     return consortium_dict

 def get_all_dict(knowledge_name):
-    # qualification_review_file_path = '../static/提示词/资格评审.txt'  # 替换为你的txt文件路径
-    qualification_review_file_path='flask_app/static/提示词/资格评审问题.txt'
+    qualification_review_file_path='flask_app/static/提示词/资格评审.txt'
     questions = read_questions_from_file(qualification_review_file_path)
     qualification_list = []
     res1 = multi_threading(questions, knowledge_name)
@@ -90,9 +86,9 @@ def get_all_dict(knowledge_name):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 qualification_list.append(response[1])
             else:
-                g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
+            print(f"资格评审: Error processing response for query index {_}: {e}")
     qualification_combined_res = combine_json_results(qualification_list)
     return {'资格评审': qualification_combined_res}
 def process_qualification(qualification_review,truncate3,knowledge_name):
@@ -101,14 +97,14 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
     if not matching_keys_list:  #此时要求全部写在评分办法前附表中,不需要额外提取。
         if not non_matching_dict:  #古法提取
             if truncate3!="":
-                g.logger.info("资格评审: type1")
+                print("资格评审: type1")
                 matching_keys_list=["资质条件","财务要求","业绩要求","信誉要求","其他要求"]
                 ques=generate_qual_question(matching_keys_list)
                 file_id2 = upload_file(truncate3)
                 results2 = multi_threading(ques, "", file_id2, 2)  # 资格评审表,调用qianwen-long
                 res_list = []
                 if not results2:
-                    g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
+                    print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
                 else:
                     # 打印结果
                     for question, response in results2:
@@ -119,11 +115,11 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
                 updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)  # 合并字典
                 return updated_qualify_json
             else:
-                g.logger.info("资格评审: type2")
+                print("资格评审: type2")
                 return get_all_dict(knowledge_name)

         else:
-            g.logger.info("资格评审: type3")
+            print("资格评审: type3")
             new_non_matching_json={'资格评审':non_matching_dict}
             substring = '联合体'
             found_key = any(substring in key for key in non_matching_dict.keys())  #没有联合体投标,则需生成,防止重复
@@ -135,18 +131,18 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
             return new_non_matching_json

     elif matching_keys_list and truncate3=="":  #这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
-        g.logger.info("资格评审: type4")
+        print("资格评审: type4")
         final_qualification=get_all_dict(knowledge_name)
         final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
         return final_qualify_json
     else:  #大多数情况
-        g.logger.info("资格评审: type5")
+        print("资格评审: type5")
         user_querys = generate_qual_question(matching_keys_list)  # 生成提问->‘附件:资格审查’
         file_id2 = upload_file(truncate3)
         results2 = multi_threading(user_querys, "", file_id2, 2)  # 资格评审表,调用qianwen-long
         res_list = []
         if not results2:
-            g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
+            print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
         else:
             # 打印结果
             for question, response in results2:

flask_app/main/转化格式/pypandoc_d2p.py (new file, +9)
@@ -0,0 +1,9 @@
+import pypandoc
+
+def docx_to_pdf(docx_path, output_pdf_path):
+    output = pypandoc.convert_file(docx_path, 'pdf', outputfile=output_pdf_path)
+    assert output == ""
+
+docx_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx'
+output_pdf_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\output.pdf'
+docx_to_pdf(docx_path, output_pdf_path)

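One practical note on this new file: pypandoc shells out to pandoc, and producing 'pdf' output additionally requires a PDF engine (a LaTeX engine by default), so this script only works where a TeX distribution is installed; CJK documents like these generally need xelatex with a suitable font rather than the default pdflatex. A hedged variant of the same call, where the extra_args are an assumption about this environment and not part of the committed file:

import pypandoc

def docx_to_pdf(docx_path, output_pdf_path):
    # 'pdf' output must go through outputfile; pandoc delegates to a LaTeX
    # engine. xelatex is assumed here for CJK support (requires TeX on PATH).
    pypandoc.convert_file(docx_path, 'pdf', outputfile=output_pdf_path,
                          extra_args=['--pdf-engine=xelatex'])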
@ -1,60 +0,0 @@
import json

from flask_app.货物标.货物标截取pdf import truncate_pdf_main
from flask_app.main.format_change import docx2pdf, pdf2docx
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.通义千问long import upload_file,qianwen_long
from flask_app.main.json_utils import clean_json_string,combine_json_results


def generate_key_paths(data, parent_key=''):
    key_paths = []
    for key, value in data.items():
        current_key = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, dict):
            if value:  # 字典非空时,递归处理
                key_paths.extend(generate_key_paths(value, current_key))
            else:  # 字典为空时,直接添加键路径
                key_paths.append(current_key)
        else:
            # 如果到达了末端,添加当前键路径
            key_paths.append(current_key)
    return key_paths


#获取采购清单
def fetch_purchasing_list(file_path,output_folder,file_type):
    global pdf_path,docx_path
    if file_type==1:
        docx_path=file_path
        pdf_path = docx2pdf(file_path)
    elif file_type==2:
        pdf_path=file_path
        docx_path=pdf2docx(file_path)
    technical_requirements=[]
    truncate_path=truncate_pdf_main(pdf_path,output_folder,1)
    user_query1="这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
    file_id=upload_file(truncate_path[0])
    res=qianwen_long(file_id,user_query1)
    cleaned_res=clean_json_string(res)
    keys_list=generate_key_paths(cleaned_res['采购需求'])  #提取需要采购的货物清单
    user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
    queries=[]
    for key in keys_list:
        # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key
        new_query = user_query_template.format(key, key)
        print(new_query)
        queries.append(new_query)
    results=multi_threading(queries,"",file_id,2)
    if not results:
        print("errror!")
    else:
        # 打印结果
        for question, response in results:
            technical_requirements.append(response)
    technical_requirements_combined_res=combine_json_results(technical_requirements)
    json_string = json.dumps(technical_requirements_combined_res, ensure_ascii=False, indent=4)
    print(json_string)

if __name__ == "__main__":
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
    fetch_purchasing_list(file_path,output_folder,1)
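For reference, a quick sketch of what the removed generate_key_paths returned; the input dict here is made up:

# Hypothetical input mirroring the '采购需求' structure:
data = {"监控系统": {"摄像机": {}, "存储": {"硬盘": {}}}, "显示屏": "未知"}
# Leaves (empty dicts or non-dict values) become dotted key paths:
# generate_key_paths(data) -> ['监控系统.摄像机', '监控系统.存储.硬盘', '显示屏']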
@ -12,7 +12,9 @@ def find_and_copy_files(input_folder, output_folder):
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            # 检查文件名是否包含“招标”或“竞争性”并且文件格式正确
            if ('竞争性' in file or '招标' in file or '磋商' in file) and file.endswith(supported_formats):
            if ('响应' not in file and '投标' not in file) and \
                    ('竞争性' in file or '招标文件' in file or '磋商' in file) and \
                    file.endswith(supported_formats):
                # 构造完整的文件路径
                file_path = os.path.join(root, file)
                # 构造输出路径
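A quick sanity check of the tightened filename filter; supported_formats is an assumption here, since the hunk does not show its definition:

supported_formats = ('.pdf', '.doc', '.docx')  # assumed for this sketch

def keep(file):
    return ('响应' not in file and '投标' not in file) and \
           ('竞争性' in file or '招标文件' in file or '磋商' in file) and \
           file.endswith(supported_formats)

assert keep('招标文件(107国道).pdf')   # bidding document: kept
assert not keep('投标文件格式.docx')    # bid-response template: now excluded
assert not keep('招标公告.pdf')         # '招标' alone no longer matches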
@ -1,115 +1,33 @@
import json
import re


def extract_and_format_from_paths(json_paths, includes, excludes):
    """
    从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。

    参数:
    json_paths (list): 包含多个 JSON 文件路径的列表。
    includes (list): 包含要检查的关键词的列表。
    excludes (list): 包含要排除的关键词的列表。

    返回:
    list: 包含所有文件中满足条件的格式化字符串列表。
    """
    all_formatted_results = []

    # 遍历每个文件路径
    for path in json_paths:
        try:
            with open(path, 'r', encoding='utf-8') as file:
                # 加载 JSON 数据
                json_data = json.load(file)
                formatted_results = []

                # 遍历 JSON 数据的每个键值对
                for key, value in json_data.items():
                    if isinstance(value, dict):
                        # 如果值是字典,检查嵌套字典的每个键值对
                        for sub_key, sub_value in value.items():
                            if any(include in sub_key for include in includes):
                                # 如果子值包含关键词,格式化并添加到结果列表
                                formatted_results.append(f"{sub_value}")
                    elif isinstance(value, str):  # clause
                        # 检查是否包含任何 include 关键词
                        for include in includes:
                            if include in value:
                                # 找到 include 之前的内容
                                prefix = value.split(include)[0]
                                # 检查 prefix 是否不包含任何 exclude 关键词
                                if not any(exclude in prefix for exclude in excludes):
                                    # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表
                                    if '\n' in value:
                                        value = value.split('\n', 1)[-1]
                                    formatted_results.append(value)
                                    break  # 找到一个符合条件的就跳出循环

            # 将当前文件的结果添加到总结果列表
            all_formatted_results.extend(formatted_results)
        except FileNotFoundError:
            print(f"Error: The file '{path}' does not exist.")
        except json.JSONDecodeError:
            print(f"Error: The file '{path}' contains invalid JSON.")

    return all_formatted_results


def extract_unique_items_from_texts(texts):
    # 更新正则表达式以包括更广泛的序号类型,包括中文序号
    pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*')
    intro_pattern = re.compile(r'^.*?[::]')
    punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$')
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')  # 检查是否含有至少2个连续的字母、数字或汉字

    all_results = []
    seen = set()

    for text in texts:
        # 去除文本中的制表符和换行符
        text = text.replace('\t', '').replace('\n', '')

        # 删除引导性的文本(直到冒号,但保留冒号后的内容)
        text = intro_pattern.sub('', text)

        # 替换URL为占位符,并保存URL以便后续还原
        urls = []
        def url_replacer(match):
            urls.append(match.group(0))
            return f"{{URL{len(urls)}}}"
        text = url_pattern.sub(url_replacer, text)

        # 使用数字和括号的模式分割文本
        items = pattern.split(text)

        for item in items:
            cleaned_item = item.strip()
            if cleaned_item:
                # 进一步清理每个条目
                cleaned_item = pattern.sub('', cleaned_item)
                cleaned_item = punctuation_pattern.sub('', cleaned_item)
                cleaned_item = cleaned_item.strip()

                # 还原URL
                for i, url in enumerate(urls, 1):
                    cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)

                # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符
                if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
                    seen.add(cleaned_item)
                    all_results.append(cleaned_item)

    return all_results

# 使用上面定义的函数
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
json_paths = [truncate_json_path,clause_path]  # 根据实际存放的路径来填写
includes = ["不得存在","不得与","禁止投标","对投标人的纪律"]
excludes=["招标","评标","定标"]
# 调用函数
results = extract_and_format_from_paths(json_paths, includes,excludes)
print(results)
res=extract_unique_items_from_texts(results)
print(res)

def postprocess(data):
    """转换字典中的值为列表,如果所有键对应的值都是'/', '{}' 或 '未知'"""
    for key, value in data.items():
        if all(v in ['/', '未知', {}] for v in value.values()):
            data[key] = list(value.keys())
    return data


# 示例数据
data = {
    "第一包.耗材": {
        "服务器": "未知",
        "台式计算机": "未知",
        "便携式计算机": "/",
        "信息安全设备": {},
        "喷墨打印机": "/",
        "激光打印机": "/",
        "针式打印机": "/",
        "液晶显示器": "/",
        "扫描仪": "/",
        "基础软件": "/",
        "信息安全软件": "/",
        "复印机": "/",
        "投影仪": "/",
        "多功能一体机": "/",
        "触控一体机": "/",
        "碎纸机": "/",
        "复印纸": "/"
    }
}

# 转换字典
converted_data = postprocess(data)
print(converted_data)
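Since every value under "第一包.耗材" in the sample above is '/', '未知' or {}, postprocess collapses that entry to a plain list of names:

# Expected output of the demo script above (abridged):
# {'第一包.耗材': ['服务器', '台式计算机', '便携式计算机', '信息安全设备',
#                 '喷墨打印机', ..., '碎纸机', '复印纸']}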
128 flask_app/货物标/技术服务商务要求提取.py Normal file
@ -0,0 +1,128 @@
# -*- encoding:utf-8 -*-
import json
import os

from flask_app.main.多线程提问 import multi_threading
from flask_app.main.通义千问long import upload_file, qianwen_long
from flask_app.main.json_utils import clean_json_string, combine_json_results


def generate_key_paths(data, parent_key=''):
    key_paths = []

    for key, value in data.items():
        current_key = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, dict):
            if value:
                # 检查字典中的值是否为字典、列表或字符串'未知'
                contains_dict_list_or_unknown = any(isinstance(v, (dict, list)) or v == "未知" for v in value.values())
                if contains_dict_list_or_unknown:
                    # 递归生成键路径
                    sub_paths = generate_key_paths(value, current_key)
                    if sub_paths:
                        # 如果子路径非空,则扩展
                        key_paths.extend(sub_paths)
                    else:
                        # 当前字典内部为空或值全为"未知"
                        key_paths.append(current_key)
                else:
                    # 字典中所有值都不是字典、列表或"未知",添加当前键
                    key_paths.append(current_key)
            else:
                # 空字典,直接添加键路径
                key_paths.append(current_key)
        elif isinstance(value, list):
            # 列表类型,添加包含列表的键的路径
            if value:  # 只有当列表非空时才添加
                key_paths.append(current_key)
        elif value == "未知":
            # 值为"未知",添加键路径
            key_paths.append(current_key)

    return key_paths


def get_technical_requirements(truncate_file):
    user_query1 = "这是一份货物标中采购要求部分的内容,请告诉我需要采购的系统(货物),如果有采购清单,请直接根据清单上的货物名称给出结果,若没有采购清单,你要从文中摘取需要采购的系统(货物),采购需求中可能包含层次关系,如大系统中包含若干子系统,你需要保留这种层次关系,给出系统(货物)名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
    file_id = upload_file(truncate_file)
    res = qianwen_long(file_id, user_query1)
    print(res)
    cleaned_res = clean_json_string(res)
    keys_list = generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
    user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的技术参数(或采购要求)和数量,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
    queries = []
    for key in keys_list:
        # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key
        new_query = user_query_template.format(key, key)
        print(new_query)
        queries.append(new_query)
    results = multi_threading(queries, "", file_id, 2)
    technical_requirements = []
    if not results:
        print("error!未获得大模型的回答!")
    else:
        # 打印结果
        for question, response in results:
            technical_requirements.append(response)
    technical_requirements_combined_res = combine_json_results(technical_requirements)
    """根据所有键是否已添加处理技术要求"""
    # 更新原始采购需求字典
    combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res)
    final_res = postprocess(cleaned_res['采购需求'])
    print("更新后的采购需求处理完成.")

    # 输出最终的 JSON 字符串
    json_string = json.dumps(final_res, ensure_ascii=False, indent=4)
    return json_string


def combine_and_update_results(original_data, updates):
    def recursive_update(data, key, value):
        # 处理点分隔的键,递归定位并更新嵌套字典
        keys = key.split('.')
        for k in keys[:-1]:
            data = data.setdefault(k, {})
        if isinstance(value, dict) and isinstance(data.get(keys[-1], None), dict):
            data[keys[-1]] = {**data.get(keys[-1], {}), **value}
        else:
            data[keys[-1]] = value

    for key, value in updates.items():
        recursive_update(original_data, key, value)
    return original_data


def postprocess(data):
    """转换字典中的值为列表,如果所有键对应的值都是'/', '{}' 或 '未知'"""
    for key, value in data.items():
        if all(v in ['/', '未知', {}] for v in value.values()):
            data[key] = list(value.keys())
    return data


def test_all_files_in_folder(input_folder, output_folder):
    # 确保输出文件夹存在
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # 遍历指定文件夹中的所有文件
    for filename in os.listdir(input_folder):
        file_path = os.path.join(input_folder, filename)
        # 检查是否是文件
        if os.path.isfile(file_path):
            print(f"处理文件: {file_path}")
            # 调用函数处理文件
            try:
                json_result = get_technical_requirements(file_path)
                # 定义输出文件的路径
                output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.json')
                # 保存JSON结果到文件
                with open(output_file_path, 'w', encoding='utf-8') as json_file:
                    json_file.write(json_result)
                print(f"结果已保存到: {output_file_path}")
            except Exception as e:
                print(f"处理文件 {file_path} 时出错: {e}")


if __name__ == "__main__":
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招标文件(107国道)_procurement.pdf"
    res = get_technical_requirements(truncate_file)
    print(res)
    # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
    # test_all_files_in_folder(input_folder, output_folder)
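A worked example of combine_and_update_results, with made-up data, showing how the dotted keys from generate_key_paths are folded back into the original dict:

original = {"第一包": {"服务器": "未知"}}
updates = {"第一包.服务器": {"CPU": "不低于8核"}}  # hypothetical model answer
# recursive_update splits "第一包.服务器", descends into original["第一包"],
# and since the existing value "未知" is not a dict, it is overwritten:
combine_and_update_results(original, updates)
# original -> {"第一包": {"服务器": {"CPU": "不低于8核"}}}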
24 flask_app/货物标/提取采购需求main.py Normal file
@ -0,0 +1,24 @@
import json

from flask_app.货物标.货物标截取pdf import truncate_pdf_main
from flask_app.main.format_change import docx2pdf, pdf2docx


#获取采购清单
def fetch_purchasing_list(file_path,output_folder,file_type):
    if file_type==1:
        docx_path=file_path
        pdf_path = docx2pdf(file_path)
    elif file_type==2:
        pdf_path=file_path
        docx_path=pdf2docx(file_path)
    else:
        print("未传入指定格式的文件!")
        return None
    truncate_path=truncate_pdf_main(pdf_path,output_folder,1)


if __name__ == "__main__":
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
    fetch_purchasing_list(file_path,output_folder,1)
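Usage sketch for the new entry point; the paths are hypothetical. The file_type flag mirrors the convention used elsewhere in the commit: 1 means the input is docx (converted to pdf via docx2pdf), 2 means it is pdf (converted to docx via pdf2docx); anything else aborts with a message.

# fetch_purchasing_list("zbfiles/招标文件.docx", "output", 1)  # docx input
# fetch_purchasing_list("zbfiles/招标文件.pdf", "output", 2)   # pdf input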
@ -1,10 +1,13 @@
#这是一份货物标中采购要求部分的内容,你需要摘取出采购清单,一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出名称和数量和单位,请以json格式返回,外层键名为"采购需求",如有未知内容,在对应键值处填"未知"。

这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),小系统中也可能包含多个货物,你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为"采购需求",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填"未知"。
#这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),小系统中也可能包含多个货物,你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为"采购需求",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填"未知"。

"这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出系统(货物)名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"

#这是一份货物标中采购要求部分的内容,请你给出所需的设备名称以及设备的具体型号参数要求,请以json格式返回结果,外层键名为采购要求。

这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求和数量,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。

这是一份货物标中采购要求部分的内容,请你给出"网络硬盘录像机"的具体型号参数要求,请以json格式返回结果,外层键名为"网络硬盘录像机",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。
#这是一份货物标中采购要求部分的内容,请你给出"网络硬盘录像机"的具体型号参数要求,请以json格式返回结果,外层键名为"网络硬盘录像机",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。

{
    "采购需求": {
@ -76,5 +79,71 @@
        }
    }
}
{
    "采购需求": {
        "第一包": {
            "办公电子设备": [
                "服务器",
                "台式计算机",
                "便携式计算机",
                "信息安全设备",
                "喷墨打印机",
                "激光打印机",
                "针式打印机",
                "液晶显示器",
                "扫描仪",
                "基础软件",
                "信息安全软件",
                "复印机",
                "投影仪",
                "多功能一体机",
                "触控一体机",
                "碎纸机"
            ],
            "软件": [
                "基础软件",
                "信息安全软件"
            ],
            "耗材": [
                "复印纸"
            ]
        },
        "第二包": {
            "办公家电": [
                "空调机"
            ]
        },
        "第三包": {
            "家具用具": [
                "床类",
                "台、桌类",
                "椅凳类",
                "沙发类",
                "柜类",
                "架类",
                "屏风类",
                "厨卫用具",
                "组合家具",
                "家用家具零配件",
                "其他家具用具"
            ]
        },
        "第四包": {
            "印刷服务": "未知"
        },
        "第五包": {
            "汽车维修和保养服务": "未知"
        },
        "第六包": {
            "会计服务": "未知"
        },
        "第七包": {
            "工程造价咨询服务": "未知"
        },
        "第八包": {
            "机动车保险服务": "未知"
        }
    }
}
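How the braced template in the notes above expands in code; "网络硬盘录像机" is the same example key used in the commented-out line that follows the template:

user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求和数量,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
# Both {} slots receive the same dotted key produced by generate_key_paths:
print(user_query_template.format("网络硬盘录像机", "网络硬盘录像机"))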
@ -1,91 +1,179 @@
from PyPDF2 import PdfReader, PdfWriter
import re  # 导入正则表达式库
import os  # 用于文件和文件夹操作
from flask_app.main.format_change import docx2pdf


def clean_page_numbers(text):
    # 使用正则表达式删除页码
    # 假设页码在文本的最开始,紧跟着文字且无空格分隔
    cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码,仅当紧跟非数字字符时
    # 删除结尾的页码
    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
    # 删除形如 /129 的页码
    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)
    return cleaned_text

def clean_page_content(text, common_header):
    # 首先删除抬头公共部分
    if common_header:  # 确保有公共抬头才进行替换
        for header_line in common_header.split('\n'):
            if header_line.strip():  # 只处理非空行
                # 替换首次出现的完整行
                text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)

    # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码,仅当紧跟非数字字符时
    text = re.sub(r'\s+\d+\s*$', '', text)  # 删除结尾的页码
    text = re.sub(r'\s*\/\s*\d+\s*', '', text)  # 删除形如 /129 的页码

    return text

def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    # 打开PDF文件
    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None
    # 遍历文档的每一页,查找开始和结束短语的位置
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_numbers(text)
            if re.search(begin_pattern, cleaned_text) and i > begin_page:
                start_page = i
            if start_page is not None and re.search(end_pattern, cleaned_text) and i > (start_page+1):
                end_page = i
                break
    # 确保找到了起始和结束页面
    if start_page is None or end_page is None:
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
        return None

    # 创建一个新的PDF文档保存截取的页面
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # Get the base name without extension
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()
    # 添加需要的页面,从 start_page 开始,包括 end_page
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    # 保存新的PDF文件
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)

    print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")

    return output_pdf_path

def extract_common_header(pdf_path):
    pdf_document = PdfReader(pdf_path)
    headers = []
    total_pages = len(pdf_document.pages)
    middle_page = total_pages // 2  # 计算中间页

    # 确保中间页前后各一页(共3页),如果不足3页,则尽可能取足
    start_page = max(0, middle_page - 1)
    num_pages_to_read = 3

    for i in range(start_page, min(start_page + num_pages_to_read, total_pages)):
        page = pdf_document.pages[i]
        text = page.extract_text() or ""
        if text:
            # 只取每页的前三行
            first_lines = text.strip().split('\n')[:3]
            headers.append(first_lines)

    if len(headers) < 2:
        return ""  # 如果没有足够的页来比较,返回空字符串

    # 寻找每一行中的公共部分
    common_headers = []
    for lines in zip(*headers):
        # 在每一行中寻找公共单词
        common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
        if common_line:
            common_headers.append(' '.join(common_line))

    return '\n'.join(common_headers)

def is_pdf_or_doc(filename):
    # 判断文件是否为PDF或Word文档
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))

def convert_to_pdf(file_path):
    # 假设 docx2pdf 函数已经被定义,这里仅根据文件扩展名来决定是否需要转换
    if file_path.lower().endswith(('.doc', '.docx')):
        return docx2pdf(file_path)
    return file_path

def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    # 确保输出文件夹存在
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    if os.path.isdir(input_path):
        generated_files = []
        # 遍历文件夹内的所有PDF文件
        for file in os.listdir(input_path):
            if file.endswith(".pdf"):
                pdf_path = os.path.join(input_path, file)
                output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
                if output_pdf_path and os.path.isfile(output_pdf_path):
                    generated_files.append(output_pdf_path)
        return generated_files
    elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
        # 处理单个PDF文件
        output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        if output_pdf_path and os.path.isfile(output_pdf_path):
            return [output_pdf_path]  # 以列表形式返回,以保持一致性
    else:
        print("提供的路径既不是文件夹也不是PDF文件。")
        return []

def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    generated_files = []
    if os.path.isdir(input_path):
        # 遍历文件夹内的所有PDF文件
        for file in os.listdir(input_path):
            file_path = os.path.join(input_path, file)
            if is_pdf_or_doc(file):
                pdf_path = convert_to_pdf(file_path)
                output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern,
                                                output_suffix)
                if output_pdf_path:
                    generated_files.append(output_pdf_path)
    elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
        # 处理单个PDF文件
        output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        if output_pdf_path:
            generated_files.append(output_pdf_path)
    else:
        print("提供的路径既不是文件夹也不是PDF文件。")
    return generated_files

def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
    pdf_document = PdfReader(pdf_path)
    begin_page = 5
    start_page = None
    end_page = None
    # 定义用于检测是否包含"文件的构成"的正则表达式
    exclusion_pattern = re.compile(r'文件的构成|文件的组成')
    if output_suffix == "procurement":
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
            r'^[一二三四五六七八九十百千]+、\s*采购清单',
            re.MULTILINE
        )
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
            re.MULTILINE
        )
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if re.search(begin_pattern, cleaned_text) and i > begin_page and not re.search(exclusion_pattern, cleaned_text):
            start_page = i
        if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
            end_page = i
            break
    if start_page is None or end_page is None:
        print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return ""
    else:
        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)


def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        start_page = None
        end_page = None
        for i in range(len(pdf_document.pages)):
            page = pdf_document.pages[i]
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
            if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
                start_page = i
            if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
                end_page = i
                break
        if start_page is None or end_page is None:
            if output_suffix == "procurement":
                return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
            else:
                print(f"未找到起始或结束页在文件 {pdf_path} 中!")
                return None

        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)
    print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
    return output_pdf_path


def truncate_pdf_main(input_path, output_folder, selection):
    if selection == 1:
        # Configure patterns and phrases for "第三章 项目技术、服务及商务要求"
        begin_pattern = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:服务|项目|商务).*?要求|第[一二三四五六七八九十百千]+章.*?采购.*')
        begin_page = 5
        end_pattern = re.compile(r'第[一二三四五六七八九十百千]+章\s*(资格审查|评标方法|评审办法|评定办法)')
        # 示例文本进行测试
        output_suffix = "tobidders_notice_table"
    else:
        print("无效的选择")
        return None
    # Process the selected input
    return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)

def truncate_pdf_main(input_path, output_folder, selection):
    if selection == 1:
        # 更新的正则表达式以匹配"第x章"和"第x部分",考虑到可能的空格和其他文字
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*',
            # r'^[一二三四五六七八九十百千]+、\s*采购清单',
        )
        begin_page = 5
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        )
        output_suffix = "procurement"

    else:
        print("无效的选择")
        return None
    # 调用相应的处理函数
    return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)

def truncate_pdf_multiple(input_path, output_folder):
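A small runnable sketch (the page text is made up) of what the three substitutions in clean_page_content do to an '89/129'-style page number once the common header has been stripped:

import re

text = "89/129 第三章 采购需求"
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # leading number: '89' dropped
text = re.sub(r'\s+\d+\s*$', '', text)        # trailing page number: none here
text = re.sub(r'\s*\/\s*\d+\s*', '', text)    # '/129 ' dropped
print(text)  # -> 第三章 采购需求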
@ -96,10 +184,10 @@ def truncate_pdf_multiple(input_path, output_folder):
    return truncate_files

if __name__ == "__main__":
    input_path = "C:\\Users\\Administrator\\WPSDrive\\292826853\\WPS云盘\\应用\\输出为PDF\\公安局音视频监控系统设备采购项目(定稿)_20240829133603.pdf"
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(107国道).pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    # truncate_pdf_multiple(input_path,output_folder)
    truncate_pdf_multiple(input_path,output_folder)
    selection = 1  # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-招标公告-合同条款前
    # selection = 1  # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-招标公告-合同条款前
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("生成的文件:", generated_files)
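Chapter headings that the widened begin_pattern in truncate_pdf_main now accepts or rejects; the sample headings are invented for illustration:

import re

begin_pattern = re.compile(
    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*',
)
assert begin_pattern.search('第三章 项目技术、服务及商务要求')  # '章' form matches
assert begin_pattern.search('第五部分 采购需求')               # '部分' now accepted
assert not begin_pattern.search('第一章 投标人须知')            # unrelated chapter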