zy123 2024-09-13 15:03:55 +08:00
parent 010eff9405
commit 79834efb5e
27 changed files with 534 additions and 384 deletions

View File

@@ -2,8 +2,6 @@ from docx import Document
 import re
 import os
-from flask import g
-

 def copy_docx(source_path):
     doc = Document(source_path)  # 打开源文档
@@ -45,7 +43,7 @@ def copy_docx(source_path):
             break
     new_doc.save(destination_path)  # 保存新文档
-    g.logger.info("docx截取docx成功")
+    print("docx截取docx成功")
 # 调用函数

View File

@@ -1,7 +1,6 @@
 import requests
 import mimetypes
-from flask import g

 def download_file(url, local_filename):
@@ -29,13 +28,13 @@ def download_file(url, local_filename):
         else:
             return full_filename, 3
     except requests.HTTPError as e:
-        g.logger.error(f"download: HTTP Error: {e}")
+        print(f"download: HTTP Error: {e}")
         return None
     except requests.RequestException as e:
-        g.logger.error(f"download: Error downloading the file: {e}")
+        print(f"download: Error downloading the file: {e}")
         return None
     except Exception as e:
-        g.logger.error(f"download: An error occurred: {e}")
+        print(f"download: An error occurred: {e}")
        return None

 if __name__ == '__main__':

View File

@@ -1,7 +1,6 @@
 import json
 import os
 import requests
-from flask import g
 from flask_app.main.download import download_file
@@ -21,14 +20,14 @@ def upload_file(file_path, url):
     # 检查响应状态码
     if response.status_code == 200:
-        g.logger.info("format_change 文件上传成功")
+        print("format_change 文件上传成功")
         receive_file_response = response.content.decode('utf-8')
         receive_file_json = json.loads(receive_file_response)
         receive_file_url = receive_file_json["data"]
     else:
-        g.logger.info(f"format_change 文件上传失败,状态码: {response.status_code}")
-        g.logger.info(f"format_change {response.text}")
+        print(f"format_change 文件上传失败,状态码: {response.status_code}")
+        print(f"format_change {response.text}")
     return receive_file_url
@@ -46,7 +45,7 @@ def pdf2docx(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
     local_path_out = os.path.join(folder, filename)  # 输出文件名
     downloaded_filepath, file_type = download_file(receive_download_url, local_path_out)
-    g.logger.info("format_change p2d:have downloaded file to:", downloaded_filepath)
+    print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
     return downloaded_filepath

 def docx2pdf(local_path_in):
@@ -55,7 +54,7 @@ def docx2pdf(local_path_in):
     filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
     local_path_out = os.path.join(folder, filename)  # 输出文件名
     downloaded_filepath, file_type = download_file(receive_download_url, local_path_out)
-    g.logger.info("format_change d2p:have downloaded file to:", downloaded_filepath)
+    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
     return downloaded_filepath

 if __name__ == '__main__':
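
Reviewer note: the two hunks above fix a latent bug as a side effect. The old `g.logger.info("...:", downloaded_filepath)` passed the path as a second positional argument, which `logging` treats as a %-format argument; with no `%s` in the message, formatting fails at emit time and the record is lost with a "--- Logging error ---" traceback. A standalone sketch of the difference (not project code):

import logging
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger("demo")

path = "out/ztbfile.docx"
# Old style: the extra positional is a %-format arg; with no %s in the
# message this raises at emit time ("--- Logging error ---", record lost).
log.info("have downloaded file to:", path)
# Correct alternatives:
log.info("have downloaded file to: %s", path)   # lazy %-formatting
log.info(f"have downloaded file to: {path}")    # what the commit switches to, via print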

View File

@@ -1,9 +1,6 @@
 import json
 import re
-from flask import g

 def extract_content_from_json(json_data):
     """提取 { 和 } 之间的内容,并将其解析为字典"""
     if not json_data.strip():
@@ -14,10 +11,10 @@ def extract_content_from_json(json_data):
             json_data = match.group(0)
             return json.loads(json_data)  # 返回字典
         except json.JSONDecodeError as e:
-            g.logger.info(f"json_utils: extract_content_from_json: JSON decode error: {e}")
+            print(f"json_utils: extract_content_from_json: JSON decode error: {e}")
             return {}
     else:
-        g.logger.info("json_utils: extract_content_from_json: No valid JSON content found.")
+        print("json_utils: extract_content_from_json: No valid JSON content found.")
         return {}

 def clean_json_string(json_string):
@@ -66,18 +63,18 @@ def add_keys_to_json(target_dict, source_dict):
         dict: 更新后的字典
     """
     if not target_dict:
-        g.logger.error("json_utils: Error: Target dictionary is empty.")
+        print("json_utils: Error: Target dictionary is empty.")
         return {}
     if len(target_dict) != 1:
-        g.logger.error("json_utils: Error: Target dictionary must contain exactly one top-level key.")
+        print("json_utils: Error: Target dictionary must contain exactly one top-level key.")
         return target_dict
     # 获取唯一的外层键
     target_key, existing_dict = next(iter(target_dict.items()))
     if not isinstance(existing_dict, dict):
-        g.logger.error(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
+        print(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
         return target_dict
     # 合并字典
@@ -95,7 +92,7 @@ def rename_outer_key(original_data, new_key):
     # 提取原始数据中的唯一外层值(假设只有一个外层键)
     if not original_data or not isinstance(original_data, dict):
-        g.logger.error("json_utils: Error: Invalid input or input is not a dictionary.")  # 如果输入无效或不是字典,则返回空字典
+        print("json_utils: Error: Invalid input or input is not a dictionary.")  # 如果输入无效或不是字典,则返回空字典
         return {}
     # 使用 next(iter(...)) 提取第一个键的值
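
From the guards in the hunk above, `add_keys_to_json` expects a target shaped like `{"外层键": {...}}` and merges `source_dict` into that single inner dict. A runnable sketch of the inferred behaviour; the final `update` step is an assumption read off the `# 合并字典` comment, not code visible in this diff:

def add_keys_to_json_sketch(target_dict, source_dict):
    # Mirrors the guards shown in the diff; the merge itself is inferred.
    if not target_dict:
        print("json_utils: Error: Target dictionary is empty.")
        return {}
    if len(target_dict) != 1:
        print("json_utils: Error: Target dictionary must contain exactly one top-level key.")
        return target_dict
    target_key, existing_dict = next(iter(target_dict.items()))
    if not isinstance(existing_dict, dict):
        print(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.")
        return target_dict
    existing_dict.update(source_dict)  # 合并字典 (assumed)
    return target_dict

print(add_keys_to_json_sketch(
    {"资格评审": {"资质条件": "具备有效营业执照"}},
    {"联合体投标": "不接受"},
))
# -> {'资格评审': {'资质条件': '具备有效营业执照', '联合体投标': '不接受'}}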

View File

@@ -1,6 +1,5 @@
 import logging
 import shutil
-import sys
 import time
 import uuid
 from datetime import datetime, timedelta
@@ -40,7 +39,8 @@ def before_request():
 def create_logger():
     unique_id = str(uuid.uuid4())
     g.unique_id = unique_id
-    output_folder = f"flask_app/static/output/{unique_id}"
+    # output_folder = f"flask_app/static/output/{unique_id}"
+    output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}"
     os.makedirs(output_folder, exist_ok=True)
     log_filename = "log.txt"
     log_path = os.path.join(output_folder, log_filename)
@@ -59,11 +59,12 @@ def create_logger():
 @app.route('/upload', methods=['POST'])
 def zbparse():
+    logger = g.logger
     file_url = validate_request()
     if isinstance(file_url, tuple):  # Check if the returned value is an error response
         return file_url
     try:
-        app.logger.info("starting parsing url:" + file_url)
+        logger.info("starting parsing url:" + file_url)
         final_json_path, output_folder = download_and_process_file(file_url)
         if not final_json_path:
             return jsonify({'error': 'File processing failed'}), 500
@@ -71,7 +72,7 @@ def zbparse():
         # remove_directory(output_folder)  # 然后删除文件夹
         return response  # 最后返回获取的响应
     except Exception as e:
-        app.logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
+        logger.error('Exception occurred: ' + str(e))  # 使用全局 logger 记录
         return jsonify({'error': str(e)}), 500
@@ -138,7 +139,8 @@ def validate_request():
 def download_and_process_file(file_url):
     logger = g.logger
     unique_id = g.unique_id
-    output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
+    # output_folder = f"flask_app/static/output/{unique_id}"  # 直接使用全局 unique_id 构建路径
+    output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}"
     filename = "ztbfile"
     downloaded_filename = os.path.join(output_folder, filename)
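
For context on the `logger = g.logger` line above: `create_logger` runs in `before_request`, attaches a logger keyed by the request's `unique_id` to Flask's `g`, and writes to a per-request log file. A minimal self-contained sketch of that pattern; the formatter and file layout are assumptions, not the project's exact code:

import logging
import os

def get_global_logger(unique_id=None):
    # logging.getLogger caches by name, so any module asking for the same
    # unique_id gets the very same logger object back.
    return logging.getLogger(unique_id) if unique_id else logging.getLogger()

def create_logger(unique_id, output_folder):
    logger = get_global_logger(unique_id)
    logger.setLevel(logging.INFO)
    if not logger.handlers:  # avoid stacking duplicate handlers per request
        os.makedirs(output_folder, exist_ok=True)
        handler = logging.FileHandler(os.path.join(output_folder, "log.txt"), encoding="utf-8")
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(handler)
    return logger

logger = create_logger("demo-uuid", "output/demo-uuid")
logger.info("starting 文件预处理...")  # lands in output/demo-uuid/log.txt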

View File

@@ -3,7 +3,6 @@ import os
 from docx import Document
 import json
-from flask import g

 def read_tables_from_docx(file_path):
@@ -89,13 +88,13 @@ def save_data_to_json(data, output_folder):
     """将数据保存到JSON文件中."""
     with open(output_filepath, 'w', encoding='utf-8') as file:
         json.dump(data, file, ensure_ascii=False, indent=4)
-    g.logger.info(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
+    print(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.")
     return output_filepath

 def extract_tables_main(path, output_folder):
     if not os.path.exists(path):
-        g.logger.error(f"table_content_extraction: The specified file does not exist: {path}")
+        print(f"table_content_extraction: The specified file does not exist: {path}")
         return ""
     # 读取文档表格数据
     table_data = read_tables_from_docx(path)

View File

@@ -1,7 +1,5 @@
 import json
-from flask import g
 from flask_app.main.json_utils import clean_json_string, nest_json_under_key, rename_outer_key, combine_json_results
 from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice
 from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
@@ -32,9 +30,6 @@ def combine_basic_info(baseinfo_list):
     # 根据检测到的键动态调整 key_groups
     dynamic_key_handling(key_groups, relevant_keys_detected)
-    # 打印 key_groups 的内容,检查它们是否被正确更新
-    # g.logger.info("Updated key_groups after dynamic handling:")
-

     # 使用合并后的字典创建最终输出
     for group_name, keys in key_groups.items():
@@ -82,8 +77,7 @@ def judge_consortium_bidding(baseinfo_list):
 def project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # 投标人须知前附表
     # 调用大模型回答项目基础信息
     baseinfo_list = []
-    # baseinfo_file_path = '../static/提示词/前两章提问总结.txt'
-    baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'  # 替换为你的txt文件路径
+    baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt'
     questions = read_questions_from_file(baseinfo_file_path)
     res1 = multi_threading(questions, knowledge_name)
     for _, response in res1:  # _占位代表ques;response[0]也是ques;response[1]是ans
@@ -91,13 +85,12 @@ def project_basic_info(knowledge_name, truncate0, output_folder, clause_path):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 baseinfo_list.append(response[1])
             else:
-                g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}")
+            print(f"基础信息整合: Error processing response for query index {_}: {e}")
     # 判断是否分包、是否需要递交投标保证金等
     chosen_numbers, merged = judge_whether_main(truncate0, output_folder)
     baseinfo_list.append(merged)
-    # judge_file_path = '../static/提示词/是否相关问题.txt'
     judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'
     judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
@@ -109,7 +102,7 @@ def project_basic_info(knowledge_name, truncate0, output_folder, clause_path):
     file_id = upload_file(truncate0)
     res2 = multi_threading(judge_questions, "", file_id, 2)  # 调用千问-long
     if not res2:
-        g.logger.error("基础信息整合: multi_threading error!")
+        print("基础信息整合: multi_threading error!")
     else:
         # 打印结果
         for question, response in res2:

View File

@@ -6,7 +6,6 @@ import concurrent.futures
 import time
 from dashscope import Assistants, Messages, Runs, Threads
-from flask import g
 from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
 from flask_app.main.通义千问long import qianwen_long, upload_file

 prompt = """
@@ -118,10 +117,10 @@ def pure_assistant():
 def llm_call(question, knowledge_name, file_id, result_queue, ans_index, llm_type):
     if llm_type == 1:
-        g.logger.info(f"rag_assistant! question:{question}")
+        print(f"rag_assistant! question:{question}")
         assistant = rag_assistant(knowledge_name)
     elif llm_type == 2:
-        g.logger.info(f"qianwen_long! question:{question}")
+        print(f"qianwen_long! question:{question}")
         qianwen_res = qianwen_long(file_id, question)
         result_queue.put((ans_index, (question, qianwen_res)))
         return
@@ -131,7 +130,7 @@ def llm_call(question, knowledge_name, file_id, result_queue, ans_index, llm_type):
     result_queue.put((ans_index, (question, ans)))  # 在队列中添加索引 (question, ans)

 def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
-    g.logger.info("多线程提问starting multi_threading...")
+    print("多线程提问starting multi_threading...")
     result_queue = queue.Queue()
     # 使用 ThreadPoolExecutor 管理线程
@@ -150,7 +149,7 @@ def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
         try:
             future.result()  # 可以用来捕获异常或确认任务完成
         except Exception as exc:
-            g.logger.error(f"Query {index} generated an exception: {exc}")
+            print(f"Query {index} generated an exception: {exc}")
     # 从队列中获取所有结果并按索引排序
     results = [None] * len(queries)
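
The hunks above show the fan-out pattern this module uses: each worker pushes `(ans_index, (question, answer))` into a shared queue, and the caller rebuilds the answers in submission order regardless of completion order. A stripped-down sketch of that ordering trick, with a stub in place of the real model calls:

import queue
from concurrent.futures import ThreadPoolExecutor

def llm_call_stub(question, result_queue, ans_index):
    # Stand-in for rag_assistant / qianwen_long; just echoes the question.
    result_queue.put((ans_index, (question, f"answer to: {question}")))

def multi_threading_sketch(queries):
    result_queue = queue.Queue()
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(llm_call_stub, q, result_queue, i)
                   for i, q in enumerate(queries)]
        for future in futures:
            future.result()  # surfaces any worker exception
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result  # completion order may differ; the index restores it
    return results

print(multi_threading_sketch(["q1", "q2", "q3"]))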

View File

@@ -3,8 +3,6 @@ import re
 import json
 import time
-from flask import g

 from flask_app.main.多线程提问 import multi_threading
 from flask_app.main.根据条款号整合json import process_and_merge_entries, process_and_merge2
 from flask_app.main.json_utils import extract_content_from_json
@@ -189,9 +187,9 @@ def process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_path):
             temp = extract_content_from_json(response[1])
             first_response_list.append(temp)
         else:
-            g.logger.error(f"形式响应评审Warning: Missing or incomplete response data for query index {_}.")
+            print(f"形式响应评审Warning: Missing or incomplete response data for query index {_}.")
     except Exception as e:
-        g.logger.error(f"形式响应评审Error processing response for query index {_}: {e}")
+        print(f"形式响应评审Error processing response for query index {_}: {e}")
     # Assume JSON file paths are defined or configured correctly
     # print(entries_with_numbers)  # [{'形式评审标准.多标段投标': '3.7.45'}]

View File

@@ -2,43 +2,47 @@ from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
-from flask import g
+
+def clean_page_content(text, common_header):
+    # 首先删除抬头公共部分
+    if common_header:  # 确保有公共抬头才进行替换
+        for header_line in common_header.split('\n'):
+            if header_line.strip():  # 只处理非空行
+                # 替换首次出现的完整行
+                text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
+    # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
+    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码,仅当紧跟非数字字符时
+    text = re.sub(r'\s+\d+\s*$', '', text)  # 删除结尾的页码
+    text = re.sub(r'\s*\/\s*\d+\s*', '', text)  # 删除形如 /129 的页码
+    return text

 def extract_common_header(pdf_path):
     pdf_document = PdfReader(pdf_path)
     headers = []
-    num_pages_to_read = 3  # 预读页数
-    for i in range(min(num_pages_to_read, len(pdf_document.pages))):
+    start_page = 4  # 从第5页开始读取(索引为4)
+    num_pages_to_read = 3  # 连续读取3页
+    # 确保从第5页开始且总页数足够
+    for i in range(start_page, min(start_page + num_pages_to_read, len(pdf_document.pages))):
         page = pdf_document.pages[i]
-        text = page.extract_text()
-        if text:  # 确保页面有文本内容
-            first_line = text.strip().split('\n')[0]
-            headers.append(first_line)
+        text = page.extract_text() or ""
+        if text:
+            # 只取每页的前三行
+            first_lines = text.strip().split('\n')[:3]
+            headers.append(first_lines)
     if len(headers) < 2:
         return ""  # 如果没有足够的页来比较,返回空字符串
-    # 使用set交集来找出公共部分
-    common_header = set(headers[0].split()).intersection(*[set(header.split()) for header in headers[1:]])
-    common_header = ' '.join(common_header)
-    return common_header
+    # 寻找每一行中的公共部分
+    common_headers = []
+    for lines in zip(*headers):
+        # 在每一行中寻找公共单词
+        common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
+        if common_line:
+            common_headers.append(' '.join(common_line))
+    return '\n'.join(common_headers)

-def clean_page_content(text, common_header):
-    # 首先删除抬头公共部分
-    if common_header:  # 确保有公共抬头才进行替换
-        cleaned_text = text.replace(common_header, '', 1)  # 假设抬头出现在文本开头,只替换一次
-    else:
-        cleaned_text = text
-    # 删除页码
-    cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', cleaned_text)  # 删除开头的页码,仅当紧跟非数字字符时
-    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)  # 删除结尾的页码
-    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)  # 删除形如 /129 的页码
-    return cleaned_text

 def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
@@ -62,16 +66,16 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page):
     with open(output_pdf_path, 'wb') as f:
         output_doc.write(f)
-        g.logger.info(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
+        print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
     else:
-        g.logger.error("提供的页码范围无效。")
+        print("提供的页码范围无效。")
     return output_pdf_path

 def extract_pages_twice(pdf_path, output_folder, output_suffix):
     common_header = extract_common_header(pdf_path)
     last_begin_index = 0
-    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
+    begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书')
     pdf_document = PdfReader(pdf_path)
     for i, page in enumerate(pdf_document.pages):
         text = page.extract_text()
@@ -108,7 +112,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix):
             break  # 找到结束页后退出循环
     if start_page is None or end_page is None:
-        g.logger.error(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
+        print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
         return ""
     else:
         return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@@ -157,7 +161,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
         if output_suffix == "qualification" or output_suffix == "invalid":
             return extract_pages_twice(pdf_path, output_folder, output_suffix)
         else:
-            g.logger.error(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
+            print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!")
             return ""
     else:
         return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page)
@@ -185,7 +189,7 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
     if output_pdf_path and os.path.isfile(output_pdf_path):
         return [output_pdf_path]  # 以列表形式返回,以保持一致性
     else:
-        g.logger.error("提供的路径既不是文件夹也不是PDF文件。")
+        print("提供的路径既不是文件夹也不是PDF文件。")
         return []
@@ -224,7 +228,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
         output_suffix = "qualification"
     elif selection == 5:
         # 配置用于 "招标公告" 的正则表达式模式和短语
-        begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷')
+        begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书')
         begin_page = 0
         end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE)
         output_suffix = "notice"
@@ -235,7 +239,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
         end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|第二卷', re.MULTILINE)
         output_suffix = "invalid"
     else:
-        g.logger.error("无效的选择:请选择1-6")
+        print("无效的选择:请选择1-6")
         return None

 # Process the selected input
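
The rewritten `extract_common_header` above compares the first three lines of pages 5-7 line by line, instead of set-intersecting only the first line of pages 1-3, so multi-line headers survive and cover pages no longer pollute the result. A quick illustration of the per-line intersection on invented page headers (no PDF needed):

# Three pages' first-three-lines, as the rewritten function would collect them.
headers = [
    ["某某市政务服务中心 招标文件", "项目编号 ZB-2024-001", "第 12 页"],
    ["某某市政务服务中心 招标文件", "项目编号 ZB-2024-001", "第 13 页"],
    ["某某市政务服务中心 招标文件", "项目编号 ZB-2024-001", "第 14 页"],
]
common_headers = []
for lines in zip(*headers):
    common_line = set(lines[0].split()).intersection(*[set(l.split()) for l in lines[1:]])
    if common_line:
        common_headers.append(' '.join(common_line))
print('\n'.join(common_headers))
# Lines 1-2 intersect fully; line 3 still shares "第"/"页" across pages, a known
# limit of word-set intersection (it ignores order and keeps incidental overlaps).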

View File

@@ -4,8 +4,6 @@ import fitz
 import re
 import os
-from flask import g

 def extract_text_from_docx(file_path):
     doc = docx.Document(file_path)
@@ -129,7 +127,7 @@ def convert_to_json(file_path, start_word, end_phrases):
 def convert_clause_to_json(input_path, output_folder, type=1):
     if not os.path.exists(input_path):
-        g.logger.error(f"The specified file does not exist: {input_path}")
+        print(f"The specified file does not exist: {input_path}")
         return ""
     if type == 1:
         start_word = "投标人须知正文"

View File

@@ -4,8 +4,6 @@ import logging
 import os
 import time
-from flask import g

 from flask_app.main.截取pdf import truncate_pdf_multiple
 from flask_app.main.table_content_extraction import extract_tables_main
 from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge
@@ -20,16 +18,17 @@ from flask_app.main.商务标技术标整合 import combine_evaluation_standards
 from flask_app.main.format_change import pdf2docx, docx2pdf
 from flask_app.main.docx截取docx import copy_docx

-# def get_global_logger(unique_id):
-#     if unique_id is None:
-#         return logging.getLogger()  # 获取默认的日志器
-#     logger = logging.getLogger(unique_id)
-#     return logger
+def get_global_logger(unique_id):
+    if unique_id is None:
+        return logging.getLogger()  # 获取默认的日志器
+    logger = logging.getLogger(unique_id)
+    return logger
+
+logger = None

 # 可能有问题:pdf转docx导致打勾符号消失
 def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
-    g.logger.info("starting 文件预处理...")
+    logger.info("starting 文件预处理...")
     # 根据文件类型处理文件路径
     if file_type == 1:  # docx
         docx_path = downloaded_file_path
@@ -39,7 +38,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
         docx_path = pdf2docx(pdf_path)  # 将pdf转换为docx以供上传到知识库
     else:
         # 如果文件类型不是预期中的1或2,记录错误并返回None
-        g.logger.error("Unsupported file type provided. Preprocessing halted.")
+        logger.error("Unsupported file type provided. Preprocessing halted.")
         return None

     # 上传知识库
@@ -59,7 +58,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
     truncate1 = truncate_files[1]  # 评标办法前附表
     truncate3 = truncate_files[3]  # 资格审查表
     clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # 投标人须知正文条款pdf->json
-    g.logger.info("文件预处理done")
+    logger.info("文件预处理done")

     return {
         'input_file_path': downloaded_file_path,
@@ -94,59 +93,59 @@ def post_processing(data, includes):
     return result

 # 基本信息
 def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path):  # 投标人须知前附表
-    g.logger.info("starting基础信息...")
+    logger.info("starting基础信息...")
     basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path)
-    g.logger.info("基础信息done")
+    logger.info("基础信息done")
     return basic_res

 # 形式、响应、资格评审
 def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder):
-    g.logger.info("starting资格审查...")
+    logger.info("starting资格审查...")
     review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath,
                                                     clause_path, input_file, output_folder)
-    g.logger.info("资格审查done")
+    logger.info("资格审查done")
     return review_standards_res

 # 评分细则
 def fetch_evaluation_standards(truncate1):  # 评标办法前附表
-    g.logger.info("starting商务标技术标...")
+    logger.info("starting商务标技术标...")
     evaluation_standards_res = combine_evaluation_standards(truncate1)
-    g.logger.info("商务标技术标done")
+    logger.info("商务标技术标done")
     return evaluation_standards_res

 # 无效、废标项解析
 def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
     # 废标项要求:千问
-    g.logger.info("starting无效标与废标...")
+    logger.info("starting无效标与废标...")
     find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
-    g.logger.info("无效标与废标done...")
+    logger.info("无效标与废标done...")
     return find_invalid_res

 # 投标文件要求
 def fetch_bidding_documents_requirements(clause_path):
-    g.logger.info("starting投标文件要求...")
+    logger.info("starting投标文件要求...")
     fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
     qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求")
-    g.logger.info("投标文件要求done...")
+    logger.info("投标文件要求done...")
     return qualify_nested_res

 # 开评定标流程
 def fetch_bid_opening(clause_path):
-    g.logger.info("starting开评定标流程...")
+    logger.info("starting开评定标流程...")
     fetch_bid_opening_json = extract_from_notice(clause_path, 2)
     qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程")
-    g.logger.info("开评定标流程done...")
+    logger.info("开评定标流程done...")
     return qualify_nested_res

 def main_processing(output_folder, downloaded_file_path, file_type, unique_id):  # file_type=1->docx file_type=2->pdf
-    # global global_logger
-    # global_logger = get_global_logger(unique_id)
+    global logger
+    logger = get_global_logger(unique_id)
     # Preprocess files and get necessary data paths and knowledge index
     processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
     if not processed_data:
@@ -180,7 +179,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
             result = futures[key].result()
             comprehensive_responses.append(result)
         except Exception as exc:
-            g.logger.error(f"Error processing {key}: {exc}")
+            logger.error(f"Error processing {key}: {exc}")
     # 合并 JSON 结果
     combined_final_result = combine_json_results(comprehensive_responses)
     includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
@@ -190,7 +189,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
     final_result_path = os.path.join(output_folder, "final_result.json")
     with open(final_result_path, 'w', encoding='utf-8') as file:
         json.dump(modified_json, file, ensure_ascii=False, indent=2)
-    g.logger.info("final_result.json has been saved")
+    logger.info("final_result.json has been saved")
     deleteKnowledge(processed_data['knowledge_index'])
     return final_result_path

View File

@@ -27,5 +27,5 @@ def extract_text_by_page(file_path):
             print(f"Page {page_num + 1} is empty or text could not be extracted.")
     return result

 if __name__ == '__main__':
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\ztb_20240903100337\\ztb_20240903100337_14-20.pdf"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1)(1).pdf"
     extract_text_by_page(file_path)

View File

@@ -2,9 +2,6 @@ import re
 import PyPDF2
 import json
-from flask import g

 def extract_key_value_pairs(text):
     # 更新正则表达式来包括对"团"的处理和行尾斜线
     pattern = r'\d+\.\d+\s*([\w\s\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)'
@@ -75,7 +72,7 @@ def read_pdf_and_judge_main(file_path, output_json_path):
     with open(output_json_path, "w", encoding="utf-8") as json_file:
         json.dump(all_data, json_file, ensure_ascii=False, indent=4)
-    g.logger.info(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")
+    print(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.")

 if __name__ == "__main__":
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -3,9 +3,6 @@ import json
 import os.path
 import time
 import re
-from flask import g

 from flask_app.main.json_utils import combine_json_results, nest_json_under_key
 from flask_app.main.通义千问long import upload_file, qianwen_long
 from concurrent.futures import ThreadPoolExecutor
@@ -332,7 +329,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate3):
             results.append(future.result())

     # 禁止投标
-    g.logger.info("starting不得存在的情形...")
+    print("starting不得存在的情形...")
     forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3)
     results.append(forbidden_res)
@@ -340,7 +337,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path, clause_path, truncate3):
     for d in results:
         combined_dict.update(d)
-    g.logger.info("无效标与废标done...")
+    print("无效标与废标done...")
     return nest_json_under_key(combined_dict, "无效标与废标项")

View File

@@ -1,7 +1,5 @@
 import os
 import uuid
-from flask import g

 from llama_index.readers.dashscope.base import DashScopeParse
 from llama_index.readers.dashscope.utils import ResultType
 from llama_index.indices.managed.dashscope import DashScopeCloudIndex
@@ -16,7 +14,7 @@ def addfileToKnowledge(filepath, knowledge_name):
         knowledge_name,
         verbose=True,
     )
-    g.logger.info("knowledge created successfully!!!")
+    print("knowledge created successfully!!!")
     # index = DashScopeCloudIndex(knowledge_name)
     # index._insert(documents)
     # return index, documents

View File

@@ -4,7 +4,6 @@ import os
 import re
 from PyPDF2 import PdfWriter, PdfReader
-from flask import g

 from flask_app.main.通义千问long import upload_file, qianwen_long
@@ -55,9 +54,9 @@ def extract_and_format_from_paths(json_paths, includes, excludes):
             # 将当前文件的结果添加到总结果列表
             all_formatted_results.extend(formatted_results)
         except FileNotFoundError:
-            g.logger.error(f"禁止投标情形: Error: The file '{path}' does not exist.")
+            print(f"禁止投标情形: Error: The file '{path}' does not exist.")
         except json.JSONDecodeError:
-            g.logger.error(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
+            print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")
     return all_formatted_results
@@ -126,9 +125,9 @@ def merge_pdfs(paths, output_filename):
     if output_path:
         with open(output_path, 'wb') as out:
             pdf_writer.write(out)
-        g.logger.info(f"禁止投标情形: Merged PDF saved to {output_path}")
+        print(f"禁止投标情形: Merged PDF saved to {output_path}")
     else:
-        g.logger.error("禁止投标情形: No files to merge.")
+        print("禁止投标情形: No files to merge.")
     return output_path

 def process_string_list(string_list):
@@ -153,7 +152,7 @@ def process_string_list(string_list):
         actual_list = ast.literal_eval(formatted_list)
         return actual_list
     except SyntaxError as e:
-        g.logger.error(f"禁止投标情形: Error parsing list: {e}")
+        print(f"禁止投标情形: Error parsing list: {e}")
         return []
     else:
         # 如果没有匹配到内容,返回空列表
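
`process_string_list` (partially visible above) parses a model-returned list with `ast.literal_eval`, which only accepts Python literals and is therefore safer than `eval` on LLM output. A hedged sketch of that parsing step; the bracket-extraction regex is an assumption, since only the `except SyntaxError` branch appears in this diff:

import ast
import re

def parse_model_list(raw):
    # Pull out the first [...] span, then parse it as a Python literal.
    match = re.search(r'\[.*\]', raw, re.DOTALL)  # assumed extraction step
    if not match:
        return []  # 如果没有匹配到内容,返回空列表
    try:
        return ast.literal_eval(match.group(0))
    except SyntaxError as e:
        print(f"禁止投标情形: Error parsing list: {e}")
        return []

print(parse_model_list('模型回复:["不得存在围标串标", "不得转包"]'))
# -> ['不得存在围标串标', '不得转包']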

View File

@@ -14,5 +14,5 @@ def read_docx(file_path):

 if __name__ == "__main__":
-    file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx"
     read_docx(file_path)

View File

@@ -2,9 +2,6 @@
 # 资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的加入matching_keys列表,否则保留原字典
 import json
 import re
-from flask import g

 from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key
 from flask_app.main.多线程提问 import multi_threading, read_questions_from_file
 from flask_app.main.通义千问long import upload_file
@@ -19,7 +16,7 @@ def merge_dictionaries_under_common_key(dicts, common_key):
             # 使用字典解包来合并字典
             merged_dict[common_key].update(d[common_key])
         else:
-            g.logger.error(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
+            print(f"资格评审: Warning: Dictionary does not contain the key {common_key}")
     return merged_dict

 def generate_qual_question(matching_keys_list):  # 这里假设资质、信誉与人员要求 要不都有、要不都没
@@ -73,15 +70,14 @@ def get_consortium_dict(knowledge_name):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 qualify_list.append(response[1])
             else:
-                g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
+            print(f"资格评审: Error processing response for query index {_}: {e}")
     consortium_dict = combine_json_results(qualify_list)
     return consortium_dict

 def get_all_dict(knowledge_name):
-    # qualification_review_file_path = '../static/提示词/资格评审.txt'
-    qualification_review_file_path = 'flask_app/static/提示词/资格评审问题.txt'
+    qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt'
     questions = read_questions_from_file(qualification_review_file_path)
     qualification_list = []
     res1 = multi_threading(questions, knowledge_name)
@@ -90,9 +86,9 @@ def get_all_dict(knowledge_name):
             if response and len(response) > 1:  # 检查response存在且有至少两个元素
                 qualification_list.append(response[1])
             else:
-                g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
+                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
         except Exception as e:
-            g.logger.error(f"资格评审: Error processing response for query index {_}: {e}")
+            print(f"资格评审: Error processing response for query index {_}: {e}")
     qualification_combined_res = combine_json_results(qualification_list)
     return {'资格评审': qualification_combined_res}

 def process_qualification(qualification_review, truncate3, knowledge_name):
@@ -101,14 +97,14 @@ def process_qualification(qualification_review, truncate3, knowledge_name):
     if not matching_keys_list:  # 此时要求全部写在评分办法前附表中,不需要额外提取。
         if not non_matching_dict:  # 古法提取
             if truncate3 != "":
-                g.logger.info("资格评审: type1")
+                print("资格评审: type1")
                 matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"]
                 ques = generate_qual_question(matching_keys_list)
                 file_id2 = upload_file(truncate3)
                 results2 = multi_threading(ques, "", file_id2, 2)  # 资格评审表调用qianwen-long
                 res_list = []
                 if not results2:
-                    g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
+                    print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
                 else:
                     # 打印结果
                     for question, response in results2:
@@ -119,11 +115,11 @@ def process_qualification(qualification_review, truncate3, knowledge_name):
                 updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)  # 合并字典
                 return updated_qualify_json
             else:
-                g.logger.info("资格评审: type2")
+                print("资格评审: type2")
                 return get_all_dict(knowledge_name)
         else:
-            g.logger.info("资格评审: type3")
+            print("资格评审: type3")
             new_non_matching_json = {'资格评审': non_matching_dict}
             substring = '联合体'
             found_key = any(substring in key for key in non_matching_dict.keys())  # 没有联合体投标,则需生成,防止重复
@@ -135,18 +131,18 @@ def process_qualification(qualification_review, truncate3, knowledge_name):
             return new_non_matching_json
     elif matching_keys_list and truncate3 == "":  # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
-        g.logger.info("资格评审: type4")
+        print("资格评审: type4")
        final_qualification = get_all_dict(knowledge_name)
        final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
        return final_qualify_json
     else:  # 大多数情况
-        g.logger.info("资格评审: type5")
+        print("资格评审: type5")
         user_querys = generate_qual_question(matching_keys_list)  # 生成提问->'附件:资格审查'
         file_id2 = upload_file(truncate3)
         results2 = multi_threading(user_querys, "", file_id2, 2)  # 资格评审表调用qianwen-long
         res_list = []
         if not results2:
-            g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
+            print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
         else:
             # 打印结果
             for question, response in results2:

View File

@@ -0,0 +1,9 @@
+import pypandoc
+
+def docx_to_pdf(docx_path, output_pdf_path):
+    output = pypandoc.convert_file(docx_path, 'pdf', outputfile=output_pdf_path)
+    assert output == ""
+
+docx_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx'
+output_pdf_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\output.pdf'
+docx_to_pdf(docx_path, output_pdf_path)
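
A caveat on the new pypandoc test above: pandoc cannot write PDF directly, so `convert_file(..., 'pdf', outputfile=...)` shells out to a PDF engine (pdflatex by default), and the machine needs pandoc plus such an engine installed. A hedged variant that makes the engine explicit; paths are placeholders, and xelatex is suggested only because it copes with CJK text given suitable fonts:

import pypandoc

# Select the PDF engine explicitly instead of relying on the pdflatex default.
output = pypandoc.convert_file(
    'input.docx', 'pdf',
    outputfile='output.pdf',
    extra_args=['--pdf-engine=xelatex'],  # assumes xelatex is installed
)
assert output == ""  # convert_file returns "" when writing to an output file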

View File

@@ -1,60 +0,0 @@
-import json
-from flask_app.货物标.货物标截取pdf import truncate_pdf_main
-from flask_app.main.format_change import docx2pdf, pdf2docx
-from flask_app.main.多线程提问 import multi_threading
-from flask_app.main.通义千问long import upload_file, qianwen_long
-from flask_app.main.json_utils import clean_json_string, combine_json_results
-
-def generate_key_paths(data, parent_key=''):
-    key_paths = []
-    for key, value in data.items():
-        current_key = f"{parent_key}.{key}" if parent_key else key
-        if isinstance(value, dict):
-            if value:  # 字典非空时,递归处理
-                key_paths.extend(generate_key_paths(value, current_key))
-            else:  # 字典为空时,直接添加键路径
-                key_paths.append(current_key)
-        else:
-            # 如果到达了末端,添加当前键路径
-            key_paths.append(current_key)
-    return key_paths
-
-# 获取采购清单
-def fetch_purchasing_list(file_path, output_folder, file_type):
-    global pdf_path, docx_path
-    if file_type == 1:
-        docx_path = file_path
-        pdf_path = docx2pdf(file_path)
-    elif file_type == 2:
-        pdf_path = file_path
-        docx_path = pdf2docx(file_path)
-    technical_requirements = []
-    truncate_path = truncate_pdf_main(pdf_path, output_folder, 1)
-    user_query1 = "这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
-    file_id = upload_file(truncate_path[0])
-    res = qianwen_long(file_id, user_query1)
-    cleaned_res = clean_json_string(res)
-    keys_list = generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
-    user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求,请以json格式返回结果,外层键名为\"{}\",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
-    queries = []
-    for key in keys_list:
-        # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key
-        new_query = user_query_template.format(key, key)
-        print(new_query)
-        queries.append(new_query)
-    results = multi_threading(queries, "", file_id, 2)
-    if not results:
-        print("error!")
-    else:
-        # 打印结果
-        for question, response in results:
-            technical_requirements.append(response)
-    technical_requirements_combined_res = combine_json_results(technical_requirements)
-    json_string = json.dumps(technical_requirements_combined_res, ensure_ascii=False, indent=4)
-    print(json_string)
-
-if __name__ == "__main__":
-    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
-    fetch_purchasing_list(file_path, output_folder, 1)

View File

@@ -12,7 +12,9 @@ def find_and_copy_files(input_folder, output_folder):
     for root, dirs, files in os.walk(input_folder):
         for file in files:
             # 检查文件名是否包含"招标"或"竞争性"并且文件格式正确
-            if ('竞争性' in file or '招标' in file or '磋商' in file) and file.endswith(supported_formats):
+            if ('响应' not in file and '投标' not in file) and \
+                    ('竞争性' in file or '招标文件' in file or '磋商' in file) and \
+                    file.endswith(supported_formats):
                 # 构造完整的文件路径
                 file_path = os.path.join(root, file)
                 # 构造输出路径
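
The tightened filter above first rejects anything that looks like a bid response ('响应'/'投标'), then requires a procurement keyword; note that '招标' alone no longer matches, only '招标文件'. A quick check against sample names (the filenames and `supported_formats` tuple are assumptions for illustration):

supported_formats = ('.pdf', '.docx', '.doc')  # assumed, mirroring the module

def matches(file):
    return ('响应' not in file and '投标' not in file) and \
           ('竞争性' in file or '招标文件' in file or '磋商' in file) and \
           file.endswith(supported_formats)

for name in ["招标文件.pdf", "投标文件.pdf", "磋商公告.docx", "招标公告.pdf", "竞争性谈判响应文件.docx"]:
    print(name, matches(name))
# 招标文件.pdf True; 投标文件.pdf False (contains 投标);
# 磋商公告.docx True; 招标公告.pdf False (招标 alone no longer qualifies);
# 竞争性谈判响应文件.docx False (contains 响应)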

View File

@@ -1,115 +1,33 @@
-import json
-import re
-
-def extract_and_format_from_paths(json_paths, includes, excludes):
-    """
-    从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。
-
-    参数:
-    json_paths (list): 包含多个 JSON 文件路径的列表。
-    includes (list): 包含要检查的关键词的列表。
-    excludes (list): 包含要排除的关键词的列表。
-
-    返回:
-    list: 包含所有文件中满足条件的格式化字符串列表。
-    """
-    all_formatted_results = []
-    # 遍历每个文件路径
-    for path in json_paths:
-        try:
-            with open(path, 'r', encoding='utf-8') as file:
-                # 加载 JSON 数据
-                json_data = json.load(file)
-            formatted_results = []
-            # 遍历 JSON 数据的每个键值对
-            for key, value in json_data.items():
-                if isinstance(value, dict):
-                    # 如果值是字典,检查嵌套字典的每个键值对
-                    for sub_key, sub_value in value.items():
-                        if any(include in sub_key for include in includes):
-                            # 如果子键包含关键词,格式化并添加到结果列表
-                            formatted_results.append(f"{sub_value}")
-                elif isinstance(value, str):  # clause
-                    # 检查是否包含任何 include 关键词
-                    for include in includes:
-                        if include in value:
-                            # 找到 include 之前的内容
-                            prefix = value.split(include)[0]
-                            # 检查 prefix 是否不包含任何 exclude 关键词
-                            if not any(exclude in prefix for exclude in excludes):
-                                # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表
-                                if '\n' in value:
-                                    value = value.split('\n', 1)[-1]
-                                formatted_results.append(value)
-                            break  # 找到一个符合条件的就跳出循环
-            # 将当前文件的结果添加到总结果列表
-            all_formatted_results.extend(formatted_results)
-        except FileNotFoundError:
-            print(f"Error: The file '{path}' does not exist.")
-        except json.JSONDecodeError:
-            print(f"Error: The file '{path}' contains invalid JSON.")
-    return all_formatted_results
-
-def extract_unique_items_from_texts(texts):
-    # 更新正则表达式以包括更广泛的序号类型,包括中文序号
-    pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|①|②|③|④|⑤|⑥|⑦|⑧|⑨|⑩|⑪|⑫)\s*')
-    intro_pattern = re.compile(r'^.*?[:]')
-    punctuation_pattern = re.compile(r'[;。,、..,:;!?]+$')
-    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
-    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')  # 检查是否含有至少2个连续的字母、数字或汉字
-    all_results = []
-    seen = set()
-    for text in texts:
-        # 去除文本中的制表符和换行符
-        text = text.replace('\t', '').replace('\n', '')
-        # 删除引导性的文本(直到冒号,但保留冒号后的内容)
-        text = intro_pattern.sub('', text)
-        # 替换URL为占位符,并保存URL以便后续还原
-        urls = []
-        def url_replacer(match):
-            urls.append(match.group(0))
-            return f"{{URL{len(urls)}}}"
-        text = url_pattern.sub(url_replacer, text)
-        # 使用数字和括号的模式分割文本
-        items = pattern.split(text)
-        for item in items:
-            cleaned_item = item.strip()
-            if cleaned_item:
-                # 进一步清理每个条目
-                cleaned_item = pattern.sub('', cleaned_item)
-                cleaned_item = punctuation_pattern.sub('', cleaned_item)
-                cleaned_item = cleaned_item.strip()
-                # 还原URL
-                for i, url in enumerate(urls, 1):
-                    cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)
-                # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符
-                if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
-                    seen.add(cleaned_item)
-                    all_results.append(cleaned_item)
-    return all_results
-
-# 使用上面定义的函数
-truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"
-clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json"
-json_paths = [truncate_json_path, clause_path]  # 根据实际存放的路径来填写
-includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
-excludes = ["招标", "评标", "定标"]
-# 调用函数
-results = extract_and_format_from_paths(json_paths, includes, excludes)
-print(results)
-res = extract_unique_items_from_texts(results)
-print(res)
+def postprocess(data):
+    """转换字典中的值为列表,如果所有键对应的值都是'/', '{}'或'未知'"""
+    for key, value in data.items():
+        if all(v in ['/', '未知', {}] for v in value.values()):
+            data[key] = list(value.keys())
+    return data
+
+# 示例数据
+data = {
+    "第一包.耗材": {
+        "服务器": "未知",
+        "台式计算机": "未知",
+        "便携式计算机": "/",
+        "信息安全设备": {},
+        "喷墨打印机": "/",
+        "激光打印机": "/",
+        "针式打印机": "/",
+        "液晶显示器": "/",
+        "扫描仪": "/",
+        "基础软件": "/",
+        "信息安全软件": "/",
+        "复印机": "/",
+        "投影仪": "/",
+        "多功能一体机": "/",
+        "触控一体机": "/",
+        "碎纸机": "/",
+        "复印纸": "/"
+    }
+}
+
+# 转换字典
+converted_data = postprocess(data)
+print(converted_data)
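
One subtlety in `postprocess` above: a package collapses to a plain key list only when every value is '/', '未知' or an empty dict; for the sample data that holds, so '第一包.耗材' becomes its key list. A contrast sketch (the spec value is hypothetical):

# Once any item carries a real requirement, the package stays a dict
# instead of collapsing to a key list.
mixed = {
    "第一包.耗材": {
        "服务器": {"CPU": "≥2.1GHz 16核"},  # hypothetical spec
        "复印纸": "/",
    }
}

def postprocess(data):  # same logic as the file above, repeated for self-containment
    for key, value in data.items():
        if all(v in ['/', '未知', {}] for v in value.values()):
            data[key] = list(value.keys())
    return data

print(postprocess(mixed))  # unchanged: the nested dict fails the all(...) check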

View File

@@ -0,0 +1,128 @@
+# -*- encoding:utf-8 -*-
+import json
+import os
+
+from flask_app.main.多线程提问 import multi_threading
+from flask_app.main.通义千问long import upload_file, qianwen_long
+from flask_app.main.json_utils import clean_json_string, combine_json_results
+
+def generate_key_paths(data, parent_key=''):
+    key_paths = []
+    for key, value in data.items():
+        current_key = f"{parent_key}.{key}" if parent_key else key
+        if isinstance(value, dict):
+            if value:
+                # 检查字典中的值是否为字典、列表或字符串'未知'
+                contains_dict_list_or_unknown = any(isinstance(v, (dict, list)) or v == "未知" for v in value.values())
+                if contains_dict_list_or_unknown:
+                    # 递归生成键路径
+                    sub_paths = generate_key_paths(value, current_key)
+                    if sub_paths:
+                        # 如果子路径非空,则扩展
+                        key_paths.extend(sub_paths)
+                    else:
+                        # 当前字典内部为空或值全为"未知"
+                        key_paths.append(current_key)
+                else:
+                    # 字典中所有值都不是字典、列表或"未知",添加当前键
+                    key_paths.append(current_key)
+            else:
+                # 空字典,直接添加键路径
+                key_paths.append(current_key)
+        elif isinstance(value, list):
+            # 列表类型,添加包含列表的键的路径
+            if value:  # 只有当列表非空时才添加
+                key_paths.append(current_key)
+        elif value == "未知":
+            # 值为"未知",添加键路径
+            key_paths.append(current_key)
+    return key_paths
+
+def get_technical_requirements(truncate_file):
+    user_query1 = "这是一份货物标中采购要求部分的内容,请告诉我需要采购的系统(货物),如果有采购清单,请直接根据清单上的货物名称给出结果;若没有采购清单,你要从文中摘取需要采购的系统(货物),采购需求中可能包含层次关系,如大系统中包含若干子系统,你需要保留这种层次关系,给出系统(货物)名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
+    file_id = upload_file(truncate_file)
+    res = qianwen_long(file_id, user_query1)
+    print(res)
+    cleaned_res = clean_json_string(res)
+    keys_list = generate_key_paths(cleaned_res['采购需求'])  # 提取需要采购的货物清单
+    user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的技术参数或采购要求和数量,请以json格式返回结果,外层键名为\"{}\",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。"
+    queries = []
+    for key in keys_list:
+        # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key
+        new_query = user_query_template.format(key, key)
+        print(new_query)
+        queries.append(new_query)
+    results = multi_threading(queries, "", file_id, 2)
+    technical_requirements = []
+    if not results:
+        print("error!未获得大模型的回答!")
+    else:
+        # 打印结果
+        for question, response in results:
+            technical_requirements.append(response)
+    technical_requirements_combined_res = combine_json_results(technical_requirements)
+    """根据所有键是否已添加处理技术要求"""
+    # 更新原始采购需求字典
+    combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res)
+    final_res = postprocess(cleaned_res['采购需求'])
+    print("更新后的采购需求处理完成.")
+    # 输出最终的 JSON 字符串
+    json_string = json.dumps(final_res, ensure_ascii=False, indent=4)
+    return json_string
+
+def combine_and_update_results(original_data, updates):
+    def recursive_update(data, key, value):
+        # 处理点分隔的键,递归定位并更新嵌套字典
+        keys = key.split('.')
+        for k in keys[:-1]:
+            data = data.setdefault(k, {})
+        if isinstance(value, dict) and isinstance(data.get(keys[-1], None), dict):
+            data[keys[-1]] = {**data.get(keys[-1], {}), **value}
+        else:
+            data[keys[-1]] = value
+    for key, value in updates.items():
+        recursive_update(original_data, key, value)
+    return original_data
+
+def postprocess(data):
+    """转换字典中的值为列表,如果所有键对应的值都是'/', '{}'或'未知'"""
+    for key, value in data.items():
+        if all(v in ['/', '未知', {}] for v in value.values()):
+            data[key] = list(value.keys())
+    return data
+
+def test_all_files_in_folder(input_folder, output_folder):
+    # 确保输出文件夹存在
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+    # 遍历指定文件夹中的所有文件
+    for filename in os.listdir(input_folder):
+        file_path = os.path.join(input_folder, filename)
+        # 检查是否是文件
+        if os.path.isfile(file_path):
+            print(f"处理文件: {file_path}")
+            # 调用函数处理文件
+            try:
+                json_result = get_technical_requirements(file_path)
+                # 定义输出文件的路径
+                output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.json')
+                # 保存JSON结果到文件
+                with open(output_file_path, 'w', encoding='utf-8') as json_file:
+                    json_file.write(json_result)
+                print(f"结果已保存到: {output_file_path}")
+            except Exception as e:
+                print(f"处理文件 {file_path} 时出错: {e}")
+
+if __name__ == "__main__":
+    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招标文件(107国道)_procurement.pdf"
+    res = get_technical_requirements(truncate_file)
+    print(res)
+    # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
+    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
+    # test_all_files_in_folder(input_folder, output_folder)
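
`generate_key_paths` in the new file above decides, per subtree, whether to recurse or to stop and emit a dotted path: it recurses only while a dict still contains nested dicts, lists, or the string "未知". A small demo of the paths it yields, using a condensed but behaviour-equivalent copy and invented sample data:

# Same traversal rules as generate_key_paths above, condensed for the demo.
def generate_key_paths(data, parent_key=''):
    key_paths = []
    for key, value in data.items():
        current_key = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, dict):
            if value and any(isinstance(v, (dict, list)) or v == "未知" for v in value.values()):
                key_paths.extend(generate_key_paths(value, current_key) or [current_key])
            else:
                key_paths.append(current_key)
        elif isinstance(value, list):
            if value:
                key_paths.append(current_key)
        elif value == "未知":
            key_paths.append(current_key)
    return key_paths

sample = {"监控系统": {"摄像机": "未知", "网络硬盘录像机": {}}, "显示屏": "未知"}
print(generate_key_paths(sample))
# -> ['监控系统.摄像机', '监控系统.网络硬盘录像机', '显示屏']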

View File

@@ -0,0 +1,24 @@
+import json
+
+from flask_app.货物标.货物标截取pdf import truncate_pdf_main
+from flask_app.main.format_change import docx2pdf, pdf2docx
+
+# 获取采购清单
+def fetch_purchasing_list(file_path, output_folder, file_type):
+    if file_type == 1:
+        docx_path = file_path
+        pdf_path = docx2pdf(file_path)
+    elif file_type == 2:
+        pdf_path = file_path
+        docx_path = pdf2docx(file_path)
+    else:
+        print("未传入指定格式的文件!")
+        return None
+    truncate_path = truncate_pdf_main(pdf_path, output_folder, 1)
+
+if __name__ == "__main__":
+    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
+    fetch_purchasing_list(file_path, output_folder, 1)

View File

@ -1,10 +1,13 @@
#这是一份货物标中采购要求部分的内容你需要摘取出采购清单一个大系统大项中可能包含多个小系统小项你需要保留这种层次关系给出名称和数量和单位请以json格式返回外层键名为"采购需求",如有未知内容,在对应键值处填"未知"。
#这是一份货物标中采购要求部分的内容你需要摘取出需要采购的系统货物一个大系统大项中可能包含多个小系统小项小系统中也可能包含多个货物你需要保留这种层次关系给出货物名称请以json格式返回外层键名为"采购需求",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填"未知"。
"这是一份货物标中采购要求部分的内容你需要摘取出需要采购的系统货物一个大系统大项中可能包含多个小系统小项你需要保留这种层次关系给出系统货物名称请以json格式返回外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
#这是一份货物标中采购要求部分的内容请你给出所需的设备名称以及设备的具体型号参数要求请以json格式返回结果外层键名为采购要求。
这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求和数量请以json格式返回结果外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。
#这是一份货物标中采购要求部分的内容,请你给出"网络硬盘录像机"的具体型号参数要求请以json格式返回结果外层键名为"网络硬盘录像机",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。
{
    "采购需求": {
@ -76,5 +79,71 @@
        }
    }
}
{
    "采购需求": {
        "第一包": {
            "办公电子设备": [
                "服务器",
                "台式计算机",
                "便携式计算机",
                "信息安全设备",
                "喷墨打印机",
                "激光打印机",
                "针式打印机",
                "液晶显示器",
                "扫描仪",
                "基础软件",
                "信息安全软件",
                "复印机",
                "投影仪",
                "多功能一体机",
                "触控一体机",
                "碎纸机"
            ],
            "软件": [
                "基础软件",
                "信息安全软件"
            ],
            "耗材": [
                "复印纸"
            ]
        },
        "第二包": {
            "办公家电": [
                "空调机"
            ]
        },
        "第三包": {
            "家具用具": [
                "床类",
                "台、桌类",
                "椅凳类",
                "沙发类",
                "柜类",
                "架类",
                "屏风类",
                "厨卫用具",
                "组合家具",
                "家用家具零配件",
                "其他家具用具"
            ]
        },
        "第四包": {
            "印刷服务": "未知"
        },
        "第五包": {
            "汽车维修和保养服务": "未知"
        },
        "第六包": {
            "会计服务": "未知"
        },
        "第七包": {
            "工程造价咨询服务": "未知"
        },
        "第八包": {
            "机动车保险服务": "未知"
        }
    }
}

View File

@ -1,91 +1,179 @@
from PyPDF2 import PdfReader, PdfWriter
import re  # regular expressions
import os  # file and folder operations
from flask_app.main.format_change import docx2pdf


def clean_page_content(text, common_header):
    # First strip the common page header, if one was detected
    if common_header:
        for header_line in common_header.split('\n'):
            if header_line.strip():  # only handle non-empty lines
                # Remove the first occurrence of the complete header line
                text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)

    # Remove page numbers, e.g. "89/129", in three steps
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # leading page number, only when followed by a non-digit
    text = re.sub(r'\s+\d+\s*$', '', text)        # trailing page number
    text = re.sub(r'\s*\/\s*\d+\s*', '', text)    # page numbers of the form /129
    return text
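
# A quick demonstration of the clean-up above (not part of the original commit); the header
# text and page content are made up.
def _demo_clean_page_content():
    common_header = "某某市政府采购项目招标文件"
    raw = "某某市政府采购项目招标文件\n12 第五章 采购需求 12"
    print(clean_page_content(raw, common_header))
    # -> "第五章 采购需求": the repeated header line and the leading/trailing page numbers are gone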

def extract_common_header(pdf_path):
    pdf_document = PdfReader(pdf_path)
    headers = []
    total_pages = len(pdf_document.pages)
    middle_page = total_pages // 2  # middle page of the document

    # Sample the middle page and its neighbours (3 pages, or fewer if the PDF is short)
    start_page = max(0, middle_page - 1)
    num_pages_to_read = 3
    for i in range(start_page, min(start_page + num_pages_to_read, total_pages)):
        page = pdf_document.pages[i]
        text = page.extract_text() or ""
        if text:
            # Keep only the first three lines of each page
            first_lines = text.strip().split('\n')[:3]
            headers.append(first_lines)

    if len(headers) < 2:
        return ""  # not enough pages to compare

    # Find the common part of each line across the sampled pages
    common_headers = []
    for lines in zip(*headers):
        # Words shared by this line on every sampled page
        common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
        if common_line:
            common_headers.append(' '.join(common_line))
    return '\n'.join(common_headers)

def is_pdf_or_doc(filename):
    # Is the file a PDF or a Word document?
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))


def convert_to_pdf(file_path):
    # Convert Word documents to PDF via docx2pdf; pass PDFs through unchanged
    if file_path.lower().endswith(('.doc', '.docx')):
        return docx2pdf(file_path)
    return file_path

def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    generated_files = []
    if os.path.isdir(input_path):
        # Process every PDF/Word file in the folder
        for file in os.listdir(input_path):
            file_path = os.path.join(input_path, file)
            if is_pdf_or_doc(file):
                pdf_path = convert_to_pdf(file_path)
                output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page,
                                                end_pattern, output_suffix)
                if output_pdf_path:
                    generated_files.append(output_pdf_path)
    elif os.path.isfile(input_path) and input_path.endswith(".pdf"):
        # Process a single PDF file
        output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page,
                                        end_pattern, output_suffix)
        if output_pdf_path:
            generated_files.append(output_pdf_path)
    else:
        print("The given path is neither a folder nor a PDF file.")
    return generated_files

def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
    pdf_document = PdfReader(pdf_path)
    begin_page = 5
    start_page = None
    end_page = None
    # Skip pages that merely describe the structure of the bidding documents
    exclusion_pattern = re.compile(r'文件的构成|文件的组成')
    if output_suffix == "procurement":
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
            r'^[一二三四五六七八九十百千]+、\s*采购清单',
            re.MULTILINE
        )
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
            re.MULTILINE
        )
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if re.search(begin_pattern, cleaned_text) and i > begin_page and not re.search(exclusion_pattern, cleaned_text):
            start_page = i
        if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
            end_page = i
            break
    if start_page is None or end_page is None:
        print(f"twice: begin or end page not found in {pdf_path}!")
        return ""
    else:
        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
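
# A quick sanity check of the procurement begin/end patterns above against sample headings
# (not part of the original commit; the headings are made up).
def _demo_procurement_patterns():
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单',
        re.MULTILINE
    )
    end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
    print(bool(begin_pattern.search("第五章 采购需求")))   # True  (chapter heading with 采购)
    print(bool(begin_pattern.search("三、采购清单")))       # True  (numbered-list style heading)
    print(bool(begin_pattern.search("第一章 招标公告")))    # False (no keyword)
    print(bool(end_pattern.search("第六章 合同条款")))      # True  (the next chapter ends the slice)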

def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        start_page = None
        end_page = None
        for i in range(len(pdf_document.pages)):
            page = pdf_document.pages[i]
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
            if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
                start_page = i
            if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
                end_page = i
                break
        if start_page is None or end_page is None:
            if output_suffix == "procurement":
                # Fall back to the looser second-pass patterns
                return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
            else:
                print(f"Begin or end page not found in {pdf_path}!")
                return None
        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None

def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)
    print(f"Extracted pages {start_page} to {end_page} and saved them to {output_pdf_path}")
    return output_pdf_path

def truncate_pdf_main(input_path, output_folder, selection):
    if selection == 1:
        # Updated patterns that match both "第x章" and "第x部分", tolerating extra spaces and text
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*',
            # r'^[一二三四五六七八九十百千]+、\s*采购清单',
        )
        begin_page = 5
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        )
        output_suffix = "procurement"
    else:
        print("Invalid selection")
        return None
    # Dispatch to the shared processing function
    return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)

def truncate_pdf_multiple(input_path, output_folder):
@ -96,10 +184,10 @@ def truncate_pdf_multiple(input_path, output_folder):
    return truncate_files

if __name__ == "__main__":
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(107国道).pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    truncate_pdf_multiple(input_path, output_folder)
    # selection = 1  # e.g. 1 - Instructions-to-Bidders data sheet, 2 - evaluation method, 3 - Instructions-to-Bidders body, 4 - announcement through pre-contract sections
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("Generated files:", generated_files)