diff --git a/flask_app/main/docx截取docx.py b/flask_app/main/docx截取docx.py index bce6e9b..79d9fe6 100644 --- a/flask_app/main/docx截取docx.py +++ b/flask_app/main/docx截取docx.py @@ -2,8 +2,6 @@ from docx import Document import re import os -from flask import g - def copy_docx(source_path): doc = Document(source_path) # 打开源文档 @@ -45,7 +43,7 @@ def copy_docx(source_path): break new_doc.save(destination_path) # 保存新文档 - g.logger.info("docx截取docx成功!") + print("docx截取docx成功!") # 调用函数 diff --git a/flask_app/main/download.py b/flask_app/main/download.py index 32bbe92..8aaa520 100644 --- a/flask_app/main/download.py +++ b/flask_app/main/download.py @@ -1,7 +1,6 @@ import requests import mimetypes -from flask import g def download_file(url, local_filename): @@ -29,13 +28,13 @@ def download_file(url, local_filename): else: return full_filename,3 except requests.HTTPError as e: - g.logger.error(f"download: HTTP Error: {e}") + print(f"download: HTTP Error: {e}") return None except requests.RequestException as e: - g.logger.error(f"download: Error downloading the file: {e}") + print(f"download: Error downloading the file: {e}") return None except Exception as e: - g.logger.error(f"download: An error occurred: {e}") + print(f"download: An error occurred: {e}") return None if __name__ == '__main__': diff --git a/flask_app/main/format_change.py b/flask_app/main/format_change.py index 38a3296..150f80e 100644 --- a/flask_app/main/format_change.py +++ b/flask_app/main/format_change.py @@ -1,7 +1,6 @@ import json import os import requests -from flask import g from flask_app.main.download import download_file @@ -21,14 +20,14 @@ def upload_file(file_path, url): # 检查响应状态码 if response.status_code == 200: - g.logger.info("format_change 文件上传成功") + print("format_change 文件上传成功") receive_file_response = response.content.decode('utf-8') receive_file_json = json.loads(receive_file_response) receive_file_url = receive_file_json["data"] else: - g.logger.info(f"format_change 文件上传失败,状态码: {response.status_code}") - g.logger.info(f"format_change {response.text}") + print(f"format_change 文件上传失败,状态码: {response.status_code}") + print(f"format_change {response.text}") return receive_file_url @@ -46,7 +45,7 @@ def pdf2docx(local_path_in): filename, folder = get_filename_and_folder(local_path_in) #输入输出在同一个文件夹 local_path_out=os.path.join(folder,filename) #输出文件名 downloaded_filepath,file_type=download_file(receive_download_url, local_path_out) - g.logger.info("format_change p2d:have downloaded file to:",downloaded_filepath) + print(f"format_change p2d:have downloaded file to: {downloaded_filepath}") return downloaded_filepath def docx2pdf(local_path_in): @@ -55,7 +54,7 @@ def docx2pdf(local_path_in): filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹 local_path_out = os.path.join(folder, filename) # 输出文件名 downloaded_filepath,file_type = download_file(receive_download_url, local_path_out) - g.logger.info("format_change d2p:have downloaded file to:", downloaded_filepath) + print(f"format_change d2p:have downloaded file to: {downloaded_filepath}") return downloaded_filepath if __name__ == '__main__': diff --git a/flask_app/main/json_utils.py b/flask_app/main/json_utils.py index 6076d51..cb63f52 100644 --- a/flask_app/main/json_utils.py +++ b/flask_app/main/json_utils.py @@ -1,9 +1,6 @@ import json import re -from flask import g - - def extract_content_from_json(json_data): """提取 { 和 } 之间的内容,并将其解析为字典""" if not json_data.strip(): @@ -14,10 +11,10 @@ def extract_content_from_json(json_data): json_data = match.group(0) return json.loads(json_data) #返回字典 except json.JSONDecodeError as e: - g.logger.info(f"json_utils: extract_content_from_json: JSON decode error: {e}") + print(f"json_utils: extract_content_from_json: JSON decode error: {e}") return {} else: - g.logger.info("json_utils: extract_content_from_json: No valid JSON content found.") + print("json_utils: extract_content_from_json: No valid JSON content found.") return {} def clean_json_string(json_string): @@ -66,18 +63,18 @@ def add_keys_to_json(target_dict, source_dict): dict: 更新后的字典。 """ if not target_dict: - g.logger.error("json_utils: Error: Target dictionary is empty.") + print("json_utils: Error: Target dictionary is empty.") return {} if len(target_dict) != 1: - g.logger.error("json_utils: Error: Target dictionary must contain exactly one top-level key.") + print("json_utils: Error: Target dictionary must contain exactly one top-level key.") return target_dict # 获取唯一的外层键 target_key, existing_dict = next(iter(target_dict.items())) if not isinstance(existing_dict, dict): - g.logger.error(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.") + print(f"json_utils: Error: The value under the key '{target_key}' is not a dictionary.") return target_dict # 合并字典 @@ -95,7 +92,7 @@ def rename_outer_key(original_data,new_key): # 提取原始数据中的唯一外层值(假设只有一个外层键) if not original_data or not isinstance(original_data, dict): - g.logger.error("json_utils: Error: Invalid input or input is not a dictionary.") # 如果输入无效或不是字典,则返回空字典 + print("json_utils: Error: Invalid input or input is not a dictionary.") # 如果输入无效或不是字典,则返回空字典 return {} # 使用 next(iter(...)) 提取第一个键的值 diff --git a/flask_app/main/start_up.py b/flask_app/main/start_up.py index 04fd137..956c8d4 100644 --- a/flask_app/main/start_up.py +++ b/flask_app/main/start_up.py @@ -1,6 +1,5 @@ import logging import shutil -import sys import time import uuid from datetime import datetime, timedelta @@ -40,7 +39,8 @@ def before_request(): def create_logger(): unique_id = str(uuid.uuid4()) g.unique_id = unique_id - output_folder = f"flask_app/static/output/{unique_id}" + # output_folder = f"flask_app/static/output/{unique_id}" + output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}" os.makedirs(output_folder, exist_ok=True) log_filename = "log.txt" log_path = os.path.join(output_folder, log_filename) @@ -59,11 +59,12 @@ def create_logger(): @app.route('/upload', methods=['POST']) def zbparse(): + logger=g.logger file_url = validate_request() if isinstance(file_url, tuple): # Check if the returned value is an error response return file_url try: - app.logger.info("starting parsing url:" + file_url) + logger.info("starting parsing url:" + file_url) final_json_path, output_folder = download_and_process_file(file_url) if not final_json_path: return jsonify({'error': 'File processing failed'}), 500 @@ -71,7 +72,7 @@ def zbparse(): # remove_directory(output_folder) # 然后删除文件夹 return response # 最后返回获取的响应 except Exception as e: - app.logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录 + logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录 return jsonify({'error': str(e)}), 500 @@ -138,7 +139,8 @@ def validate_request(): def download_and_process_file(file_url): logger = g.logger unique_id = g.unique_id - output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径 + # output_folder = f"flask_app/static/output/{unique_id}" # 直接使用全局 unique_id 构建路径 + output_folder = f"C:/Users/Administrator/Desktop/货物标/zboutpub/{unique_id}" filename = "ztbfile" downloaded_filename = os.path.join(output_folder, filename) diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py index 81dd69e..82dfe48 100644 --- a/flask_app/main/table_content_extraction.py +++ b/flask_app/main/table_content_extraction.py @@ -3,7 +3,6 @@ import os from docx import Document import json -from flask import g def read_tables_from_docx(file_path): @@ -89,13 +88,13 @@ def save_data_to_json(data, output_folder): """将数据保存到JSON文件中.""" with open(output_filepath, 'w', encoding='utf-8') as file: json.dump(data, file, ensure_ascii=False, indent=4) - g.logger.info(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.") + print(f"table_content_extraction: The data has been processed and saved to '{output_filepath}'.") return output_filepath def extract_tables_main(path, output_folder): if not os.path.exists(path): - g.logger.error(f"table_content_extraction: The specified file does not exist: {path}") + print(f"table_content_extraction: The specified file does not exist: {path}") return "" # 读取文档表格数据 table_data = read_tables_from_docx(path) diff --git a/flask_app/main/基础信息整合.py b/flask_app/main/基础信息整合.py index 1f82ada..09cf8e9 100644 --- a/flask_app/main/基础信息整合.py +++ b/flask_app/main/基础信息整合.py @@ -1,7 +1,5 @@ import json -from flask import g - from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge @@ -32,9 +30,6 @@ def combine_basic_info(baseinfo_list): # 根据检测到的键动态调整 key_groups dynamic_key_handling(key_groups, relevant_keys_detected) - # 打印 key_groups 的内容检查它们是否被正确更新 - # g.logger.info("Updated key_groups after dynamic handling:") - # 使用合并后的字典创建最终输出 for group_name, keys in key_groups.items(): @@ -82,8 +77,7 @@ def judge_consortium_bidding(baseinfo_list): def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表 # 调用大模型回答项目基础信息 baseinfo_list = [] - # baseinfo_file_path='../static/提示词/前两章提问总结.txt' - baseinfo_file_path = 'flask_app/static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径 + baseinfo_file_path='flask_app/static/提示词/前两章提问总结.txt' questions = read_questions_from_file(baseinfo_file_path) res1 = multi_threading(questions, knowledge_name) for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans @@ -91,13 +85,12 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): # if response and len(response) > 1: # 检查response存在且有至少两个元素 baseinfo_list.append(response[1]) else: - g.logger.error(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.") + print(f"基础信息整合: Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - g.logger.error(f"基础信息整合: Error processing response for query index {_}: {e}") + print(f"基础信息整合: Error processing response for query index {_}: {e}") # 判断是否分包、是否需要递交投标保证金等 chosen_numbers, merged = judge_whether_main(truncate0,output_folder) baseinfo_list.append(merged) - # judge_file_path = '../static/提示词/是否相关问题.txt' judge_file_path ='flask_app/static/提示词/是否相关问题.txt' judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) @@ -109,7 +102,7 @@ def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): # file_id=upload_file(truncate0) res2 = multi_threading(judge_questions, "",file_id,2) #调用千问-long if not res2: - g.logger.error("基础信息整合: multi_threading errror!") + print("基础信息整合: multi_threading errror!") else: # 打印结果 for question, response in res2: diff --git a/flask_app/main/多线程提问.py b/flask_app/main/多线程提问.py index 1b931e8..7a32b81 100644 --- a/flask_app/main/多线程提问.py +++ b/flask_app/main/多线程提问.py @@ -6,7 +6,6 @@ import concurrent.futures import time from dashscope import Assistants, Messages, Runs, Threads -from flask import g from llama_index.indices.managed.dashscope import DashScopeCloudRetriever from flask_app.main.通义千问long import qianwen_long, upload_file prompt = """ @@ -118,10 +117,10 @@ def pure_assistant(): def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type): if llm_type==1: - g.logger.info(f"rag_assistant! question:{question}") + print(f"rag_assistant! question:{question}") assistant = rag_assistant(knowledge_name) elif llm_type==2: - g.logger.info(f"qianwen_long! question:{question}") + print(f"qianwen_long! question:{question}") qianwen_res = qianwen_long(file_id,question) result_queue.put((ans_index,(question,qianwen_res))) return @@ -131,7 +130,7 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) def multi_threading(queries, knowledge_name="", file_id="",llm_type=1): - g.logger.info("多线程提问:starting multi_threading...") + print("多线程提问:starting multi_threading...") result_queue = queue.Queue() # 使用 ThreadPoolExecutor 管理线程 @@ -150,7 +149,7 @@ def multi_threading(queries, knowledge_name="", file_id="",llm_type=1): try: future.result() # 可以用来捕获异常或确认任务完成 except Exception as exc: - g.logger.error(f"Query {index} generated an exception: {exc}") + print(f"Query {index} generated an exception: {exc}") # 从队列中获取所有结果并按索引排序 results = [None] * len(queries) diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py index bc7880a..339bb6e 100644 --- a/flask_app/main/形式响应评审.py +++ b/flask_app/main/形式响应评审.py @@ -3,8 +3,6 @@ import re import json import time -from flask import g - from flask_app.main.多线程提问 import multi_threading from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.main.json_utils import extract_content_from_json @@ -189,9 +187,9 @@ def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause temp = extract_content_from_json(response[1]) first_response_list.append(temp) else: - g.logger.error(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.") + print(f"形式响应评审:Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - g.logger.error(f"形式响应评审:Error processing response for query index {_}: {e}") + print(f"形式响应评审:Error processing response for query index {_}: {e}") # Assume JSON file paths are defined or configured correctly # print(entries_with_numbers) #[{'形式评审标准.多标段投标': '3.7.4(5)'}] diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index 083a9ab..55c82f7 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -2,43 +2,47 @@ from PyPDF2 import PdfReader, PdfWriter import re # 导入正则表达式库 import os # 用于文件和文件夹操作 -from flask import g +def clean_page_content(text, common_header): + # 首先删除抬头公共部分 + if common_header: # 确保有公共抬头才进行替换 + for header_line in common_header.split('\n'): + if header_line.strip(): # 只处理非空行 + # 替换首次出现的完整行 + text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1) + # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除 + text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 + text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 + text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 + return text def extract_common_header(pdf_path): pdf_document = PdfReader(pdf_path) headers = [] - num_pages_to_read = 3 # 预读页数 + start_page = 4 # 从第5页开始读取,索引为4 + num_pages_to_read = 3 # 连续读取3页 - for i in range(min(num_pages_to_read, len(pdf_document.pages))): + # 确保从第5页开始,且总页数足够 + for i in range(start_page, min(start_page + num_pages_to_read, len(pdf_document.pages))): page = pdf_document.pages[i] - text = page.extract_text() - if text: # 确保页面有文本内容 - first_line = text.strip().split('\n')[0] - headers.append(first_line) + text = page.extract_text() or "" + if text: + # 只取每页的前三行 + first_lines = text.strip().split('\n')[:3] + headers.append(first_lines) if len(headers) < 2: return "" # 如果没有足够的页来比较,返回空字符串 - # 使用set交集来找出公共部分 - common_header = set(headers[0].split()).intersection(*[set(header.split()) for header in headers[1:]]) - common_header = ' '.join(common_header) - return common_header + # 寻找每一行中的公共部分 + common_headers = [] + for lines in zip(*headers): + # 在每一行中寻找公共单词 + common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]]) + if common_line: + common_headers.append(' '.join(common_line)) - -def clean_page_content(text, common_header): - # 首先删除抬头公共部分 - if common_header: # 确保有公共抬头才进行替换 - cleaned_text = text.replace(common_header, '', 1) # 假设抬头出现在文本开头,只替换一次 - else: - cleaned_text = text - - # 删除页码 - cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', cleaned_text) # 删除开头的页码,仅当紧跟非数字字符时 - cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text) # 删除结尾的页码 - cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text) # 删除形如 /129 的页码 - - return cleaned_text + return '\n'.join(common_headers) def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page): @@ -62,16 +66,16 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en with open(output_pdf_path, 'wb') as f: output_doc.write(f) - g.logger.info(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}") + print(f"已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}") else: - g.logger.error("提供的页码范围无效。") + print("提供的页码范围无效。") return output_pdf_path def extract_pages_twice(pdf_path, output_folder, output_suffix): common_header = extract_common_header(pdf_path) last_begin_index = 0 - begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷') + begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书') pdf_document = PdfReader(pdf_path) for i, page in enumerate(pdf_document.pages): text = page.extract_text() @@ -108,7 +112,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix): break # 找到结束页后退出循环 if start_page is None or end_page is None: - g.logger.error(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!") + print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page) @@ -157,7 +161,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter if output_suffix == "qualification" or output_suffix =="invalid": return extract_pages_twice(pdf_path, output_folder, output_suffix) else: - g.logger.error(f"first: 未找到起始或结束页在文件 {pdf_path} 中!") + print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page) @@ -185,7 +189,7 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt if output_pdf_path and os.path.isfile(output_pdf_path): return [output_pdf_path] # 以列表形式返回,以保持一致性 else: - g.logger.error("提供的路径既不是文件夹也不是PDF文件。") + print("提供的路径既不是文件夹也不是PDF文件。") return [] @@ -224,7 +228,7 @@ def truncate_pdf_main(input_path, output_folder, selection): output_suffix = "qualification" elif selection == 5: # 配置用于 "招标公告" 的正则表达式模式和短语 - begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷') + begin_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|投标邀请书') begin_page = 0 end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知', re.MULTILINE) output_suffix = "notice" @@ -235,7 +239,7 @@ def truncate_pdf_main(input_path, output_folder, selection): end_pattern = re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|第二卷', re.MULTILINE) output_suffix = "invalid" else: - g.logger.error("无效的选择:请选择1-6") + print("无效的选择:请选择1-6") return None # Process the selected input diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py index 9c27e72..0765ed5 100644 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -4,8 +4,6 @@ import fitz import re import os -from flask import g - def extract_text_from_docx(file_path): doc = docx.Document(file_path) @@ -129,7 +127,7 @@ def convert_to_json(file_path, start_word, end_phrases): def convert_clause_to_json(input_path,output_folder,type=1): if not os.path.exists(input_path): - g.logger.error(f"The specified file does not exist: {input_path}") + print(f"The specified file does not exist: {input_path}") return "" if type==1: start_word = "投标人须知正文" diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py index a427932..c600725 100644 --- a/flask_app/main/招标文件解析.py +++ b/flask_app/main/招标文件解析.py @@ -4,8 +4,6 @@ import logging import os import time -from flask import g - from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.知识库操作 import addfileToKnowledge, deleteKnowledge @@ -20,16 +18,17 @@ from flask_app.main.商务标技术标整合 import combine_evaluation_standards from flask_app.main.format_change import pdf2docx, docx2pdf from flask_app.main.docx截取docx import copy_docx -# def get_global_logger(unique_id): -# if unique_id is None: -# return logging.getLogger() # 获取默认的日志器 -# logger = logging.getLogger(unique_id) -# return logger +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger +logger=None # 可能有问题:pdf转docx导致打勾符号消失 def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): - g.logger.info("starting 文件预处理...") + logger.info("starting 文件预处理...") # 根据文件类型处理文件路径 if file_type == 1: # docx docx_path = downloaded_file_path @@ -39,7 +38,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 else: # 如果文件类型不是预期中的1或2,记录错误并返回None - g.logger.error("Unsupported file type provided. Preprocessing halted.") + logger.error("Unsupported file type provided. Preprocessing halted.") return None # 上传知识库 @@ -59,7 +58,7 @@ def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id): truncate1 = truncate_files[1] #评标办法前附表 truncate3 = truncate_files[3] #资格审查表 clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json - g.logger.info("文件预处理done") + logger.info("文件预处理done") return { 'input_file_path':downloaded_file_path, @@ -94,59 +93,59 @@ def post_processing(data,includes): return result # 基本信息 def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表 - g.logger.info("starting基础信息...") + logger.info("starting基础信息...") basic_res = project_basic_info(knowledge_name, truncate0, output_folder, clause_path) - g.logger.info("基础信息done") + logger.info("基础信息done") return basic_res # 形式、响应、资格评审 def fetch_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder): - g.logger.info("starting资格审查...") + logger.info("starting资格审查...") review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder) - g.logger.info("资格审查done") + logger.info("资格审查done") return review_standards_res # 评分细则 def fetch_evaluation_standards(truncate1): # 评标办法前附表 - g.logger.info("starting商务标技术标...") + logger.info("starting商务标技术标...") evaluation_standards_res = combine_evaluation_standards(truncate1) - g.logger.info("商务标技术标done") + logger.info("商务标技术标done") return evaluation_standards_res # 无效、废标项解析 def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3): # 废标项要求:千问 - g.logger.info("starting无效标与废标...") + logger.info("starting无效标与废标...") find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3) - g.logger.info("无效标与废标done...") + logger.info("无效标与废标done...") return find_invalid_res # 投标文件要求 def fetch_bidding_documents_requirements(clause_path): - g.logger.info("starting投标文件要求...") + logger.info("starting投标文件要求...") fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求") - g.logger.info("投标文件要求done...") + logger.info("投标文件要求done...") return qualify_nested_res # 开评定标流程 def fetch_bid_opening(clause_path): - g.logger.info("starting开评定标流程...") + logger.info("starting开评定标流程...") fetch_bid_opening_json = extract_from_notice(clause_path, 2) qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程") - g.logger.info("开评定标流程done...") + logger.info("开评定标流程done...") return qualify_nested_res def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # file_type=1->docx file_type=2->pdf - # global global_logger - # global_logger = get_global_logger(unique_id) + global logger + logger = get_global_logger(unique_id) # Preprocess files and get necessary data paths and knowledge index processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) if not processed_data: @@ -180,7 +179,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id): result = futures[key].result() comprehensive_responses.append(result) except Exception as exc: - g.logger.error(f"Error processing {key}: {exc}") + logger.error(f"Error processing {key}: {exc}") # 合并 JSON 结果 combined_final_result = combine_json_results(comprehensive_responses) includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"] @@ -190,7 +189,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id): final_result_path = os.path.join(output_folder, "final_result.json") with open(final_result_path, 'w', encoding='utf-8') as file: json.dump(modified_json, file, ensure_ascii=False, indent=2) - g.logger.info("final_result.json has been saved") + logger.info("final_result.json has been saved") deleteKnowledge(processed_data['knowledge_index']) return final_result_path diff --git a/flask_app/main/按页读取pdf.py b/flask_app/main/按页读取pdf.py index 8c50dcf..6fe5ce0 100644 --- a/flask_app/main/按页读取pdf.py +++ b/flask_app/main/按页读取pdf.py @@ -27,5 +27,5 @@ def extract_text_by_page(file_path): print(f"Page {page_num + 1} is empty or text could not be extracted.") return result if __name__ == '__main__': - file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output\\ztb_20240903100337\\ztb_20240903100337_14-20.pdf" + file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件正文(1)(1).pdf" extract_text_by_page(file_path) diff --git a/flask_app/main/提取打勾符号.py b/flask_app/main/提取打勾符号.py index 0ae12fa..625998a 100644 --- a/flask_app/main/提取打勾符号.py +++ b/flask_app/main/提取打勾符号.py @@ -2,9 +2,6 @@ import re import PyPDF2 import json -from flask import g - - def extract_key_value_pairs(text): # 更新正则表达式来包括对"团"的处理和行尾斜线 pattern = r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑团]([\w\s\u4e00-\u9fff]+)' @@ -75,7 +72,7 @@ def read_pdf_and_judge_main(file_path, output_json_path): with open(output_json_path, "w", encoding="utf-8") as json_file: json.dump(all_data, json_file, ensure_ascii=False, indent=4) - g.logger.info(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.") + print(f"da_gou signal: Data extraction complete and saved to '{output_json_path}'.") if __name__ == "__main__": diff --git a/flask_app/main/无效标和废标和禁止投标整合.py b/flask_app/main/无效标和废标和禁止投标整合.py index 87f3760..0f7fab2 100644 --- a/flask_app/main/无效标和废标和禁止投标整合.py +++ b/flask_app/main/无效标和废标和禁止投标整合.py @@ -3,9 +3,6 @@ import json import os.path import time import re - -from flask import g - from flask_app.main.json_utils import combine_json_results, nest_json_under_key from flask_app.main.通义千问long import upload_file, qianwen_long from concurrent.futures import ThreadPoolExecutor @@ -332,7 +329,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t results.append(future.result()) #禁止投标 - g.logger.info("starting不得存在的情形...") + print("starting不得存在的情形...") forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate3) results.append(forbidden_res) @@ -340,7 +337,7 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t for d in results: combined_dict.update(d) - g.logger.info("无效标与废标done...") + print("无效标与废标done...") return nest_json_under_key(combined_dict, "无效标与废标项") diff --git a/flask_app/main/知识库操作.py b/flask_app/main/知识库操作.py index 1606a7c..f5300d5 100644 --- a/flask_app/main/知识库操作.py +++ b/flask_app/main/知识库操作.py @@ -1,7 +1,5 @@ import os import uuid - -from flask import g from llama_index.readers.dashscope.base import DashScopeParse from llama_index.readers.dashscope.utils import ResultType from llama_index.indices.managed.dashscope import DashScopeCloudIndex @@ -16,7 +14,7 @@ def addfileToKnowledge(filepath,knowledge_name): knowledge_name, verbose=True, ) - g.logger.info("knowledge created successfully!!!") + print("knowledge created successfully!!!") # index = DashScopeCloudIndex(knowledge_name) # index._insert(documents) # return index, documents diff --git a/flask_app/main/禁止投标情形.py b/flask_app/main/禁止投标情形.py index f16a552..4a7753d 100644 --- a/flask_app/main/禁止投标情形.py +++ b/flask_app/main/禁止投标情形.py @@ -4,7 +4,6 @@ import os import re from PyPDF2 import PdfWriter, PdfReader -from flask import g from flask_app.main.通义千问long import upload_file, qianwen_long @@ -55,9 +54,9 @@ def extract_and_format_from_paths(json_paths, includes, excludes): # 将当前文件的结果添加到总结果列表 all_formatted_results.extend(formatted_results) except FileNotFoundError: - g.logger.error(f"禁止投标情形: Error: The file '{path}' does not exist.") + print(f"禁止投标情形: Error: The file '{path}' does not exist.") except json.JSONDecodeError: - g.logger.error(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.") + print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.") return all_formatted_results @@ -126,9 +125,9 @@ def merge_pdfs(paths, output_filename): if output_path: with open(output_path, 'wb') as out: pdf_writer.write(out) - g.logger.info(f"禁止投标情形: Merged PDF saved to {output_path}") + print(f"禁止投标情形: Merged PDF saved to {output_path}") else: - g.logger.error("禁止投标情形: No files to merge.") + print("禁止投标情形: No files to merge.") return output_path def process_string_list(string_list): @@ -153,7 +152,7 @@ def process_string_list(string_list): actual_list = ast.literal_eval(formatted_list) return actual_list except SyntaxError as e: - g.logger.error(f"禁止投标情形: Error parsing list: {e}") + print(f"禁止投标情形: Error parsing list: {e}") return [] else: # 如果没有匹配到内容,返回空列表 diff --git a/flask_app/main/读取docx.py b/flask_app/main/读取docx.py index e037f52..4bf5e91 100644 --- a/flask_app/main/读取docx.py +++ b/flask_app/main/读取docx.py @@ -14,5 +14,5 @@ def read_docx(file_path): if __name__ == "__main__": - file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx" + file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx" read_docx(file_path) diff --git a/flask_app/main/资格评审.py b/flask_app/main/资格评审.py index 62adad0..5c27e95 100644 --- a/flask_app/main/资格评审.py +++ b/flask_app/main/资格评审.py @@ -2,9 +2,6 @@ #资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典 import json import re - -from flask import g - from flask_app.main.json_utils import clean_json_string, combine_json_results, add_keys_to_json, nest_json_under_key from flask_app.main.多线程提问 import multi_threading,read_questions_from_file from flask_app.main.通义千问long import upload_file @@ -19,7 +16,7 @@ def merge_dictionaries_under_common_key(dicts, common_key): # 使用字典解包来合并字典 merged_dict[common_key].update(d[common_key]) else: - g.logger.error(f"资格评审: Warning: Dictionary does not contain the key {common_key}") + print(f"资格评审: Warning: Dictionary does not contain the key {common_key}") return merged_dict def generate_qual_question(matching_keys_list): #这里假设资质、信誉与人员要求 要不都有、要不都没 @@ -73,15 +70,14 @@ def get_consortium_dict(knowledge_name): if response and len(response) > 1: # 检查response存在且有至少两个元素 qualify_list.append(response[1]) else: - g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") + print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - g.logger.error(f"资格评审: Error processing response for query index {_}: {e}") + print(f"资格评审: Error processing response for query index {_}: {e}") consortium_dict = combine_json_results(qualify_list) return consortium_dict def get_all_dict(knowledge_name): - # qualification_review_file_path = '../static/提示词/资格评审.txt' # 替换为你的txt文件路径 - qualification_review_file_path='flask_app/static/提示词/资格评审问题.txt' + qualification_review_file_path='flask_app/static/提示词/资格评审.txt' questions = read_questions_from_file(qualification_review_file_path) qualification_list = [] res1 = multi_threading(questions, knowledge_name) @@ -90,9 +86,9 @@ def get_all_dict(knowledge_name): if response and len(response) > 1: # 检查response存在且有至少两个元素 qualification_list.append(response[1]) else: - g.logger.error(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") + print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.") except Exception as e: - g.logger.error(f"资格评审: Error processing response for query index {_}: {e}") + print(f"资格评审: Error processing response for query index {_}: {e}") qualification_combined_res = combine_json_results(qualification_list) return {'资格评审': qualification_combined_res} def process_qualification(qualification_review,truncate3,knowledge_name): @@ -101,14 +97,14 @@ def process_qualification(qualification_review,truncate3,knowledge_name): if not matching_keys_list: #此时要求全部写在评分办法前附表中,不需要额外提取。 if not non_matching_dict: #古法提取 if truncate3!="": - g.logger.info("资格评审: type1") + print("资格评审: type1") matching_keys_list=["资质条件","财务要求","业绩要求","信誉要求","其他要求"] ques=generate_qual_question(matching_keys_list) file_id2 = upload_file(truncate3) results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long res_list = [] if not results2: - g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!") + print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") else: # 打印结果 for question, response in results2: @@ -119,11 +115,11 @@ def process_qualification(qualification_review,truncate3,knowledge_name): updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典 return updated_qualify_json else: - g.logger.info("资格评审: type2") + print("资格评审: type2") return get_all_dict(knowledge_name) else: - g.logger.info("资格评审: type3") + print("资格评审: type3") new_non_matching_json={'资格评审':non_matching_dict} substring = '联合体' found_key = any(substring in key for key in non_matching_dict.keys()) #没有联合体投标,则需生成,防止重复 @@ -135,18 +131,18 @@ def process_qualification(qualification_review,truncate3,knowledge_name): return new_non_matching_json elif matching_keys_list and truncate3=="": #这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' - g.logger.info("资格评审: type4") + print("资格评审: type4") final_qualification=get_all_dict(knowledge_name) final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict) return final_qualify_json else: #大多数情况 - g.logger.info("资格评审: type5") + print("资格评审: type5") user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’ file_id2 = upload_file(truncate3) results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long res_list = [] if not results2: - g.logger.error("资格评审: 调用大模型未成功获取资格评审文件中的要求!") + print("资格评审: 调用大模型未成功获取资格评审文件中的要求!") else: # 打印结果 for question, response in results2: diff --git a/flask_app/main/转化格式/pypandoc_d2p.py b/flask_app/main/转化格式/pypandoc_d2p.py new file mode 100644 index 0000000..fd72e5b --- /dev/null +++ b/flask_app/main/转化格式/pypandoc_d2p.py @@ -0,0 +1,9 @@ +import pypandoc + +def docx_to_pdf(docx_path, output_pdf_path): + output = pypandoc.convert_file(docx_path, 'pdf', outputfile=output_pdf_path) + assert output == "" + +docx_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\招标文件正文(1)(1).docx' +output_pdf_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\output.pdf' +docx_to_pdf(docx_path, output_pdf_path) diff --git a/flask_app/货物标/extract_procurement_requirements.py b/flask_app/货物标/extract_procurement_requirements.py deleted file mode 100644 index 32a0dcb..0000000 --- a/flask_app/货物标/extract_procurement_requirements.py +++ /dev/null @@ -1,60 +0,0 @@ -import json - -from flask_app.货物标.货物标截取pdf import truncate_pdf_main -from flask_app.main.format_change import docx2pdf, pdf2docx -from flask_app.main.多线程提问 import multi_threading -from flask_app.main.通义千问long import upload_file,qianwen_long -from flask_app.main.json_utils import clean_json_string,combine_json_results - -def generate_key_paths(data, parent_key=''): - key_paths = [] - for key, value in data.items(): - current_key = f"{parent_key}.{key}" if parent_key else key - if isinstance(value, dict): - if value: # 字典非空时,递归处理 - key_paths.extend(generate_key_paths(value, current_key)) - else: # 字典为空时,直接添加键路径 - key_paths.append(current_key) - else: - # 如果到达了末端,添加当前键路径 - key_paths.append(current_key) - return key_paths - -#获取采购清单 -def fetch_purchasing_list(file_path,output_folder,file_type): - global pdf_path,docx_path - if file_type==1: - docx_path=file_path - pdf_path = docx2pdf(file_path) - elif file_type==2: - pdf_path=file_path - docx_path=pdf2docx(file_path) - technical_requirements=[] - truncate_path=truncate_pdf_main(pdf_path,output_folder,1) - user_query1="这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。" - file_id=upload_file(truncate_path[0]) - res=qianwen_long(file_id,user_query1) - cleaned_res=clean_json_string(res) - keys_list=generate_key_paths(cleaned_res['采购需求']) #提取需要采购的货物清单 - user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。" - queries=[] - for key in keys_list: - # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key - new_query = user_query_template.format(key, key) - print(new_query) - queries.append(new_query) - results=multi_threading(queries,"",file_id,2) - if not results: - print("errror!") - else: - # 打印结果 - for question, response in results: - technical_requirements.append(response) - technical_requirements_combined_res=combine_json_results(technical_requirements) - json_string = json.dumps(technical_requirements_combined_res, ensure_ascii=False, indent=4) - print(json_string) -if __name__ == "__main__": - output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" - file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc" - fetch_purchasing_list(file_path,output_folder,1) - diff --git a/flask_app/货物标/find_zbfile.py b/flask_app/货物标/find_zbfile.py index 2561397..c209b28 100644 --- a/flask_app/货物标/find_zbfile.py +++ b/flask_app/货物标/find_zbfile.py @@ -12,7 +12,9 @@ def find_and_copy_files(input_folder, output_folder): for root, dirs, files in os.walk(input_folder): for file in files: # 检查文件名是否包含“招标”或“竞争性”并且文件格式正确 - if ('竞争性' in file or '招标' in file or '磋商' in file) and file.endswith(supported_formats): + if ('响应' not in file and '投标' not in file) and \ + ('竞争性' in file or '招标文件' in file or '磋商' in file) and \ + file.endswith(supported_formats): # 构造完整的文件路径 file_path = os.path.join(root, file) # 构造输出路径 diff --git a/flask_app/货物标/test.py b/flask_app/货物标/test.py index cf8e7a1..14102b9 100644 --- a/flask_app/货物标/test.py +++ b/flask_app/货物标/test.py @@ -1,115 +1,33 @@ -import json -import re +def postprocess(data): + """转换字典中的值为列表,如果所有键对应的值都是'/', '{}' 或 '未知'""" + for key, value in data.items(): + if all(v in ['/', '未知', {}] for v in value.values()): + data[key] = list(value.keys()) + return data +# 示例数据 +data = { + "第一包.耗材": { + "服务器": "未知", + "台式计算机": "未知", + "便携式计算机": "/", + "信息安全设备": {}, + "喷墨打印机": "/", + "激光打印机": "/", + "针式打印机": "/", + "液晶显示器": "/", + "扫描仪": "/", + "基础软件": "/", + "信息安全软件": "/", + "复印机": "/", + "投影仪": "/", + "多功能一体机": "/", + "触控一体机": "/", + "碎纸机": "/", + "复印纸": "/" + } +} -def extract_and_format_from_paths(json_paths, includes, excludes): - """ - 从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。 - - 参数: - json_paths (list): 包含多个 JSON 文件路径的列表。 - includes (list): 包含要检查的关键词的列表。 - excludes (list): 包含要排除的关键词的列表。 - - 返回: - list: 包含所有文件中满足条件的格式化字符串列表。 - """ - all_formatted_results = [] - - # 遍历每个文件路径 - for path in json_paths: - try: - with open(path, 'r', encoding='utf-8') as file: - # 加载 JSON 数据 - json_data = json.load(file) - formatted_results = [] - - # 遍历 JSON 数据的每个键值对 - for key, value in json_data.items(): - if isinstance(value, dict): - # 如果值是字典,检查嵌套字典的每个键值对 - for sub_key, sub_value in value.items(): - if any(include in sub_key for include in includes): - # 如果子值包含关键词,格式化并添加到结果列表 - formatted_results.append(f"{sub_value}") - elif isinstance(value, str): # clause - # 检查是否包含任何 include 关键词 - for include in includes: - if include in value: - # 找到 include 之前的内容 - prefix = value.split(include)[0] - # 检查 prefix 是否不包含任何 exclude 关键词 - if not any(exclude in prefix for exclude in excludes): - # 如果不包含任何 exclude 关键词,添加整个 value 到结果列表 - if '\n' in value: - value = value.split('\n', 1)[-1] - formatted_results.append(value) - break # 找到一个符合条件的就跳出循环 - - # 将当前文件的结果添加到总结果列表 - all_formatted_results.extend(formatted_results) - except FileNotFoundError: - print(f"Error: The file '{path}' does not exist.") - except json.JSONDecodeError: - print(f"Error: The file '{path}' contains invalid JSON.") - - return all_formatted_results - - -def extract_unique_items_from_texts(texts): - # 更新正则表达式以包括更广泛的序号类型,包括中文序号 - pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|\①|\②|\③|\④|\⑤|\⑥|\⑦|\⑧|\⑨|\⑩|\⑪|\⑫)\s*') - intro_pattern = re.compile(r'^.*?[::]') - punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$') - url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') - content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}') # 检查是否含有至少2个连续的字母、数字或汉字 - - all_results = [] - seen = set() - - for text in texts: - # 去除文本中的制表符和换行符 - text = text.replace('\t', '').replace('\n', '') - - # 删除引导性的文本(直到冒号,但保留冒号后的内容) - text = intro_pattern.sub('', text) - - # 替换URL为占位符,并保存URL以便后续还原 - urls = [] - def url_replacer(match): - urls.append(match.group(0)) - return f"{{URL{len(urls)}}}" - text = url_pattern.sub(url_replacer, text) - - # 使用数字和括号的模式分割文本 - items = pattern.split(text) - - for item in items: - cleaned_item = item.strip() - if cleaned_item: - # 进一步清理每个条目 - cleaned_item = pattern.sub('', cleaned_item) - cleaned_item = punctuation_pattern.sub('', cleaned_item) - cleaned_item = cleaned_item.strip() - - # 还原URL - for i, url in enumerate(urls, 1): - cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url) - - # 添加未见过的独特条目,确保它包含足够的实质内容并长度大于3个字符 - if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item): - seen.add(cleaned_item) - all_results.append(cleaned_item) - - return all_results -# 使用上面定义的函数 -truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json" -clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\clause1.json" -json_paths = [truncate_json_path,clause_path] # 根据实际存放的路径来填写 -includes = ["不得存在","不得与","禁止投标","对投标人的纪律"] -excludes=["招标","评标","定标"] -# 调用函数 -results = extract_and_format_from_paths(json_paths, includes,excludes) -print(results) -res=extract_unique_items_from_texts(results) -print(res) +# 转换字典 +converted_data = postprocess(data) +print(converted_data) diff --git a/flask_app/货物标/技术服务商务要求提取.py b/flask_app/货物标/技术服务商务要求提取.py new file mode 100644 index 0000000..01562fb --- /dev/null +++ b/flask_app/货物标/技术服务商务要求提取.py @@ -0,0 +1,128 @@ +# -*- encoding:utf-8 -*- +import json +import os + +from flask_app.main.多线程提问 import multi_threading +from flask_app.main.通义千问long import upload_file, qianwen_long +from flask_app.main.json_utils import clean_json_string, combine_json_results + + +def generate_key_paths(data, parent_key=''): + key_paths = [] + + for key, value in data.items(): + current_key = f"{parent_key}.{key}" if parent_key else key + if isinstance(value, dict): + if value: + # 检查字典中的值是否为字典、列表或字符串'未知' + contains_dict_list_or_unknown = any(isinstance(v, (dict, list)) or v == "未知" for v in value.values()) + if contains_dict_list_or_unknown: + # 递归生成键路径 + sub_paths = generate_key_paths(value, current_key) + if sub_paths: + # 如果子路径非空,则扩展 + key_paths.extend(sub_paths) + else: + # 当前字典内部为空或值全为"未知" + key_paths.append(current_key) + else: + # 字典中所有值都不是字典、列表或"未知",添加当前键 + key_paths.append(current_key) + else: + # 空字典,直接添加键路径 + key_paths.append(current_key) + elif isinstance(value, list): + # 列表类型,添加包含列表的键的路径 + if value: # 只有当列表非空时才添加 + key_paths.append(current_key) + elif value == "未知": + # 值为"未知",添加键路径 + key_paths.append(current_key) + + return key_paths + + +def get_technical_requirements(truncate_file): + user_query1 = "这是一份货物标中采购要求部分的内容,请告诉我需要采购的系统(货物),如果有采购清单,请直接根据清单上的货物名称给出结果,若没有采购清单,你要从文中摘取需要采购的系统(货物),采购需求中可能包含层次关系,如大系统中包含若干子系统,你需要保留这种层次关系,给出系统(货物)名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,需与原文保持一致,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。" + file_id = upload_file(truncate_file) + res = qianwen_long(file_id, user_query1) + print(res) + cleaned_res = clean_json_string(res) + keys_list = generate_key_paths(cleaned_res['采购需求']) # 提取需要采购的货物清单 + user_query_template = "这是一份货物标中采购要求部分的内容,请你给出\"{}\"的技术参数(或采购要求)和数量,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。" + queries = [] + for key in keys_list: + # 替换 user_query2 中的 "网络硬盘录像机" 为当前 key + new_query = user_query_template.format(key, key) + print(new_query) + queries.append(new_query) + results = multi_threading(queries, "", file_id, 2) + technical_requirements = [] + if not results: + print("errror!未获得大模型的回答!") + else: + # 打印结果 + for question, response in results: + technical_requirements.append(response) + technical_requirements_combined_res = combine_json_results(technical_requirements) + """根据所有键是否已添加处理技术要求""" + # 更新原始采购需求字典 + combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res) + final_res = postprocess(cleaned_res['采购需求']) + print("更新后的采购需求处理完成.") + + # 输出最终的 JSON 字符串 + json_string = json.dumps(final_res, ensure_ascii=False, indent=4) + return json_string + +def combine_and_update_results(original_data, updates): + def recursive_update(data, key, value): + # 处理点分隔的键,递归定位并更新嵌套字典 + keys = key.split('.') + for k in keys[:-1]: + data = data.setdefault(k, {}) + if isinstance(value, dict) and isinstance(data.get(keys[-1], None), dict): + data[keys[-1]] = {**data.get(keys[-1], {}), **value} + else: + data[keys[-1]] = value + for key, value in updates.items(): + recursive_update(original_data, key, value) + return original_data + +def postprocess(data): + """转换字典中的值为列表,如果所有键对应的值都是'/', '{}' 或 '未知'""" + for key, value in data.items(): + if all(v in ['/', '未知', {}] for v in value.values()): + data[key] = list(value.keys()) + return data +def test_all_files_in_folder(input_folder, output_folder): + # 确保输出文件夹存在 + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + # 遍历指定文件夹中的所有文件 + for filename in os.listdir(input_folder): + file_path = os.path.join(input_folder, filename) + # 检查是否是文件 + if os.path.isfile(file_path): + print(f"处理文件: {file_path}") + # 调用函数处理文件 + try: + json_result = get_technical_requirements(file_path) + # 定义输出文件的路径 + output_file_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.json') + # 保存JSON结果到文件 + with open(output_file_path, 'w', encoding='utf-8') as json_file: + json_file.write(json_result) + print(f"结果已保存到: {output_file_path}") + except Exception as e: + print(f"处理文件 {file_path} 时出错: {e}") + + +if __name__ == "__main__": + truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招标文件(107国道)_procurement.pdf" + res=get_technical_requirements(truncate_file) + print(res) + # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1" + # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3" + # test_all_files_in_folder(input_folder, output_folder) diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py new file mode 100644 index 0000000..6cef1cd --- /dev/null +++ b/flask_app/货物标/提取采购需求main.py @@ -0,0 +1,24 @@ +import json + +from flask_app.货物标.货物标截取pdf import truncate_pdf_main +from flask_app.main.format_change import docx2pdf, pdf2docx + + +#获取采购清单 +def fetch_purchasing_list(file_path,output_folder,file_type): + if file_type==1: + docx_path=file_path + pdf_path = docx2pdf(file_path) + elif file_type==2: + pdf_path=file_path + docx_path=pdf2docx(file_path) + else: + print("未传入指定格式的文件!") + return None + truncate_path=truncate_pdf_main(pdf_path,output_folder,1) + +if __name__ == "__main__": + output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output" + file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc" + fetch_purchasing_list(file_path,output_folder,1) + diff --git a/flask_app/货物标/提示词/prompt1.txt b/flask_app/货物标/提示词/prompt1.txt index b5f8b4a..844f660 100644 --- a/flask_app/货物标/提示词/prompt1.txt +++ b/flask_app/货物标/提示词/prompt1.txt @@ -1,10 +1,13 @@ #这是一份货物标中采购要求部分的内容,你需要摘取出采购清单,一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出名称和数量和单位,请以json格式返回,外层键名为"采购需求",如有未知内容,在对应键值处填"未知"。 -这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),小系统中也可能包含多个货物,你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为"采购需求",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填"未知"。 +#这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),小系统中也可能包含多个货物,你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为"采购需求",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填"未知"。 + +"这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出系统(货物)名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。" + #这是一份货物标中采购要求部分的内容,请你给出所需的设备名称以及设备的具体型号参数要求,请以json格式返回结果,外层键名为采购要求。 - -这是一份货物标中采购要求部分的内容,请你给出"网络硬盘录像机"的具体型号参数要求,请以json格式返回结果,外层键名为"网络硬盘录像机",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。 +这是一份货物标中采购要求部分的内容,请你给出\"{}\"的具体型号参数要求和数量,请以json格式返回结果,外层键名为\"{}\", 键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。 +#这是一份货物标中采购要求部分的内容,请你给出"网络硬盘录像机"的具体型号参数要求,请以json格式返回结果,外层键名为"网络硬盘录像机",键值对中的键是你对该要求的总结,而值需要完全与原文保持一致,不可擅自总结删减。 { "采购需求": { @@ -76,5 +79,71 @@ } } } +{ + "采购需求": { + "第一包": { + "办公电子设备": [ + "服务器", + "台式计算机", + "便携式计算机", + "信息安全设备", + "喷墨打印机", + "激光打印机", + "针式打印机", + "液晶显示器", + "扫描仪", + "基础软件", + "信息安全软件", + "复印机", + "投影仪", + "多功能一体机", + "触控一体机", + "碎纸机" + ], + "软件": [ + "基础软件", + "信息安全软件" + ], + "耗材": [ + "复印纸" + ] + }, + "第二包": { + "办公家电": [ + "空调机" + ] + }, + "第三包": { + "家具用具": [ + "床类", + "台、桌类", + "椅凳类", + "沙发类", + "柜类", + "架类", + "屏风类", + "厨卫用具", + "组合家具", + "家用家具零配件", + "其他家具用具" + ] + }, + "第四包": { + "印刷服务": "未知" + }, + "第五包": { + "汽车维修和保养服务": "未知" + }, + "第六包": { + "会计服务": "未知" + }, + "第七包": { + "工程造价咨询服务": "未知" + }, + "第八包": { + "机动车保险服务": "未知" + } + } +} diff --git a/flask_app/货物标/货物标截取pdf.py b/flask_app/货物标/货物标截取pdf.py index 8ec3065..ff5e417 100644 --- a/flask_app/货物标/货物标截取pdf.py +++ b/flask_app/货物标/货物标截取pdf.py @@ -1,91 +1,179 @@ from PyPDF2 import PdfReader, PdfWriter import re # 导入正则表达式库 import os # 用于文件和文件夹操作 +from flask_app.main.format_change import docx2pdf -def clean_page_numbers(text): - # 使用正则表达式删除页码 - # 假设页码在文本的最开始,紧跟着文字且无空格分隔 - cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 - # 删除结尾的页码 - cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text) - # 删除形如 /129 的页码 - cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text) - return cleaned_text -def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): - # 打开PDF文件 + +def clean_page_content(text, common_header): + # 首先删除抬头公共部分 + if common_header: # 确保有公共抬头才进行替换 + for header_line in common_header.split('\n'): + if header_line.strip(): # 只处理非空行 + # 替换首次出现的完整行 + text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1) + + # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除 + text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 + text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 + text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 + + return text +def extract_common_header(pdf_path): pdf_document = PdfReader(pdf_path) - start_page = None - end_page = None - # 遍历文档的每一页,查找开始和结束短语的位置 - for i in range(len(pdf_document.pages)): + headers = [] + total_pages = len(pdf_document.pages) + middle_page = total_pages // 2 # 计算中间页 + + # 确保中间页前后各一页(共3页),如果不足3页,则尽可能取足 + start_page = max(0, middle_page - 1) + num_pages_to_read = 3 + + for i in range(start_page, min(start_page + num_pages_to_read, total_pages)): page = pdf_document.pages[i] - text = page.extract_text() + text = page.extract_text() or "" if text: - cleaned_text = clean_page_numbers(text) - if re.search(begin_pattern, cleaned_text) and i > begin_page: - start_page = i - if start_page is not None and re.search(end_pattern, cleaned_text) and i > (start_page+1): - end_page = i - break - # 确保找到了起始和结束页面 - if start_page is None or end_page is None: - print(f"未找到起始或结束页在文件 {pdf_path} 中!") - return None + # 只取每页的前三行 + first_lines = text.strip().split('\n')[:3] + headers.append(first_lines) - # 创建一个新的PDF文档保存截取的页面 - base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # Get the base name without extension - output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") - output_doc = PdfWriter() + if len(headers) < 2: + return "" # 如果没有足够的页来比较,返回空字符串 - # 添加需要的页面,从 start_page 开始,包括 end_page - for page_num in range(start_page, end_page + 1): - output_doc.add_page(pdf_document.pages[page_num]) - # 保存新的PDF文件 - with open(output_pdf_path, 'wb') as f: - output_doc.write(f) + # 寻找每一行中的公共部分 + common_headers = [] + for lines in zip(*headers): + # 在每一行中寻找公共单词 + common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]]) + if common_line: + common_headers.append(' '.join(common_line)) - print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") + return '\n'.join(common_headers) - return output_pdf_path +def is_pdf_or_doc(filename): + # 判断文件是否为PDF或Word文档 + return filename.lower().endswith(('.pdf', '.doc', '.docx')) + +def convert_to_pdf(file_path): + # 假设 docx2pdf 函数已经被定义,这里仅根据文件扩展名来决定是否需要转换 + if file_path.lower().endswith(('.doc', '.docx')): + return docx2pdf(file_path) + return file_path def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): - # 确保输出文件夹存在 if not os.path.exists(output_folder): os.makedirs(output_folder) + generated_files = [] if os.path.isdir(input_path): - generated_files = [] # 遍历文件夹内的所有PDF文件 for file in os.listdir(input_path): - if file.endswith(".pdf"): - pdf_path = os.path.join(input_path, file) - output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - if output_pdf_path and os.path.isfile(output_pdf_path): + file_path = os.path.join(input_path, file) + if is_pdf_or_doc(file): + pdf_path = convert_to_pdf(file_path) + output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, + output_suffix) + if output_pdf_path: generated_files.append(output_pdf_path) - return generated_files elif os.path.isfile(input_path) and input_path.endswith(".pdf"): # 处理单个PDF文件 output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) - if output_pdf_path and os.path.isfile(output_pdf_path): - return [output_pdf_path] # 以列表形式返回,以保持一致性 + if output_pdf_path: + generated_files.append(output_pdf_path) else: print("提供的路径既不是文件夹也不是PDF文件。") - return [] + return generated_files + +def extract_pages_twice(pdf_path, output_folder, output_suffix,common_header): + pdf_document = PdfReader(pdf_path) + begin_page=5 + start_page = None + end_page = None + # 定义用于检测是否包含"文件的构成"的正则表达式 + exclusion_pattern = re.compile(r'文件的构成|文件的组成') + if output_suffix == "procurement": + begin_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|' + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|' + r'^[一二三四五六七八九十百千]+、\s*采购清单', + re.MULTILINE + ) + end_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', + re.MULTILINE + ) + for i in range(len(pdf_document.pages)): + page = pdf_document.pages[i] + text = page.extract_text() or "" + cleaned_text = clean_page_content(text,common_header) + if re.search(begin_pattern, cleaned_text) and i > begin_page and not re.search(exclusion_pattern, cleaned_text): + start_page = i + if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page: + end_page = i + break + if start_page is None or end_page is None: + print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!") + return "" + else: + return save_extracted_pages(pdf_document,start_page, end_page, pdf_path,output_folder, output_suffix) + + +def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): + try: + common_header = extract_common_header(pdf_path) + pdf_document = PdfReader(pdf_path) + start_page = None + end_page = None + for i in range(len(pdf_document.pages)): + page = pdf_document.pages[i] + text = page.extract_text() or "" + cleaned_text = clean_page_content(text,common_header) + if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: + start_page = i + if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page: + end_page = i + break + if start_page is None or end_page is None: + if output_suffix == "procurement": + return extract_pages_twice(pdf_path, output_folder, output_suffix,common_header) + else: + print(f"未找到起始或结束页在文件 {pdf_path} 中!") + return None + + return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) + except Exception as e: + print(f"Error processing {pdf_path}: {e}") + return None + +def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix): + base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + output_doc = PdfWriter() + for page_num in range(start_page, end_page + 1): + output_doc.add_page(pdf_document.pages[page_num]) + with open(output_pdf_path, 'wb') as f: + output_doc.write(f) + print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") + return output_pdf_path def truncate_pdf_main(input_path, output_folder, selection): if selection == 1: - # Configure patterns and phrases for "第三章 项目技术、服务及商务要求" - begin_pattern = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:服务|项目|商务).*?要求|第[一二三四五六七八九十百千]+章.*?采购.*') + # 更新的正则表达式以匹配"第x章"和"第x部分",考虑到可能的空格和其他文字 + begin_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|' + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*', + # r'^[一二三四五六七八九十百千]+、\s*采购清单', + ) begin_page = 5 - end_pattern = re.compile(r'第[一二三四五六七八九十百千]+章\s*(资格审查|评标方法|评审办法|评定办法)') - # 示例文本进行测试 - output_suffix = "tobidders_notice_table" + end_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' + ) + output_suffix = "procurement" else: print("无效的选择") return None - # Process the selected input + # 调用相应的处理函数 return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) def truncate_pdf_multiple(input_path, output_folder): @@ -96,10 +184,10 @@ def truncate_pdf_multiple(input_path, output_folder): return truncate_files if __name__ == "__main__": - input_path = "C:\\Users\\Administrator\\WPSDrive\\292826853\\WPS云盘\\应用\\输出为PDF\\公安局音视频监控系统设备采购项目(定稿)_20240829133603.pdf" + input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(107国道).pdf" output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1" - # truncate_pdf_multiple(input_path,output_folder) - selection = 1 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-招标公告-合同条款前 - generated_files = truncate_pdf_main(input_path, output_folder, selection) + truncate_pdf_multiple(input_path,output_folder) + # selection = 1 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-招标公告-合同条款前 + # generated_files = truncate_pdf_main(input_path, output_folder, selection) # print("生成的文件:", generated_files)