From 6463a9e593e497c25ae9a60652cedb24702c8b3d Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 29 Aug 2024 16:37:09 +0800 Subject: [PATCH] 8.29 --- .idea/.gitignore | 8 + .idea/first_pro.iml | 19 + .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + Dockerfile | 29 ++ __init__.py | 0 flask_app/__init__.py | 0 flask_app/generate/__init__.py | 0 flask_app/generate/find_dictory.py | 6 + flask_app/main/JSON内容提取.py | 58 +++ flask_app/main/__init__.py | 0 flask_app/main/download.py | 45 +++ flask_app/main/find_tbformat.py | 42 ++ flask_app/main/format_change.py | 69 ++++ flask_app/main/json_utils.py | 116 ++++++ flask_app/main/json提取.py | 28 ++ flask_app/main/start_up.py | 219 ++++++++++ flask_app/main/table_content_extraction.py | 87 ++++ flask_app/main/test.py | 54 +++ flask_app/main/ttt.py | 18 + flask_app/main/删除知识库.py | 55 +++ flask_app/main/判断是否分包等.py | 157 ++++++++ flask_app/main/商务标技术标整合.py | 43 ++ flask_app/main/回答来源.py | 216 ++++++++++ flask_app/main/基础信息整合.py | 135 +++++++ flask_app/main/多线程分类.py | 373 ++++++++++++++++++ flask_app/main/多线程提问.py | 189 +++++++++ flask_app/main/废标项.py | 66 ++++ flask_app/main/形式响应评审.py | 170 ++++++++ flask_app/main/截取pdf.py | 149 +++++++ flask_app/main/投标人须知正文提取指定内容.py | 120 ++++++ .../main/投标人须知正文条款提取成json文件.py | 152 +++++++ flask_app/main/招标文件解析.py | 216 ++++++++++ flask_app/main/按页读取pdf.py | 30 ++ flask_app/main/提取打勾符号.py | 83 ++++ flask_app/main/文件分类普通版.py | 350 ++++++++++++++++ flask_app/main/无效标和废标和禁止投标整合.py | 302 ++++++++++++++ flask_app/main/根据条款号整合json.py | 133 +++++++ flask_app/main/知识库操作.py | 57 +++ flask_app/main/禁止投标情形.py | 151 +++++++ flask_app/main/读取docx.py | 18 + flask_app/main/资格审查模块.py | 35 ++ flask_app/main/资格评审.py | 87 ++++ flask_app/main/资格评审前判断.py | 29 ++ flask_app/main/转化格式/__init__.py | 0 flask_app/main/转化格式/check_status.py | 30 ++ flask_app/main/转化格式/download.py | 41 ++ flask_app/main/转化格式/main_pdf_to_docx.py | 39 ++ 
flask_app/main/转化格式/pdf2doc.py | 48 +++ flask_app/main/转化格式/submit_conversion.py | 37 ++ flask_app/main/通义千问.py | 48 +++ flask_app/main/通义千问long.py | 67 ++++ flask_app/static/提示词/前两章提问总结.txt | 31 ++ flask_app/static/提示词/是否相关问题.txt | 10 + flask_app/static/提示词/第三章提示词.txt | 111 ++++++ flask_app/static/提示词/资格评审问题.txt | 35 ++ flask_app/货物标/__init__.py | 0 .../extract_procurement_requirements.py | 36 ++ flask_app/货物标/货物标截取pdf.py | 105 +++++ requirements.txt | 17 + 62 files changed, 4793 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/first_pro.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 Dockerfile create mode 100644 __init__.py create mode 100644 flask_app/__init__.py create mode 100644 flask_app/generate/__init__.py create mode 100644 flask_app/generate/find_dictory.py create mode 100644 flask_app/main/JSON内容提取.py create mode 100644 flask_app/main/__init__.py create mode 100644 flask_app/main/download.py create mode 100644 flask_app/main/find_tbformat.py create mode 100644 flask_app/main/format_change.py create mode 100644 flask_app/main/json_utils.py create mode 100644 flask_app/main/json提取.py create mode 100644 flask_app/main/start_up.py create mode 100644 flask_app/main/table_content_extraction.py create mode 100644 flask_app/main/test.py create mode 100644 flask_app/main/ttt.py create mode 100644 flask_app/main/删除知识库.py create mode 100644 flask_app/main/判断是否分包等.py create mode 100644 flask_app/main/商务标技术标整合.py create mode 100644 flask_app/main/回答来源.py create mode 100644 flask_app/main/基础信息整合.py create mode 100644 flask_app/main/多线程分类.py create mode 100644 flask_app/main/多线程提问.py create mode 100644 flask_app/main/废标项.py create mode 100644 flask_app/main/形式响应评审.py create mode 100644 flask_app/main/截取pdf.py create mode 100644 flask_app/main/投标人须知正文提取指定内容.py create mode 100644 
flask_app/main/投标人须知正文条款提取成json文件.py create mode 100644 flask_app/main/招标文件解析.py create mode 100644 flask_app/main/按页读取pdf.py create mode 100644 flask_app/main/提取打勾符号.py create mode 100644 flask_app/main/文件分类普通版.py create mode 100644 flask_app/main/无效标和废标和禁止投标整合.py create mode 100644 flask_app/main/根据条款号整合json.py create mode 100644 flask_app/main/知识库操作.py create mode 100644 flask_app/main/禁止投标情形.py create mode 100644 flask_app/main/读取docx.py create mode 100644 flask_app/main/资格审查模块.py create mode 100644 flask_app/main/资格评审.py create mode 100644 flask_app/main/资格评审前判断.py create mode 100644 flask_app/main/转化格式/__init__.py create mode 100644 flask_app/main/转化格式/check_status.py create mode 100644 flask_app/main/转化格式/download.py create mode 100644 flask_app/main/转化格式/main_pdf_to_docx.py create mode 100644 flask_app/main/转化格式/pdf2doc.py create mode 100644 flask_app/main/转化格式/submit_conversion.py create mode 100644 flask_app/main/通义千问.py create mode 100644 flask_app/main/通义千问long.py create mode 100644 flask_app/static/提示词/前两章提问总结.txt create mode 100644 flask_app/static/提示词/是否相关问题.txt create mode 100644 flask_app/static/提示词/第三章提示词.txt create mode 100644 flask_app/static/提示词/资格评审问题.txt create mode 100644 flask_app/货物标/__init__.py create mode 100644 flask_app/货物标/extract_procurement_requirements.py create mode 100644 flask_app/货物标/货物标截取pdf.py create mode 100644 requirements.txt diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/first_pro.iml b/.idea/first_pro.iml new file mode 100644 index 0000000..1d7e5a4 --- /dev/null +++ b/.idea/first_pro.iml @@ -0,0 +1,19 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml 
b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..727db8c --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..aa486af --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2927062 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +# 使用官方 Python 运行时作为父镜像 +FROM python:3.8-slim + +# 设置工作目录 +WORKDIR /ZbparseProjects + +RUN pip config set global.progress_bar off + +# 复制 requirements.txt 并安装依赖,确保每次构建都可以使用缓存(除非 requirements.txt 改变) +COPY ../../requirements.txt . + +# 安装依赖 +RUN pip install --upgrade pip --default-timeout=100 \ + && pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt + +# 将当前目录的内容复制到容器的 /PycharmProjects 中 +COPY .. . 
def find_keys_by_value(target_value, json_data):
    """Return the keys of *json_data* whose value matches *target_value*.

    Exact matches win. Only when there is no exact match does the function
    fall back to string values that start with *target_value*.

    Args:
        target_value: Value (normally a clause title) to look for.
        json_data: Flat dict mapping clause numbers to clause text.

    Returns:
        List of matching keys in dict order; empty when nothing matches.
    """
    matched_keys = [k for k, v in json_data.items() if v == target_value]
    if not matched_keys and isinstance(target_value, str):
        # Prefix fallback only makes sense for string targets; a non-string
        # target would make str.startswith raise TypeError.
        matched_keys = [
            k for k, v in json_data.items()
            if isinstance(v, str) and v.startswith(target_value)
        ]
    return matched_keys


def find_keys_with_prefix(key_prefix, json_data):
    """Return the keys that are sub-headings of *key_prefix*.

    A sub-heading must continue the prefix at a clause boundary, so "3.1"
    owns "3.1.1" but not "3.10" or "3.10.1" (a plain ``startswith`` check
    would wrongly include those).

    Args:
        key_prefix: Clause number such as "3.1" or "3.".
        json_data: Flat dict (or any iterable of string keys).

    Returns:
        List of sub-heading keys in iteration order, excluding *key_prefix*.
    """
    boundary = key_prefix if key_prefix.endswith('.') else key_prefix + '.'
    return [k for k in json_data if k.startswith(boundary) and k != key_prefix]
def download_file(url, local_filename, timeout=(10, 300)):
    """Download *url* to ``local_filename + <extension>`` and classify it.

    The extension is guessed from the response's Content-Type header and
    falls back to ``.docx`` when the header is missing or unrecognised.

    Args:
        url: Address of the file to fetch.
        local_filename: Destination path WITHOUT extension.
        timeout: ``requests`` timeout (connect, read) in seconds, so a stalled
            server cannot block the caller forever.

    Returns:
        ``(full_filename, file_type)`` where file_type is 1 for .docx, 2 for
        .pdf and 3 for anything else. On any failure ``(None, None)`` is
        returned, so callers that unpack two values do not raise TypeError.
    """
    type_codes = {'.docx': 1, '.pdf': 2}
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # raise on 4xx/5xx

            # Drop parameters such as "; charset=utf-8" - guess_extension only
            # understands the bare MIME type.
            raw_type = response.headers.get('Content-Type') or ''
            content_type = raw_type.split(';', 1)[0].strip().lower()
            extension = None
            if content_type:
                extension = mimetypes.guess_extension(content_type, strict=False)
            if not extension:
                # Unknown type: default to .docx
                extension = '.docx'
            full_filename = local_filename + extension

            with open(full_filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        return full_filename, type_codes.get(extension, 3)
    except requests.HTTPError as e:
        print(f"HTTP Error: {e}")
    except requests.RequestException as e:
        print(f"Error downloading the file: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return None, None
"D:\\项目\\工程招标\\tb_format.pdf" +keyword = "投标文件格式" # 修改为你想查找的关键字 +page_number = extract_contents_with_pages(pdf_path, keyword) +print(page_number) +if page_number is not None: + split_pdf(pdf_path, page_number, output_path) +else: + print("未找到含有关键字的页码") diff --git a/flask_app/main/format_change.py b/flask_app/main/format_change.py new file mode 100644 index 0000000..5c448bd --- /dev/null +++ b/flask_app/main/format_change.py @@ -0,0 +1,69 @@ +import json +import os + +import requests +from download import download_file + + +def upload_file(file_path, url): + receive_file_url = "" + # 定义文件名和路径 + filename = file_path.split('/')[-1] + + # 打开文件以二进制形式读取 + with open(file_path, 'rb') as f: + # 使用multipart/form-data格式发送文件 + files = {'file': (filename, f)} + + # 发送POST请求 + response = requests.post(url, files=files) + + # 检查响应状态码 + if response.status_code == 200: + print("文件上传成功") + receive_file_response = response.content.decode('utf-8') + receive_file_json = json.loads(receive_file_response) + receive_file_url = receive_file_json["data"] + + else: + print(f"文件上传失败,状态码: {response.status_code}") + print(response.text) + + return receive_file_url + +def get_filename_and_folder(file_path): + # 使用os.path.basename获取文件名 + filename = os.path.splitext(os.path.basename(file_path))[0] #ztb_tobidders_notice_table,不包括扩展名 + # 使用os.path.dirname获取文件所在的完整目录路径,再次使用basename获取文件夹名称 + directory = os.path.dirname(file_path) + return filename, directory + +#参数为要转换的文件路径,以及输出的文件名,文件类型为自动补全。 +def pdf2docx(local_path_in): + remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/p2d' + receive_download_url = upload_file(local_path_in, remote_url) + filename, folder = get_filename_and_folder(local_path_in) #输入输出在同一个文件夹 + local_path_out=os.path.join(folder,filename) #输出文件名 + downloaded_filepath,file_type=download_file(receive_download_url, local_path_out) + print("have downloaded file to:",downloaded_filepath) + return downloaded_filepath + +def docx2pdf(local_path_in): + remote_url = 
def extract_content_from_json(json_data):
    """Extract the JSON object embedded in *json_data* and return it as a dict.

    The text between the first ``{`` and the last ``}`` is parsed first. If
    that fails (for example because more braces follow the object), the first
    complete object starting at the first ``{`` is parsed instead.

    Args:
        json_data: Text that may wrap a JSON object in other content
            (code fences, explanations, ...).

    Returns:
        The parsed dict, or ``{}`` for empty, non-string or unparsable input.
    """
    if not isinstance(json_data, str) or not json_data.strip():
        return {}
    match = re.search(r'\{[\s\S]*\}', json_data)
    if not match:
        print("No valid JSON content found.")
        return {}
    candidate = match.group(0)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError as e:
        try:
            # Greedy span was not valid JSON; decode just the leading object.
            parsed, _ = json.JSONDecoder().raw_decode(candidate)
            return parsed
        except json.JSONDecodeError:
            print(f"JSON decode error: {e}")
            return {}


def clean_json_string(json_string):
    """Parse the JSON object embedded in *json_string*; see extract_content_from_json."""
    return extract_content_from_json(json_string)


def combine_json_results(json_lists):
    """Merge several JSON-like strings into one dict.

    Args:
        json_lists: Iterable of strings, each containing a JSON object.

    Returns:
        A dict with the keys of every parsed object; later entries override
        earlier ones. Blank and non-string entries are skipped.
    """
    combined_result = {}
    for json_str in json_lists:
        if isinstance(json_str, str) and json_str.strip():
            combined_result.update(clean_json_string(json_str))
    return combined_result
def add_keys_to_json(target_dict, source_dict):
    """Merge *source_dict* into the dict stored under the single key of *target_dict*.

    Args:
        target_dict (dict): Dict expected to hold exactly one top-level key
            whose value is itself a dict.
        source_dict (dict): Entries to add to that inner dict.

    Returns:
        dict: ``{}`` when *target_dict* is empty; otherwise *target_dict*,
        updated in place when it has the expected shape and unchanged when
        it does not.
    """
    if not target_dict:
        print("Error: Target dictionary is empty.")
        return {}

    if len(target_dict) != 1:
        print("Error: Target dictionary must contain exactly one top-level key.")
        return target_dict

    # Exactly one entry: unpack it.
    (target_key, inner), = target_dict.items()

    if isinstance(inner, dict):
        inner.update(source_dict)
        target_dict[target_key] = inner
    else:
        print(f"Error: The value under the key '{target_key}' is not a dictionary.")

    return target_dict


def rename_outer_key(original_data, new_key):
    """Re-wrap the first top-level value of *original_data* under *new_key*.

    Returns:
        A JSON string ``{new_key: <first value>}``, or an empty dict when
        *original_data* is empty or not a dict.
    """
    valid = bool(original_data) and isinstance(original_data, dict)
    if not valid:
        return {}

    for first_value in original_data.values():
        break
    else:
        first_value = {}

    return json.dumps({new_key: first_value}, ensure_ascii=False)
class CSTFormatter(logging.Formatter):
    """Formatter that renders record timestamps in China Standard Time (UTC+8)."""

    def formatTime(self, record, datefmt=None):
        """Return the creation time of *record* as a UTC+8 string.

        Args:
            record: The ``logging.LogRecord`` being formatted.
            datefmt: Optional ``strftime`` pattern. Without it the result is
                ``YYYY-mm-dd HH:MM:SS``, followed by ``,mmm`` milliseconds
                when the format string uses ``%(asctime)s``.
        """
        # Local import: the file's top-level import only brings in
        # datetime and timedelta.
        from datetime import timezone

        # Convert the epoch timestamp straight to UTC+8. Adding 8 hours to
        # naive local time is only right when the host clock runs in UTC.
        cst = timezone(timedelta(hours=8))
        ct = datetime.fromtimestamp(record.created, tz=cst)
        if datefmt:
            return ct.strftime(datefmt)
        s = ct.strftime("%Y-%m-%d %H:%M:%S")
        if self.usesTime():
            # record.msecs is a float; the 'd' format code rejects floats,
            # so convert explicitly.
            s = f"{s},{int(record.msecs):03d}"
        return s
=f"C:/Users/Administrator/Desktop/招标文件/test/{unique_id}" + if not os.path.exists(output_folder): + os.makedirs(output_folder, exist_ok=True) + log_filename = "log.txt" + log_path = os.path.join(output_folder, log_filename) + logger = logging.getLogger(unique_id) # 使用 unique_id 作为日志器名字 + if not logger.handlers: # 避免重复添加处理器 + # 文件处理器 + file_handler = logging.FileHandler(log_path) + file_formatter = CSTFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + # 流处理器(控制台输出) + stream_handler = logging.StreamHandler(sys.stdout) + stream_formatter = logging.Formatter('%(message)s') # 简化的格式,只输出消息 + stream_handler.setFormatter(stream_formatter) + logger.addHandler(stream_handler) + + logger.setLevel(logging.INFO) + return logger, output_folder + +@app.route('/upload', methods=['POST']) +# def zbparse(): +# file_url = validate_request() +# if isinstance(file_url, tuple): # Check if the returned value is an error response +# return file_url +# try: +# app.logger.info("starting parsing url:"+file_url) +# final_json_path, output_folder,logger = download_and_process_file(file_url) +# if not final_json_path: +# return jsonify({'error': 'File processing failed'}), 500 +# response = generate_response(final_json_path,logger) # 先获取响应内容 +# # remove_directory(output_folder) # 然后删除文件夹 +# return response # 最后返回获取的响应 +# except Exception as e: +# app.logger.error('Exception occurred: ' + str(e)) # 使用全局 logger 记录 +# return jsonify({'error': str(e)}), 500 +def zbparse(): + file_url = validate_request() + if isinstance(file_url, tuple): # Check if the returned value is an error response + return file_url + try: + app.logger.info("starting parsing url:" + file_url) + return Response(stream_with_context(process_and_stream(file_url)), content_type='text/event-stream') + except Exception as e: + app.logger.error('Exception occurred: ' + str(e)) + return jsonify({'error': str(e)}), 500 + + +def 
process_and_stream(file_url): + unique_id = str(uuid.uuid4()) + logger, output_folder = create_logger(unique_id) + filename = "ztbfile" + downloaded_filename = os.path.join(output_folder, filename) + + downloaded_filepath, file_type = download_file(file_url, downloaded_filename) + + if downloaded_filepath is None or file_type == 3: + logger.error("Unsupported file type or failed to download file") + error_response = { + 'message': 'File processing failed', + 'filename': None, + 'data': json.dumps({'error': 'File processing failed'}) + } + yield f"data: {json.dumps(error_response)}\n\n" + return + + logger.info("Local file path: " + downloaded_filepath) + + for data in main_processing(output_folder, downloaded_filepath, file_type, unique_id): + response = { + 'message': 'Processing', + 'filename': os.path.basename(downloaded_filepath), + 'data': data + } + yield f"data: {json.dumps(response)}\n\n" + + final_response = { + 'message': 'File uploaded and processed successfully', + 'filename': os.path.basename(downloaded_filepath), + 'data': 'END' + } + yield f"data: {json.dumps(final_response)}\n\n" + +def validate_request(): + if not request.is_json: + return jsonify({'error': 'Missing JSON in request'}), 400 + file_url = request.json.get('file_url')[0] + if not file_url: + return jsonify({'error': 'No file URL provided'}), 400 + return file_url + +def download_and_process_file(file_url): + unique_id = str(uuid.uuid4()) # 生成一个唯一的 UUID + logger, output_folder = create_logger(unique_id) + filename = "ztbfile" + downloaded_filename = os.path.join(output_folder, filename) + + # 下载文件,假设 download_file 函数已正确处理异常并返回文件路径 + downloaded_filepath,file_type = download_file(file_url, downloaded_filename) + + if downloaded_filepath is None or file_type == 3: + logger.error("Unsupported file type or failed to download file") + return None, output_folder, logger + + logger.info("Local file path: " + downloaded_filepath) + processed_file_path = main_processing(output_folder, 
@app.route('/api/test_zbparse', methods=['POST'])
def test_zbparse():
    """Handle POST /api/test_zbparse: stream canned results as SSE for client testing."""
    try:
        return Response(stream_with_context(test_process_and_stream()),
                        content_type='text/event-stream')
    except Exception as e:
        app.logger.error('Exception occurred: ' + str(e))
        return jsonify({'error': str(e)}), 500


def test_process_and_stream(interval=5):
    """Yield five mock result frames followed by an END frame.

    Args:
        interval: Seconds to wait after each data frame (default 5, the
            delay the code has always used).

    Yields:
        SSE ``data:`` frames carrying ``message``, ``filename`` and ``data``.
    """
    # Five mock data segments
    data_segments = [
        {"base_info": {"project_name": "测试项目1", "project_code": "TP001"}},
        {"review_standards": ["标准1", "标准2", "标准3"]},
        {"evaluation_standards": ["评估标准A", "评估标准B"]},
        {"invalid_requirements": ["无效要求X", "无效要求Y"]},
        {"bidding_documents_requirements": ["文件要求1", "文件要求2"]}
    ]

    filename = "test_file.pdf"

    for i, data in enumerate(data_segments, 1):
        response = {
            'message': f'Processing segment {i}',
            'filename': filename,
            'data': data
        }
        yield f"data: {json.dumps(response)}\n\n"
        time.sleep(interval)  # pause between segments

    # End-of-stream marker
    final_response = {
        'message': 'File processed successfully',
        'filename': filename,
        'data': 'END'
    }
    yield f"data: {json.dumps(final_response)}\n\n"
def remove_directory(path):
    """Delete the directory tree at *path*, logging success or failure.

    Failures are logged on the application logger and not re-raised.
    """
    try:
        shutil.rmtree(path)
        app.logger.info(f"Successfully removed directory: {path}")
    except Exception as e:
        app.logger.error(f"Failed to remove directory {path}: {str(e)}")


if __name__ == '__main__':
    # The Werkzeug debugger allows arbitrary code execution, so it must not be
    # on by default while listening on every interface. Enable it explicitly
    # with FLASK_DEBUG=1 for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)
def flatten_nested_dicts(d):
    """Collapse redundant single-entry sub-dicts of *d*, in place.

    A sub-dict is replaced by its only value when, after being flattened
    itself, it has exactly one entry and either that entry's key equals the
    parent key or its key equals its value. Entries collapsed by the first
    rule are removed and re-inserted, so they move to the end of *d*.

    Returns:
        The same dict object *d*.
    """
    replacements = {}
    moved_keys = []

    for name in list(d):
        child = d[name]
        if not isinstance(child, dict):
            continue
        child = flatten_nested_dicts(child)
        if len(child) != 1:
            continue
        (only_key, only_value), = child.items()
        if only_key == name:
            moved_keys.append(name)
            replacements[name] = only_value
        elif only_key == only_value:
            replacements[name] = only_value

    for name in moved_keys:
        d.pop(name)
    d.update(replacements)

    return d


def save_data_to_json(data, filename):
    """Write *data* to *filename* as UTF-8 JSON with 4-space indentation."""
    serialized = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as handle:
        handle.write(serialized)


def extract_tables_main(path, output_filename):
    """Read the tables of the DOCX at *path*, flatten them and save them as JSON."""
    flattened = flatten_nested_dicts(read_tables_from_docx(path))
    save_data_to_json(flattened, output_filename)
    print(f"The data has been processed and saved to '{output_filename}'.")
import re

# Chapter headings of the form "第N章 ... 要求" that mention
# 项目 / 服务 / 商务 somewhere before "要求".
pattern = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:项目|服务|商务).*?要求')

# Sample headings used to try the pattern out.
text = """
第一章项目技术、服务及商务要求
第二章 服务细节要求
第三章 商务处理要求
第四章 项目安排要求
第五章 安全要求
"""

# Collect every full match, then print one per line.
matches = [found.group(0) for found in pattern.finditer(text)]
for heading in matches:
    print(heading)
+import os +from alibabacloud_bailian20231229.client import Client as bailian20231229Client +from alibabacloud_tea_openapi import models as open_api_models +from alibabacloud_bailian20231229 import models as bailian_20231229_models +from alibabacloud_tea_util import models as util_models +from alibabacloud_tea_util.client import Client as UtilClient + +def create_client() -> bailian20231229Client: + """ + 使用AK&SK初始化账号Client + @return: Client + @throws Exception + """ + config = open_api_models.Config( + access_key_id=os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID'], + access_key_secret=os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET'] + ) + config.endpoint = 'bailian.cn-beijing.aliyuncs.com' + return bailian20231229Client(config) + +def delete_index(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: + delete_index_request = bailian_20231229_models.DeleteIndexRequest( + index_id=index_id + ) + runtime = util_models.RuntimeOptions() + headers = {} + try: + response = client.delete_index_with_options(workspace_id, delete_index_request, headers, runtime) + print("API Response:", response) + except Exception as error: + print(error.message) + print(error.data.get("Recommend")) + UtilClient.assert_as_string(error.message) + +async def delete_index_async(client: bailian20231229Client, workspace_id: str, index_id: str) -> None: + delete_index_request = bailian_20231229_models.DeleteIndexRequest( + index_id=index_id + ) + runtime = util_models.RuntimeOptions() + headers = {} + try: + response = await client.delete_index_with_options_async(workspace_id, delete_index_request, headers, runtime) + print("API Response:", response) + except Exception as error: + print(error.message) + print(error.data.get("Recommend")) + UtilClient.assert_as_string(error.message) + +if __name__ == '__main__': + workspace_id = os.environ['DASHSCOPE_WORKSPACE_ID'] + index_id = 'pg5rrsv26x' + client = create_client() + delete_index(client, workspace_id, index_id) diff --git 
a/flask_app/main/判断是否分包等.py b/flask_app/main/判断是否分包等.py new file mode 100644 index 0000000..3cff093 --- /dev/null +++ b/flask_app/main/判断是否分包等.py @@ -0,0 +1,157 @@ +# -*- encoding:utf-8 -*- +import json +import os.path +import re +from json_utils import extract_content_from_json # 可以选择性地导入特定的函数 +from 提取打勾符号 import read_pdf_and_judge_main +from 通义千问 import qianwen_ask +from 通义千问long import qianwen_long,upload_file +#调用qianwen-ask之后,组织提示词问百炼。 + +def construct_judge_questions(json_data): + # 使用 extract_content_from_json 提取和解析 JSON 数据 + parsed_data = extract_content_from_json(json_data) + if not parsed_data: + return "" + + question_keys = [] + for key, value in parsed_data.items(): + if value == '未知': + question_keys.append(f"'{key}'") + + if not question_keys: + return "" + + # 移除单引号后的键名列表字符串 + questions_without_quotes = ', '.join(key.strip("'") for key in question_keys) # 移除单引号 + + if not questions_without_quotes: # 检查 questions_without_quotes 是否为空 + return "" + + keys_str = ",".join(question_keys) + question = f"请你依据文档中的信息回答,{questions_without_quotes}?请按json格式给我提供信息,键名分别为{keys_str},键值仅限于'是','否','未知'。" + + return question + + +def merge_json_to_list(merged): + """Merge updates into the original data by modifying specific keys based on their value ('是' or '否'), and create a list based on these values.""" + chosen_numbers = [] + + # 处理是否允许分包 保持'是否允许分包'键名主要是由于存在'未知'的情况。 + if merged.get('是否允许分包') == '是': + chosen_numbers.append(1) + merged.pop('是否允许分包', None) + elif merged.get('是否允许分包') == '否': + merged['分包'] = '不允许' + merged.pop('是否允许分包', None) + + # 处理是否递交投标保证金 + if merged.get('是否递交投标保证金') == '是': + chosen_numbers.extend([2, 3]) + merged.pop('是否递交投标保证金', None) + elif merged.get('是否递交投标保证金') == '否': + merged['投标保证金'] = '不提交' + merged['退还投标保证金'] = '/' + merged.pop('是否递交投标保证金', None) + + # 处理是否有履约保证金 + if merged.get('是否有履约保证金') == '是': + chosen_numbers.append(4) + merged.pop('是否有履约保证金', None) + elif merged.get('是否有履约保证金') == '否': + merged['履约保证金'] = '不提交' + 
merged.pop('是否有履约保证金', None) + + # 处理是否有招标代理服务费 + if merged.get('是否有招标代理服务费') == '是': + chosen_numbers.append(5) + merged.pop('是否有招标代理服务费', None) + elif merged.get('是否有招标代理服务费') == '否': + merged['招标代理服务费'] = '无' + merged.pop('是否有招标代理服务费', None) + + if merged.get('是否组织踏勘现场') == '是': + chosen_numbers.append(6) + merged.pop('是否组织踏勘现场',None) + elif merged.get('是否组织踏勘现场') == '否': + merged['踏勘现场']='不组织' + merged.pop('是否组织踏勘现场', None) + + if merged.get('是否召开投标预备会') == '是': + chosen_numbers.append(7) + merged.pop('是否召开投标预备会',None) + elif merged.get('是否召开投标预备会') == '否': + merged['投标预备会']='不召开' + merged.pop('是否召开投标预备会', None) + + if merged.get('是否允许偏离') == '是': + chosen_numbers.append(8) + merged.pop('是否允许偏离',None) + elif merged.get('是否允许偏离') == '否': + merged['偏离']='不允许' + merged.pop('是否允许偏离', None) + + return chosen_numbers, json.dumps(merged,ensure_ascii=False) + + + +def read_questions_from_judge(file_path, indices): + questions = [] + + # 读取文件内容 + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + + # 正则表达式提取问题 + pattern = r'(\d+)\.(.*?)#pdf提取之后的提示词|(\d+)\.(.*?)(?=\d+\.|$)' + matches = re.findall(pattern, content, re.DOTALL) + + # 解析匹配到的内容并提取对应序号的问题 + for match in matches: + num = match[0] or match[2] + question = match[1].strip() or match[3].strip() + if int(num) in indices: + questions.append(question) + + return questions + + +def judge_whether_main(file_path,output_folder): #传入招标文件中‘投标人须知前附表’ + prompt = "请你依据以上信息,回答以下问题:是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 
是否需要递交投标保证金?是否需要提交履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否递交投标保证金','是否提交履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在矛盾信息,请回答'未知'。" + output_json_path = os.path.join(output_folder,'judge_exist.json') + read_pdf_and_judge_main(file_path, output_json_path) #提取打勾符号 + qianwen_answer = qianwen_ask(output_json_path, prompt) # 调用普通千问判断是、否、未知 + print("qianwen_answer:" + qianwen_answer) + user_query = construct_judge_questions(qianwen_answer) # 提取回答为”未知“的键 + # 判断user_query是否为空 + if user_query: + print("user_query:" + user_query) + file_id = upload_file(file_path) + res = qianwen_long(file_id, user_query) #整个前附表一起传问千问long + print(res) + return process_judge_content(qianwen_answer, res) + + else: + print("Normal sig!No valid user query available. Skipping further actions.") + original = extract_content_from_json(qianwen_answer) + return merge_json_to_list(original) + + +def process_judge_content(original_json, update_json): #用新的数据合并旧数据 + """Process judging content by merging updates into the original JSON data.""" + original = extract_content_from_json(original_json) + updates = extract_content_from_json(update_json) + original.update(updates) + print(original) + return merge_json_to_list(original) + + + +if __name__ == "__main__": + file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\test2\\zbtest10_tobidders_notice_table.pdf" + output_dir="C:\\Users\\Administrator\\Desktop\\招标文件\\output1" + chosen_numbers, merged=judge_whether_main(file_path,output_dir) + print(chosen_numbers) + print(merged) + diff --git a/flask_app/main/商务标技术标整合.py b/flask_app/main/商务标技术标整合.py new file mode 100644 index 0000000..642a191 --- /dev/null +++ b/flask_app/main/商务标技术标整合.py @@ -0,0 +1,43 @@ +import json + +from json_utils import clean_json_string, combine_json_results +from 通义千问long import upload_file, qianwen_long + + +def combine_business_and_bidding(data): + # 提取并移除“商务标” + business_data = data.pop("商务标", {}) + + # 尝试提取不同命名的“投标报价” + 
bidding_data = data.pop("投标报价", None) + if bidding_data is None: + bidding_data = data.pop("投标报价评审标准", {}) + + # 将“商务标”和“投标报价”数据合并 + business_data.update({ + "投标报价": bidding_data + }) + + # 将修改后的商务标数据重新放入主数据结构 + data["商务标"] = business_data + + return data +def combine_evaluation_standards(truncate2): + # 商务标、技术标评分项:千问 + print("starting商务标技术标...") + file_id = upload_file(truncate2) + user_query_2 = ( + "根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容") + evaluation_res = qianwen_long(file_id, user_query_2) + + update_json=combine_business_and_bidding(clean_json_string(evaluation_res)) + update_json_str = json.dumps(update_json, ensure_ascii=False) + temp=combine_json_results([update_json_str]) + evaluation_combined_res = json.dumps(temp,ensure_ascii=False,indent=4) + print("商务标技术标done") + return evaluation_combined_res + +if __name__ == "__main__": + truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\output\\zbfile_evaluation_method.pdf" + res=combine_evaluation_standards(truncate2) + print(res) \ No newline at end of file diff --git a/flask_app/main/回答来源.py b/flask_app/main/回答来源.py new file mode 100644 index 0000000..2f0711c --- /dev/null +++ b/flask_app/main/回答来源.py @@ -0,0 +1,216 @@ +#基于多线程提问,现已废弃 +# assistant_id +import queue +import concurrent.futures +from dashscope import Assistants, Messages, Runs, Threads +from llama_index.indices.managed.dashscope import DashScopeCloudRetriever +from json_utils import extract_content_from_json +prompt = """ +# 角色 +你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 + +## 技能 +### 技能 1:文档解析与摘要 +- 深入理解并分析${document1}的内容,提取关键信息。 +- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 + +### 技能 2:信息检索与关联 +- 在${document1}中高效检索特定信息或关键词。 +- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 + +## 限制 +- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。 +- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 +- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 + +请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。 
+""" +prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}' + + + + +#正文和文档名之间的内容 +def extract_content_between_tags(text): + results = [] + + # 使用“【正文】”来分割文本 + parts = text.split('【正文】')[1:] # 跳过第一个分割结果,因为它前面不会有内容 + + for index, part in enumerate(parts): + # 查找“【文档名】”标签的位置 + doc_name_index = part.find('【文档名】') + # 查找 'file_ids' 标签的位置 + file_ids_index = part.find("'file_ids'") + + # 根据是否找到“【文档名】”来决定提取内容的截止点 + if doc_name_index != -1: + end_index = doc_name_index + elif file_ids_index != -1: + end_index = file_ids_index + else: + end_index = len(part) + + # 提取内容 + content = part[:end_index].strip() + results.append(content) + + # 如果存在 file_ids,处理最后一部分特别提取 file_ids 前的内容 + if "'file_ids'" in parts[-1]: + file_ids_index = parts[-1].find("'file_ids'") + if file_ids_index != -1: + last_content = parts[-1][:file_ids_index].strip() + results[-1] = last_content # 更新最后一部分的内容,确保只到 file_ids + + return results + +def find_references_in_extracted(formatted_ans, extracted_references): + results = {} # 用来存储匹配结果的字典 + + # 递归函数,用于处理多层嵌套的字典 + def recurse_through_dict(current_dict, path=[]): + for key, value in current_dict.items(): + # 检查值是否还是字典,如果是,进行递归 + if isinstance(value, dict): + recurse_through_dict(value, path + [key]) + else: + # 特定值处理:如果值为'未知',直接设置索引为-1 + if value == '未知': + results['.'.join(path + [key])] = -1 + else: + # 进行匹配检查 + found = False + for index, reference in enumerate(extracted_references): + if str(value) in reference: # 转换为字符串,确保兼容性 + results['.'.join(path + [key])] = index # 使用点表示法记录路径 + found = True + break + if not found: + results['.'.join(path + [key])] = None + + # 从根字典开始递归 + recurse_through_dict(formatted_ans) + return results + +def send_message(assistant, message='百炼是什么?'): + ans = [] + print(f"Query: {message}") + + # create thread. + thread = Threads.create() + print(thread) + + # create a message. 
+ message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + print(run) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + print(run_status) + reference_txt = str(run_status) + extracted_references = extract_content_between_tags(reference_txt) #引用的文章来源list + # get the thread messages. + msgs = Messages.list(thread.id) + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + return ans,extracted_references + +def rag_assistant(knowledge_name): + retriever = DashScopeCloudRetriever(knowledge_name) + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + temperature='0.3', + instructions=prom, + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + } + } + } + }] + ) + return assistant + + +def pure_assistant(): + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,能基于用户的要求精准简洁地回答用户的提问', + instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问', + tools=[ + { + "type": "code_interpreter" + }, + ] + ) + return assistant + +def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True): + if use_rag: + assistant = rag_assistant(knowledge_name) + else: + assistant = pure_assistant() + ans,extracted_references = send_message(assistant, message=question) + for index, reference in enumerate(extracted_references, start=0): + print(f"{index}. 
{reference}") + formatted_ans=extract_content_from_json(ans[1]) + print(formatted_ans) + + results = find_references_in_extracted(formatted_ans, extracted_references) + for key, index in results.items(): + print(f"{key}: Found at index {index}") + result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) + +def multi_threading(queries, knowledge_name, use_rag=True): + result_queue = queue.Queue() + + # 使用 ThreadPoolExecutor 管理线程 + with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: + # 使用字典保存每个提交的任务的Future对象,以便按顺序访问结果 + future_to_query = {executor.submit(llm_call, query, knowledge_name, result_queue, index, use_rag): index for + index, query in enumerate(queries)} + + # 收集每个线程的结果 + for future in concurrent.futures.as_completed(future_to_query): + index = future_to_query[future] + # 由于 llm_call 函数本身会处理结果,这里只需要确保任务执行完成 + try: + future.result() # 可以用来捕获异常或确认任务完成 + except Exception as exc: + print(f"Query {index} generated an exception: {exc}") + + # 从队列中获取所有结果并按索引排序 + results = [None] * len(queries) + while not result_queue.empty(): + index, result = result_queue.get() + results[index] = result + return results + +if __name__ == "__main__": + # 读取问题列表 + questions = ["该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"] + knowledge_name = "招标解析5word" + results = multi_threading(questions, knowledge_name, use_rag=True) + # 打印结果 + for question, response in results: + print(f"Question: {question}") + print(f"Response: {response}") diff --git a/flask_app/main/基础信息整合.py b/flask_app/main/基础信息整合.py new file mode 100644 index 0000000..d3e6cc4 --- /dev/null +++ b/flask_app/main/基础信息整合.py @@ -0,0 +1,135 @@ +from json_utils import clean_json_string, nest_json_under_key,rename_outer_key, combine_json_results +from 投标人须知正文提取指定内容 import extract_from_notice +from 判断是否分包等 import judge_whether_main, read_questions_from_judge +from 多线程提问 import read_questions_from_file, multi_threading +from 
通义千问long import upload_file +def combine_basic_info(baseinfo_list): + combined_baseinfo_list = [] + key_groups = { + "招标人/代理信息": ["招标人","招标人联系方式", "招标代理机构","招标代理机构联系方式"], + "项目信息": ["工程名称", "招标编号","工程概况","招标范围","招标控制价","投标竞争下浮率","是否接受联合体投标"], + "关键时间/内容":["投标文件递交截止日期","递交方式","投标人要求澄清招标文件的截止时间","投标有效期","评标结果公示媒介"], + "保证金相关":['质量保证金','退还投标保证金'], + "其他信息":["重新招标、不再招标和终止招标","是否退还投标文件","费用承担"] + } + # 将所有基础信息合并到一个字典中 + combined_data = {} + relevant_keys_detected = set() + + # 预处理以决定哪些键名将被使用 + for baseinfo in baseinfo_list: + json_data = clean_json_string(baseinfo) + combined_data.update(json_data) + relevant_keys_detected.update(json_data.keys()) + # for key in relevant_keys.keys(): + # if key in json_data: + # relevant_keys[key] = True + + # 根据检测到的键动态调整 key_groups + dynamic_key_handling(key_groups, relevant_keys_detected) + + # 打印 key_groups 的内容检查它们是否被正确更新 + print("Updated key_groups after dynamic handling:") + print(key_groups) + + # 使用合并后的字典创建最终输出 + for group_name, keys in key_groups.items(): + group_data = {key: combined_data.get(key, "未提供") for key in keys} + combined_json = nest_json_under_key(group_data, group_name) + combined_baseinfo_list.append(combined_json) + + return combined_baseinfo_list + +def dynamic_key_handling(key_groups, detected_keys): + # 检查和调整键组配置 + for key in detected_keys: + if "投标保证金" in key or "履约保证金" in key: + key_groups["保证金相关"].append(key) + elif "联合体投标要求" in key: + key_groups["项目信息"].append(key) + elif "分包" in key: + key_groups["项目信息"].append(key) + elif "踏勘现场" in key: + key_groups["其他信息"].append(key) + elif "投标预备会" in key: + key_groups["其他信息"].append(key) + elif "偏离" in key: + key_groups["其他信息"].append(key) + +def judge_consortium_bidding(baseinfo_list): + for baseinfo in baseinfo_list: + json_data = clean_json_string(baseinfo) + # 检查 "是否接受联合体投标" 键是否存在且其值为 "是" + if json_data.get("是否接受联合体投标") == "是": + return True + return False +def project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表 + # 调用大模型回答项目基础信息 + 
print("starting基础信息...") + baseinfo_list = [] + # baseinfo_file_path='../static/提示词/前两章提问总结.txt' + baseinfo_file_path = 'static/提示词/前两章提问总结.txt' # 替换为你的txt文件路径 + questions = read_questions_from_file(baseinfo_file_path) + res1 = multi_threading(questions, knowledge_name) + for _, response in res1: # _占位,代表ques;response[0]也是ques;response[1]是ans + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素 + baseinfo_list.append(response[1]) + else: + print(f"Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"Error processing response for query index {_}: {e}") + print("basic信息done...") + # 判断是否分包、是否需要递交投标保证金等 + chosen_numbers, merged = judge_whether_main(truncate0,output_folder) + baseinfo_list.append(merged) + # judge_file_path = '../static/提示词/是否相关问题.txt' + judge_file_path ='static/提示词/是否相关问题.txt' + judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) + + + judge_consortium = judge_consortium_bidding(baseinfo_list) #通过招标公告判断是否接受联合体投标 + if judge_consortium: + judge_consortium_question = "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求'。" + judge_questions.append(judge_consortium_question) + + file_id=upload_file(truncate0) + res2 = multi_threading(judge_questions, "",file_id,2) #调用千问-long + if not res2: + print("errror!") + else: + # 打印结果 + for question, response in res2: + baseinfo_list.append(response) + # for _, response in res2: # _占位,代表ques;response[0]也是ques;response[1]是ans #调用百炼rag + # try: + # if response and len(response) > 1: # 检查response存在且有至少两个元素 + # baseinfo_list.append(response[1]) + # else: + # print(f"Warning: Missing or incomplete response data for query index {_}.") + # except Exception as e: + # print(f"Error processing response for query index {_}: {e}") + + rebidding_situation = extract_from_notice(clause_path, 3) #"重新招标, 不再招标和终止招标"需从投标人须知正文提取 + + update_json=rename_outer_key(rebidding_situation,"重新招标、不再招标和终止招标") + baseinfo_list.append(update_json) + + 
update_baseinfo_list=combine_basic_info(baseinfo_list) #整合基础信息核心代码 + + baseinfo_combined_res = combine_json_results(update_baseinfo_list) # 返回值是字典 + print("基础信息done") + return nest_json_under_key(baseinfo_combined_res, "基础信息") #返回值是json字符串 + + +if __name__ == "__main__": + knowledge_name = "ztb" + output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\test2" + truncate0="C:\\Users\\Administrator\\Desktop\\招标文件\\test2\\zbtest10_tobidders_notice_table.pdf" + clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\clause1.json" + res=project_basic_info(knowledge_name,truncate0,output_folder,clause_path) + print(res) + + + + diff --git a/flask_app/main/多线程分类.py b/flask_app/main/多线程分类.py new file mode 100644 index 0000000..c50c288 --- /dev/null +++ b/flask_app/main/多线程分类.py @@ -0,0 +1,373 @@ +# -*- coding: utf-8 -*- +import os +import shutil +from PyPDF2 import PdfReader, PdfWriter + + +def validate_pdf(file_path): + """ 验证PDF文件是否损坏 """ + try: + with open(file_path, "rb") as file: + pdf = PdfReader(file) + return len(pdf.pages) > 0 + except Exception as e: + print(f"Error reading PDF {file_path}: {str(e)}") + return False + + +def truncate_pdf(source_path, target_path, max_pages=15): + """截取PDF文件的前15页并保存""" + try: + with open(source_path, "rb") as infile: + reader = PdfReader(infile) + writer = PdfWriter() + for i in range(min(max_pages, len(reader.pages))): + writer.add_page(reader.pages[i]) + with open(target_path, "wb") as outfile: + writer.write(outfile) + except Exception as e: + print(f"Error processing PDF {source_path}: {str(e)}") + +def copy_file(src, dest): + """ 复制单个文件 """ + os.makedirs(os.path.dirname(dest), exist_ok=True) + shutil.copy2(src, dest) + +def copy_directory(source, destination): + """复制整个目录""" + os.makedirs(destination, exist_ok=True) + for item in os.listdir(source): + src_path = os.path.join(source, item) + dest_path = os.path.join(destination, item) + if os.path.isfile(src_path): + copy_file(src_path, dest_path) + else: + 
copy_directory(src_path, dest_path) + +def unique_file_name(base_path, file_name): + counter = 1 + name_part, extension = os.path.splitext(file_name) + new_file_name = file_name + # 检查文件是否存在,若存在则修改文件名 + while os.path.exists(os.path.join(base_path, new_file_name)): + new_file_name = f"{name_part}_{counter}{extension}" + counter += 1 + return new_file_name + +def process_pdf_folders(source_dir, target_dir): + """ 处理源目录中的PDF文件,并基于条件选择目标文件夹 """ + for root, dirs, files in os.walk(source_dir, topdown=False): + for file in files: + if file.lower().endswith('.pdf'): + source_file_path = os.path.join(root, file) + if validate_pdf(source_file_path): + relative_path = os.path.relpath(root, source_dir) + target_file_dir = os.path.join(target_dir, relative_path) + target_file_path = os.path.join(target_file_dir, file) + copy_file(source_file_path, target_file_path) + else: + print(f"Deleted corrupt file: {source_file_path}") + # 清除空目录 + if not os.listdir(root): + os.rmdir(root) + + +def classify_folders(source_dir, target_dir, project_index): + """Classifies folders and processes files based on specific criteria.""" + temp_dir = os.path.join(source_dir, 'temp') + os.makedirs(temp_dir, exist_ok=True) + + target_project_dir = None + processed_dirs = set() # Set to track processed directories + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path): + continue + + files = [f.lower() for f in os.listdir(subdir_path)] + if 'zb.pdf' in files: + target_project_dir, target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index) + processed_dirs.add(subdir_path) # Mark this directory as processed + elif subdir.lower() == "输出文件": + process_evaluation_files(subdir_path, target_project_dir) + processed_dirs.add(subdir_path) # Mark this directory as processed + + # Process remaining folders, skipping already processed ones + process_remaining_folders(source_dir, target_project_dir, project_index, 
temp_dir, processed_dirs,target_zb_name) + #删除tmp目录 + # if os.path.exists(temp_dir): + # shutil.rmtree(temp_dir) + if os.path.exists(source_dir): + shutil.rmtree(source_dir) + +def process_tender_files(subdir_path, temp_dir, target_dir, project_index): + """Processes tender files and returns the target project directory and updated index.""" + zb_path = os.path.join(subdir_path, "zb.pdf") + truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf") + truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages + bot_response = file_parse(truncated_zb_path, project_index, 1) + zb_response = extract_answer(bot_response[1], 1) + zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠 + target_zb_name, category = zb_response[2] + ".pdf", zb_response[3] + + new_zb_path = os.path.join(subdir_path, target_zb_name) + os.rename(zb_path, new_zb_path) + + target_category_dir = os.path.join(target_dir, category) + target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1]) + copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹")) + + os.remove(truncated_zb_path) + shutil.rmtree(subdir_path) + return target_project_dir, zb_response[2] + + +def process_evaluation_files(subdir_path, target_project_dir): + """Processes evaluation folders.""" + copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹")) + shutil.rmtree(subdir_path) + + +def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name): + """Processes remaining folders containing bid files.""" + target_tb_dir = os.path.join(target_project_dir, "投标文件夹") + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path) or subdir_path in processed_dirs: + continue # Skip processed directories + + target_tbcom_dir = None # Initialize outside the file loop + + for item in os.listdir(subdir_path): 
+ item_src_path = os.path.join(subdir_path, item) + new_name = "truncate_" + item + truncated_tb_path = os.path.join(temp_dir, new_name) + truncate_pdf(item_src_path, truncated_tb_path) + bot_response = file_parse(truncated_tb_path, project_index, 2) + tb_response = extract_answer(bot_response[1], 2) + if not tb_response: + continue # If there's no response, skip this file + + # Initialize target_tbcom_dir only once based on the first processed file + if not target_tbcom_dir: + target_tb_name, _ = tb_response + if(target_tb_name != target_zb_name and target_tb_name != "未知"): + target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name) + print(target_tbcom_dir) + os.makedirs(target_tbcom_dir, exist_ok=True) + + tb_section = tb_response[1] + ".pdf" + new_tb_path = os.path.join(subdir_path, tb_section) + # 获取唯一文件名 + new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section)) + os.rename(item_src_path, new_tb_path) + + # Remove temporary truncated file + os.remove(truncated_tb_path) + + # Copy the whole directory at once after all files have been processed and renamed + if target_tbcom_dir: + copy_directory(subdir_path, target_tbcom_dir) + shutil.rmtree(subdir_path) # Optionally remove the original directory after copying + +import re + +def extract_answer(input_string, type): + # 使用正则表达式匹配所需的信息 + # 第一种模式:项目名称、项目编号、招标人、类别 + # 在每个字段值后都添加了\s*以忽略尾随的空格 + pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + # 第二种模式:投标人、类别 + pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + + if type == 1: + match = re.search(pattern1, input_string) + if match: + print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}") + return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()] + else: + print("No match found for type 1.") + return [] + elif type == 2: + match = re.search(pattern2, input_string) 
+ if match: + # 检查是否包含“投标函”,如果是则替换部分内容为“商务文件” + part = match.group(2).strip() + if "投标函" in part: + part = "商务文件" + return [match.group(1).strip(), part] + else: + print("No match found for type 2.") + return [] + + +from llama_index.readers.dashscope.base import DashScopeParse +from llama_index.readers.dashscope.utils import ResultType +from llama_index.indices.managed.dashscope import DashScopeCloudIndex +from dashscope import Assistants, Messages, Runs, Threads + + +def send_message(assistant, index,documents,message='百炼是什么?'): + print(f"Query: {message}") + + # create thread. + # create a thread. + thread = Threads.create() + + print(thread) + + # create a message. + message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + print("run:" + str(run)) + + # # get run statue + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + # print(run_status) + + # if prompt input tool result, submit tool result. + + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + # verify_status_code(run_status) + + # get the thread messages. 
+ msgs = Messages.list(thread.id) + # print(msgs) + # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) + + ans = [] + + print("运行结果:") + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + print("content: ", message['content'][0]['text']['value']) + print("\n") + deleteFileFromKnowledge(index,documents) + return ans + + +def file_parse(filepath, knowledge_index, type): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + # 创建一个字典来映射index值到知识库名 + index_to_name = { + 0: "文件分类知识库0", + 1: "文件分类知识库1", + 2: "文件分类知识库2", + 3: "文件分类知识库3" + } + # 使用get方法获取对应的知识库名,如果index不存在,返回"默认文件分类知识库" + cloud_index_name = index_to_name.get(knowledge_index, "0") + + index = DashScopeCloudIndex(cloud_index_name) + index._insert(documents) + retriever = index.as_retriever() + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-plus', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + + } + } + } + }] + ) + questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ + 货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,服物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ + 请按下列键值对格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" + questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" + if (type == 1): + questions = questions1 + elif (type == 2): + questions = questions2 + 
return send_message(assistant,index,documents, message=questions) + +def deleteFileFromKnowledge(index,documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + for document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + index.delete_ref_doc(file_ids) + +import concurrent.futures +import logging + +# 配置日志,只输出到控制台 +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index): + logging.info(f"Starting to process directory: {subdir_path} with index {project_index}") + + # 为每个子目录创建一个专用的临时目录 + intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}") + os.makedirs(intermediate_directory, exist_ok=True) + logging.info(f"Created intermediate directory: {intermediate_directory}") + + # 这里可以添加具体的目录处理函数,例如: + process_pdf_folders(subdir_path, intermediate_directory) + classify_folders(intermediate_directory, final_directory, project_index % 4) + + # 处理完毕后清理该目录 + shutil.rmtree(intermediate_directory) + logging.info(f"Deleted intermediate directory: {intermediate_directory}") + +def main(base_directory, base_intermediate_directory, final_directory): + os.makedirs(final_directory, exist_ok=True) + logging.info(f"Final directory ensured at: {final_directory}") + + project_index = 0 + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for subdir in os.listdir(base_directory): + subdir_path = os.path.join(base_directory, subdir) + if os.path.isdir(subdir_path): + logging.info(f"Submitting job for directory: {subdir_path}") + future = executor.submit(process_directory, subdir_path, base_intermediate_directory, final_directory, project_index) + 
futures.append(future) + project_index += 1 + + for future in concurrent.futures.as_completed(futures): + try: + future.result() # 如果执行没有抛出异常,完成该任务 + except Exception as e: + logging.error(f"Thread resulted in an error: {e}") + +if __name__ == "__main__": + base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' + base_intermediate_directory = 'D:\\tmp' + final_directory = 'D:\\output' + main(base_directory, base_intermediate_directory, final_directory) diff --git a/flask_app/main/多线程提问.py b/flask_app/main/多线程提问.py new file mode 100644 index 0000000..3cfdc10 --- /dev/null +++ b/flask_app/main/多线程提问.py @@ -0,0 +1,189 @@ +# 基于知识库提问的通用模板, +# assistant_id +import re +import queue +import concurrent.futures +import time + +from dashscope import Assistants, Messages, Runs, Threads +from llama_index.indices.managed.dashscope import DashScopeCloudRetriever +from 通义千问long import qianwen_long, upload_file + +prompt = """ +# 角色 +你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 + +## 技能 +### 技能 1:文档解析与摘要 +- 深入理解并分析${documents}的内容,提取关键信息。 +- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 + +### 技能 2:信息检索与关联 +- 在${documents}中高效检索特定信息或关键词。 +- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 + +## 限制 +- 所有操作均需基于${documents}的内容,不可超出此范围创造信息。 +- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 +- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 + +请注意,上述技能执行时将直接利用并参考${documents}的具体内容,以确保所有产出紧密相关且高质量。 +""" +prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}' + + +def read_questions_from_file(file_path): + questions = [] + with open(file_path, 'r', encoding='utf-8') as file: + for line in file: + line = line.strip() + # 使用正则表达式匹配以数字开头,后接一个点的行 + if re.match(r'\d+\.', line): + # 从点后分割并去除前后空格获取问题部分 + question = line.split('.', 1)[1].strip() + questions.append(question) + return questions + + +#正文和文档名之间的内容 + +def send_message(assistant, message='百炼是什么?'): + ans = [] + print(f"Query: {message}") + + # create thread. + thread = Threads.create() + print(thread) + + # create a message. 
+ message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + # print(run) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + # print(run_status) + # get the thread messages. + msgs = Messages.list(thread.id) + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + return ans + +def rag_assistant(knowledge_name): + retriever = DashScopeCloudRetriever(knowledge_name) + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + temperature='0.3', + instructions="请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${documents}", + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${documents}" + } + } + } + } + }] + ) + return assistant + + +def pure_assistant(): + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,能基于用户的要求精准简洁地回答用户的提问', + instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问', + tools=[ + { + "type": "code_interpreter" + }, + ] + ) + return assistant + +def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type): + if llm_type==1: + assistant = rag_assistant(knowledge_name) + elif llm_type==2: + qianwen_res = qianwen_long(file_id,question) + result_queue.put((ans_index,(question,qianwen_res))) + return + else : + assistant = pure_assistant() + ans = send_message(assistant, message=question) + result_queue.put((ans_index, (question, ans))) # 在队列中添加索引 (question, ans) + +def multi_threading(queries, knowledge_name="", file_id="",llm_type=1): + result_queue = queue.Queue() + + # 使用 ThreadPoolExecutor 管理线程 + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + # 逐个提交任务,每提交一个任务后休眠1秒 
+ future_to_query = {} + for index, query in enumerate(queries): + future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type) + future_to_query[future] = index + time.sleep(1) # 每提交一个任务后等待1秒 + + # 收集每个线程的结果 + for future in concurrent.futures.as_completed(future_to_query): + index = future_to_query[future] + # 由于 llm_call 函数本身会处理结果,这里只需要确保任务执行完成 + try: + future.result() # 可以用来捕获异常或确认任务完成 + except Exception as exc: + print(f"Query {index} generated an exception: {exc}") + + # 从队列中获取所有结果并按索引排序 + results = [None] * len(queries) + while not result_queue.empty(): + index, result = result_queue.get() + results[index] = result + return results + +if __name__ == "__main__": + start_time=time.time() + # 读取问题列表 + questions =read_questions_from_file('../static/提示词/前两章提问总结.txt') + for i in questions: + print(i) + knowledge_name = "招标解析5word" + llm_type=1 + results = multi_threading(questions, knowledge_name) + end_time = time.time() + if not results: + print("errror!") + else: + print("elapsed time:"+str(end_time-start_time)) + # 打印结果 + for question, response in results: + print(f"Question: {question}") + print(f"Response: {response}") + + # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf" + # file_id = upload_file(file_path) + # questions=["根据该文档中的评标办法前附表,请你列出该文件的技术标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的商务标,以json的格式返回结果","根据该文档中的评标办法前附表,请你列出该文件的投标报价,以json的格式返回结果"] + # results=multi_threading(questions,"",file_id,2) #1代表使用百炼rag 2代表使用qianwen-long + # if not results: + # print("errror!") + # else: + # # 打印结果 + # for question, response in results: + # print(f"Question: {question}") + # print(f"Response: {response}") \ No newline at end of file diff --git a/flask_app/main/废标项.py b/flask_app/main/废标项.py new file mode 100644 index 0000000..77bec71 --- /dev/null +++ b/flask_app/main/废标项.py @@ -0,0 +1,66 @@ +import fitz # PyMuPDF +import re + +def extract_text_with_keywords(pdf_path, keywords, 
follow_up_keywords): + """ + 提取PDF文档中包含特定关键词的段落,包括正文和表格。如果段落中包含特定的后续关键词,也提取其后的段落,直到遇到下一个相似的序列编号。 + :param pdf_path: PDF文件的路径。 + :param keywords: 包含关键词的列表。 + :param follow_up_keywords: 触发连续提取的关键词列表。 + :return: 包含关键词的段落列表。 + """ + doc = fitz.open(pdf_path) + extracted_paragraphs = [] + continue_collecting = False + current_section_pattern = None + + for page in doc: + text_blocks = page.get_text("blocks") + for index, block in enumerate(text_blocks): + text = block[4].strip() # Text content of the block + if text == "": # Skip empty lines + continue + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + else: + extracted_paragraphs.append(text) + + if any(keyword in text for keyword in keywords): + extracted_paragraphs.append(text) + if any(follow_up in text for follow_up in follow_up_keywords): + continue_collecting = True + section_number = re.match(r'(\d+(\.\d+)*)', text) + if section_number: + current_section_number = section_number.group(1) + base_section_number = current_section_number.rsplit('.', 1)[0] + current_section_pattern = re.compile(rf'^{re.escape(base_section_number)}\.\d+\b') + else: + found_next_number = False + for next_index in range(index + 1, len(text_blocks)): + next_text = text_blocks[next_index][4].strip() + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text) + if next_section_number: + found_next_number = True + current_section_pattern = re.compile(rf'^{next_section_number.group(1)}\b') + elif found_next_number: + break + doc.close() + return extracted_paragraphs + +# Example usage +doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx' +keywords = ['否决', '无效投标', '被拒绝', '予以拒绝'] +follow_up_keywords = ['情形之一'] +extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords) + +# Writing to file and handling duplicates +output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt" +with 
open(output_file, 'w', encoding='utf-8') as file: + for content in extracted_contents: + file.write(content + '\n') + +# file_id = upload_file(output_file) +# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。" +# res=qianwen_long(file_id,user_query) +# print("Query Result:", res) \ No newline at end of file diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py new file mode 100644 index 0000000..911a59a --- /dev/null +++ b/flask_app/main/形式响应评审.py @@ -0,0 +1,170 @@ +import re +import json +import time + +from 多线程提问 import multi_threading +from 根据条款号整合json import process_and_merge_entries +from 通义千问long import qianwen_long +from json_utils import extract_content_from_json +prompt = """ +# 角色 +你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 + +## 技能 +### 技能 1:文档解析与摘要 +- 深入理解并分析${document1}的内容,提取关键信息。 +- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 + +### 技能 2:信息检索与关联 +- 在${document1}中高效检索特定信息或关键词。 +- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 + +## 限制 +- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。 +- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 +- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 + +请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。 +""" + +def extract_matching_keys(json_data): + # 函数首先检查输入 json_data 是否为字符串类型。如果是,它会使用 json.loads() 将字符串解析为字典。 + if isinstance(json_data, str): + data = json.loads(json_data) + else: + data = json_data + + # 正则表达式匹配 + include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目")] + additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")] + exclude_patterns = ["投标文件格式", "权利义务", "技术标准","工程量清单"] + additional_exclude_patterns = ["按要求", "按规定"] + + # Initialize a list to hold filtered key-value pairs + final_matching = [] + + # Recursive function to traverse and filter data + def recursive_search(current_data, path=[]): + if isinstance(current_data, dict): + for key, value in current_data.items(): + new_path = path + [key] # Update 
path for nested keys + if isinstance(value, (dict, list)): + recursive_search(value, new_path) + else: + process_value(key, str(value), new_path) + elif isinstance(current_data, list): + for item in current_data: + recursive_search(item, path) + + # Function to process each value against the patterns + def process_value(key, value, path): + # Check exclude patterns first + if any(ex in key or ex in value for ex in exclude_patterns): + return + # Main include patterns + if any(pattern.search(value) for pattern in include_patterns): + # Additional exclude patterns + if not any(ex in key or ex in value for ex in additional_exclude_patterns): + # Additional include patterns + if any(pattern.search(value) for pattern in additional_include_patterns): + final_matching.append({".".join(path): value}) # Use dot notation for nested keys + + # Start the recursive search + recursive_search(data) + + return final_matching + + +def reformat_questions(match_keys): + """ + 根据是否包含特定序号格式(如3.7.4或3.7.4(5)或3.7.4(5)),重新格式化匹配到的评审条目。 + 若包含序号,则提取出来;若不包含,则生成格式化的问题字符串。 + """ + entries_with_numbers = [] + formatted_questions = [] + + # 正则表达式,同时匹配全角和半角括号 + pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[\(\(](\d+)[\)\)])?') + + for entry in match_keys: + key, value = next(iter(entry.items())) + match = pattern.search(value) + if match: + # 如果存在序号,保存序号与对应的键值对,包括括号内的数字(如果存在) + num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "") + entries_with_numbers.append({key: num}) + else: + # 如果不存在序号,删除“符合”并格式化文本 + revised_standard = re.sub(r'符合', '', value) + formatted_entry = f"关于‘{key}’,{revised_standard}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',如果存在未知信息,请在对应键值处填'未知'。" + formatted_questions.append(formatted_entry) + + return entries_with_numbers, formatted_questions + + + +def update_json_data(original_data, updates, second_response_list): + """ + 根据提供的更新字典覆盖原始JSON数据中对应的键值,支持点分隔的键来表示嵌套结构。 + 参数: + - original_data: dict, 原始的JSON数据。 + - updates: dict, 包含需要更新的键值对。 + - second_response_list: list, 
包含多个字典,每个字典包含需要更新的键值对。 + 返回: + - updated_data: dict, 更新后的JSON数据。 + """ + def recursive_update(data, key, value): + # 处理点分隔的键,递归定位并更新嵌套字典 + keys = key.split('.') + for k in keys[:-1]: + data = data.setdefault(k, {}) + if isinstance(value, dict) and isinstance(data.get(keys[-1], None), dict): + data[keys[-1]] = {**data.get(keys[-1], {}), **value} + else: + data[keys[-1]] = value + + # 合并 updates 到 original_data 中 + for key, value in updates.items(): + recursive_update(original_data, key, value) + + # 遍历 second_response_list 中的每个字典,并合并到 original_data 中 + for response_dict in second_response_list: + for key, value in response_dict.items(): + recursive_update(original_data, key, value) + + return original_data + + + +def process_reviews(original_dict_data,knowledge_name, truncate0_jsonpath,clause_json_path): + matched_keys = extract_matching_keys(original_dict_data) #[{'形式评审标准.投标文件签字盖章': '符合第二章“投标人须知”第 3.7.3(4)目 规定'}, {'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}] + entries_with_numbers, formatted_questions = reformat_questions(matched_keys) + results_2 = multi_threading(formatted_questions, knowledge_name, True) #无序号的直接问大模型 + second_response_list = [] + for _, response in results_2: + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素 + temp = extract_content_from_json(response[1]) + second_response_list.append(temp) + else: + print(f"Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"Error processing response for query index {_}: {e}") + + # Assume JSON file paths are defined or configured correctly + combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path) #脚本提取的要求 + updated_json = update_json_data(original_dict_data, combined_results, second_response_list) + return updated_json + + +if __name__ == "__main__": + start_time=time.time() + knowledge_name="zbfile" + 
truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\招标文件\\output\\truncate_output.json" + clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\output\\clause.json" + original_dict_data={'营业执照': '具备有效的营业执照', '资质条件': '符合第二章“投标人须知”第 1.4.1项规定', '财务状况': '符合第二章“投标人须知”第 1.4.1项规定', '类似业绩': '符合第二章“投标人须知”第 1.4.1项规定', '信誉': '符合第二章“投标人须知”第 1.4.1项规定', '项目经理资格': '符合第二章“投标人须知”第 1.4.1项规定', '设计负责人资格': '符合第二章“投标人须知”第 1.4.1项规定', '施工负责人资格': '符合第二章“投标人须知”第 1.4.1项规定', '施工机械设备': '符合第二章“投标人须知”第 1.4.1项规定', '项目管理机构及人员': '符合第二章“投标人须知”第 1.4.1项规定', '其他要求': '符合第二章“投标人须知”第 1.4.1项规定', '联合体投投人 (如有)': '符合第二章“投标人须知”第 1.4.2项规定', '不存在禁止投标的情形': '不存在第二章“投标人须知”第 1.4.3项规 定的任何一种情形'} + formal_json = process_reviews(original_dict_data,knowledge_name, truncate_tobidders_table_json_path, clause_path) + data = json.loads(formal_json) + end_time=time.time() + elapsed_time = end_time - start_time + print(f"Function execution took {elapsed_time} seconds.") diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py new file mode 100644 index 0000000..5b5591a --- /dev/null +++ b/flask_app/main/截取pdf.py @@ -0,0 +1,149 @@ +from PyPDF2 import PdfReader, PdfWriter +import re # 导入正则表达式库 +import os # 用于文件和文件夹操作 + +def clean_page_numbers(text): + # 使用正则表达式删除页码 + # 假设页码在文本的最开始,紧跟着文字且无空格分隔 + cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 + # 删除结尾的页码 + cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text) + # 删除形如 /129 的页码 + cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text) + return cleaned_text +def extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix): + # 打开PDF文件 + pdf_document = PdfReader(pdf_path) + start_page = None + end_page = None + + qualification_pattern = re.compile(r'资格审查|资质条件|能力') + # 遍历文档的每一页,查找开始和结束短语的位置 + for i in range(len(pdf_document.pages)): + page = pdf_document.pages[i] + text = page.extract_text() + if text: + cleaned_text = clean_page_numbers(text) + if re.search(chapter_pattern, cleaned_text) 
and i > begin_page: + if output_suffix == "invalid" and start_page: #仅当提取Invalid的时候,判断初始页码是第一个匹配到的页码,因为招标编号可能存在多个,后面的覆盖前面 + continue + if output_suffix == "qualification" and not re.search(qualification_pattern, cleaned_text): + # 如果是资格审查条件,但当前页不包含相关词汇,则不执行任何操作 + pass + else: + start_page = i + if start_page is not None and re.search(end_phrase_pattern, cleaned_text) and i > (start_page+1): + is_invalid_condition = output_suffix == "invalid" and i > 30 + if is_invalid_condition or output_suffix != "invalid": + end_page = i + break + + + # 确保找到了起始和结束页面 + if start_page is None or end_page is None: + print(f"未找到起始或结束页在文件 {pdf_path} 中!") + return None + + # 创建一个新的PDF文档保存截取的页面 + base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # Get the base name without extension + output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + output_doc = PdfWriter() + + # 添加需要的页面,从 start_page 开始,包括 end_page + for page_num in range(start_page, end_page + 1): + output_doc.add_page(pdf_document.pages[page_num]) + # 保存新的PDF文件 + with open(output_pdf_path, 'wb') as f: + output_doc.write(f) + + print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") + + return output_pdf_path + +def process_input(input_path, output_folder, chapter_pattern, begin_page, end_phrases, output_suffix,selection): + # 确保输出文件夹存在 + if not os.path.exists(output_folder): + os.makedirs(output_folder) + if(selection==3 or selection==4 or selection==5): + end_phrase_pattern = re.compile('|'.join([phrase for phrase in end_phrases]), re.MULTILINE) + else: + end_phrase_pattern = re.compile('|'.join([re.escape(phrase) for phrase in end_phrases])) + + if os.path.isdir(input_path): + generated_files = [] + # 遍历文件夹内的所有PDF文件 + for file in os.listdir(input_path): + if file.endswith(".pdf"): + pdf_path = os.path.join(input_path, file) + output_pdf_path = extract_pages(pdf_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix) + if output_pdf_path and 
os.path.isfile(output_pdf_path): + generated_files.append(output_pdf_path) + return generated_files + elif os.path.isfile(input_path) and input_path.endswith(".pdf"): + # 处理单个PDF文件 + output_pdf_path = extract_pages(input_path, output_folder, chapter_pattern, begin_page, end_phrase_pattern, output_suffix) + if output_pdf_path and os.path.isfile(output_pdf_path): + return [output_pdf_path] # 以列表形式返回,以保持一致性 + else: + print("提供的路径既不是文件夹也不是PDF文件。") + return [] + + +def truncate_pdf_main(input_path, output_folder, selection): + if selection == 1: + # Configure patterns and phrases for "投标人须知前附表" + pattern = re.compile(r'第[一二三四五六七八九十]+章\s*投标人须知') + begin_page = 3 + end_phrases = ["投标人须知正文"] + output_suffix = "tobidders_notice_table" + elif selection == 2: + # Configure patterns and phrases for "评标办法" + pattern = re.compile(r'第[一二三四五六七八九十]+章\s*评标办法') + begin_page = 10 + end_phrases = ["评标办法正文", "评标办法"] + output_suffix = "evaluation_method" + elif selection == 3: + # Configure patterns and phrases for "投标人须知正文" + pattern = re.compile(r'投标人须知正文') + begin_page = 5 + end_phrases = [ + r'^第[一二三四五六七八九十]+章\s*评标办法',r'^评标办法前附表',r'^附录:', r'^附录一:',r'^附件:', r'^附件一:',r'^附表:', r'^附表一:',r'^附录:', r'^附录一:',r'^附件:', r'^附件一:',r'^附表:', r'^附表一:', + ] + output_suffix = "tobidders_notice" + elif selection==4: + pattern = re.compile(r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:') + begin_page = 0 + end_phrases = [ + r'第[一二三四五六七八九十]+章\s*合同', + r':清标报告',# 添加了新的匹配项 + r':清标报告' + ] + output_suffix="invalid" + elif selection==5: + appendix_pattern = r'^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]' + pattern = re.compile(appendix_pattern) + begin_page=5 + end_phrases = [r'评标办法正文', r'评标办法',appendix_pattern] + output_suffix="qualification" + else: + print("无效的选择") + return None + + # Process the selected input + return process_input(input_path, output_folder, pattern, begin_page, end_phrases, output_suffix,selection) + +def truncate_pdf_multiple(input_path, output_folder): + truncate_files = [] + for selection in 
range(1, 6): + files = truncate_pdf_main(input_path, output_folder, selection) + truncate_files.extend(files) + return truncate_files + +if __name__ == "__main__": + input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest5.pdf" + output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test" + truncate_pdf_multiple(input_path,output_folder) + # selection = 5 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-招标公告-合同条款前 + # generated_files = truncate_pdf_main(input_path, output_folder, selection) + # print("生成的文件:", generated_files) + diff --git a/flask_app/main/投标人须知正文提取指定内容.py b/flask_app/main/投标人须知正文提取指定内容.py new file mode 100644 index 0000000..c65c799 --- /dev/null +++ b/flask_app/main/投标人须知正文提取指定内容.py @@ -0,0 +1,120 @@ +import json +import re + + +# 定义查找与目标值匹配的键的函数 +def find_keys_by_value(target_value, json_data): + matched_keys = [k for k, v in json_data.items() if v == target_value] + if not matched_keys: + matched_keys = [k for k, v in json_data.items() if isinstance(v, str) and v.startswith(target_value)] + return matched_keys + + +# 定义查找以特定前缀开始的键的函数 +def find_keys_with_prefix(key_prefix, json_data): + subheadings = [k for k in json_data if k.startswith(key_prefix)] + return subheadings + + +# 从文件中读取JSON数据,并提取特定内容 +def extract_json(data, target_values): + results = {} + for target_value in target_values: + matched_keys = find_keys_by_value(target_value, data) + for key in matched_keys: + key_and_subheadings = find_keys_with_prefix(key, data) + for subkey in key_and_subheadings: + if "." in subkey: + parent_key = subkey.rsplit('.', 1)[0] + top_level_key = parent_key.split('.')[0] + '.' 
+ # 特别处理定标相关的顶级键,确保不会重复添加其他键 + if target_value == "定标" and top_level_key not in results: + results[top_level_key] = "定标" + # 添加或更新父级键 + if parent_key not in results: + if parent_key in data: + results[parent_key] = data[parent_key] + # 添加当前键 + results[subkey] = data[subkey] + return results + + +# 转换结构化的JSON数据 +def transform_json(data): + result = {} + temp = {0: result} + + for key, value in data.items(): + match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', key) + if match: + levels = [int(l) for l in match.groups() if l is not None] + parent = temp[len(levels) - 1] + + if len(levels) == len(match.groups()): + if isinstance(parent, list): + parent.append(value) + else: + # 对于没有 \n 的情况,使用首个空格分割的词作为键 + parent[value.split()[0]] = value + else: + new_key = value.split()[0] + if '\n' in value and len(levels) == 2: + # 处理换行情况并分割键和值 + new_key, new_value = value.split('\n', 1) + new_key = new_key.strip() + new_value = new_value.strip() + # 确保父级是字典 + if isinstance(parent, list): + if len(parent) == 0 or not isinstance(parent[-1], dict): + parent.append({}) + parent[-1][new_key] = new_value + else: + parent[new_key] = new_value + else: + if isinstance(parent, list): + if len(parent) == 0 or not isinstance(parent[-1], dict): + parent.append({}) + parent = parent[-1] + if new_key not in parent: + parent[new_key] = [] + temp[len(levels)] = parent[new_key] + + # 修改函数以移除只有一个元素的列表和空列表 + def remove_single_item_lists(node): + if isinstance(node, dict): + for key in list(node.keys()): + node[key] = remove_single_item_lists(node[key]) + if isinstance(node[key], list) and not node[key]: + node[key] = "" # 如果列表为空,转换为空字符串 + elif isinstance(node, list) and len(node) == 1: + return remove_single_item_lists(node[0]) + return node + + return remove_single_item_lists(result) + + +# 读取JSON数据,提取内容,转换结构,并打印结果 +def extract_from_notice(file_path, type): + if type == 1: + target_values = ["投标文件", "投标"] + elif type == 2: + target_values = ["开标", "评标", "定标"] + elif type == 3: + target_values = 
["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"] + else: + raise ValueError("Invalid type specified. Use 1 for '开标, 评标, 定标' or 2 for '投标文件, 投标'.") + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + extracted_data = extract_json(data, target_values) # 读取json + transformed_data = transform_json(extracted_data) + return transformed_data + + +# 假设原始数据文件路径 +if __name__ == "__main__": + file_path = 'clause2.json' + try: + res = extract_from_notice(file_path, 3) # 可以改变此处的 type 参数测试不同的场景 + print(res) + except ValueError as e: + print(e) diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py new file mode 100644 index 0000000..90e5475 --- /dev/null +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -0,0 +1,152 @@ +import json +import docx +import fitz +import re +import os + +def extract_text_from_docx(file_path): + doc = docx.Document(file_path) + return '\n'.join([para.text for para in doc.paragraphs]) + +def clean_page_numbers(text): + # 删除每页开头的页码,假设页码后跟至少一个空格 + cleaned_text = re.sub(r'^\s*\d+\s+', '', text) + # 删除每页末尾的页码,假设页码前有至少一个空格 + cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text) + # 删除形如 /123 或 123/ 的页码 + cleaned_text = re.sub(r'\s*/\s*\d+\s*|\s*\d+\s*/\s*', '', cleaned_text) + return cleaned_text + +def extract_text_from_pdf(file_path): + doc = fitz.open(file_path) + text = "" + for page in doc: + page_text = page.get_text() + page_text = clean_page_numbers(page_text) + text += page_text + return text + +def extract_section(text, start_keyword, end_phrases): + start_index = text.find(start_keyword) + if start_index == -1: + return "" + + end_index = len(text) + for phrase in end_phrases: + match = re.search(phrase, text[start_index:]) + if match: + end_index = start_index + match.start() + break + + return text[start_index:end_index] + +def compare_headings(current, new): + # 使用过滤来确保只处理非空且为数字的部分 + current_nums = [int(num) for num in current.split('.') if num.isdigit()] + new_nums = [int(num) for num in 
new.split('.') if num.isdigit()] + + # 比较数字序列以确定标题的层次关系 + for c, n in zip(current_nums, new_nums): + if n > c: + return True + elif n < c: + return False + + # 如果新标题有更多层次,认为是新的子章节 + return len(new_nums) > len(current_nums) + + +def should_add_newline(content, keywords, max_length=20): + content_str = ''.join(content).strip() + return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length + +def handle_content_append(current_content, line_content, append_newline, keywords): + if append_newline: + if should_add_newline(current_content, keywords): + current_content.append('\n') # 添加换行符 + append_newline = False + current_content.append(line_content) + return append_newline + +#对二级标题如x.x进行额外处理:如果当前处理内容包含keywords中的内容,则必须保留换行符/如果当前内容字数大于20,不保留换行。 +def parse_text_by_heading(text): + keywords = ['包含', '以下'] + data = {} + current_key = None + current_content = [] + append_newline = False + + lines = text.split('\n') + for i, line in enumerate(lines): + line_stripped = line.strip() + # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 + match = re.match(r'^(?json,从表格提取数据 + clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json + + return { + 'knowledge_index': index, + 'knowledge_name':knowledge_name, + 'truncate_files': truncate_files, + 'truncate0_jsonpath': truncate0_jsonpath, + 'clause_path': clause_path, + 'invalid_docpath': invalid_docpath + } +#基本信息 +def fetch_project_basic_info(knowledge_name,truncate0,output_folder,clause_path): #投标人须知前附表 + global_logger.info("starting基础信息...") + basic_res=project_basic_info(knowledge_name,truncate0,output_folder,clause_path) + global_logger.info("基础信息done") + return basic_res + +#形式、响应、资格评审 +def fetch_review_standards(truncate1,truncate4,knowledge_name,truncate0_jsonpath,clause_path): + global_logger.info("starting资格审查...") + review_standards_res=combine_review_standards(truncate1,truncate4,knowledge_name,truncate0_jsonpath,clause_path) + global_logger.info("资格审查done") + 
return review_standards_res + +#评分细则 +def fetch_evaluation_standards(truncate1): #评标办法前附表 + global_logger.info("starting商务标技术标...") + evaluation_standards_res= combine_evaluation_standards(truncate1) + global_logger.info("商务标技术标done") + return evaluation_standards_res + +#无效、废标项解析 +def fetch_invalid_requirements(invalid_docpath,output_folder,truncate0_jsonpath,clause_path,truncate4): + #废标项要求:千问 + global_logger.info("starting无效标与废标...") + find_invalid_res=combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath,clause_path,truncate4) + global_logger.info("无效标与废标done...") + return find_invalid_res + + +#投标文件要求 +def fetch_bidding_documents_requirements(clause_path): + global_logger.info("starting投标文件要求...") + fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1) + qualify_nested_res = nest_json_under_key(fetch_bidding_documents_requirements_json, "投标文件要求") + global_logger.info("投标文件要求done...") + return qualify_nested_res + +#开评定标流程 +def fetch_bid_opening(clause_path): + global_logger.info("starting开评定标流程...") + fetch_bid_opening_json=extract_from_notice(clause_path,2) + qualify_nested_res = nest_json_under_key(fetch_bid_opening_json, "开评定标流程") + global_logger.info("开评定标流程done...") + return qualify_nested_res + +# def main_processing(output_folder,downloaded_file_path,file_type,unique_id): #file_type=1->docx file_type=2->pdf +# global global_logger +# global_logger= get_global_logger(unique_id) +# # Preprocess files and get necessary data paths and knowledge index +# processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id) +# +# +# with concurrent.futures.ThreadPoolExecutor() as executor: +# # Submit all tasks to the executor +# futures = { +# 'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'], +# processed_data['truncate_files'][0], output_folder, +# processed_data['clause_path']), +# 'review_standards': executor.submit(fetch_review_standards, 
processed_data['truncate_files'][1],processed_data['truncate_files'][4], +# processed_data['knowledge_name'], processed_data['truncate0_jsonpath'], +# processed_data['clause_path']), +# 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate_files'][1]), +# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], +# output_folder, processed_data['truncate0_jsonpath'],processed_data['clause_path'],processed_data['truncate_files'][4]), +# 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, +# processed_data['clause_path']), +# 'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path']) +# } +# +# comprehensive_responses = [] +# # Collect results in the defined order +# for key in ['base_info', 'review_standards', 'evaluation_standards', 'invalid_requirements', 'bidding_documents_requirements','opening_bid']: +# try: +# # Wait for the future to complete and get the result +# result = futures[key].result() +# comprehensive_responses.append(result) +# except Exception as exc: +# global_logger.info(f"Error processing {key}: {exc}") +# # 合并 JSON 结果 +# combined_final_result = combine_json_results(comprehensive_responses) +# modified_json = transform_json_values(combined_final_result) +# +# final_result_path = os.path.join(output_folder, "final_result.json") +# with open(final_result_path, 'w', encoding='utf-8') as file: +# json.dump(modified_json, file, ensure_ascii=False, indent=2) +# global_logger.info("final_result.json has been saved") +# deleteKnowledge(processed_data['knowledge_index']) +# return final_result_path + +#TODO:{ +#目前返回结果: + # "opening_bid": "{
\"开评定标流程\": {
\"开标\": {
\"开标时间和地点\": [
\"招标人在本章第4.2.1项规定的投标截止时间(开标时间)在“电子交易平台”上公开进行开标,所有投标人均应当准时在线参加开标。\",
\"招标人通过互联网在投标人须知前附表规定的地点组织开标,并在投标截止时间30分钟前,使用CA数字证书登录“电子交易平台”,进入“开标室”选择相应标段作在线开标的准备工作。\",
\"投标人应当在能够保证设施设备可靠、互联网畅通的任意地点,通过互联网在线参加开标。在投标截止时间前,使用加密其投标文件的CA数字证书登录“电子交易平台”,进入“开标室”选择所投标段进行签到,并实时在线关注招标人的操作情况。5.2开标程序\",
\"主持人按下列程序在“电子交易平台”的“开标室”进行在线开标:(1)宣布开标纪律;(2)公布主持人、招标人代表、监标人等有关人员姓名;(3)公布在投标截止时间前投标文件的递交情况;(4)公布投标保证金递交情况;(5)按照投标人须知前附表规定抽取评标基准价下浮值(如有);规定最高投标限价计算方法的,计算并公布最高投标限价(如适用),当众公布后记录在案;(6)读取已解密的投标文件的内容;(7)公布投标人名称、标段名称、投标保证金的递交情况、投标报价、项目经理姓名及其他内容,并生成开标记录;(8)开标结束。\",
\"在本章第5.2.1(6)目规定的时间内,非因“电子交易平台”原因造成投标文件未解密的,视为投标人撤回投标文件。已解密的投标文件少于三个的,招标失败;已解密的投标文件不少于三个,开标继续进行。\"
],
\"开标异议\": [
\"投标人对开标有异议的,应当在开标过程中提出;招标人当场对异议作出答复,并记入开标记录。异议与答复应通过“开标室”在“异议与答复”菜单以书面形式进行。本处所称异议是指投标人在开标过程中对投标文件提交、投标截止时间、开标程序、开标记录以及投标人和招标人或者投标人相互之间存在利益冲突的情形等提出的质疑。\",
\"投标人异议成立的,招标人将及时采取纠正措施,或者提交评标委员会评审确认;投标人异议不成立的,招标人将当场给予解释说明。\"
],
\"特殊情况的处置\": [
\"因“电子交易平台”系统故障导致无法投标的,交易中心及时通知招标人,招标人视情况决定是否顺延投标截止时间。因投标人自身原因导致无法完成投标的,由投标人自行承担后果。\",
\"因“电子交易平台”系统故障导致无法正常开标的,招标人将暂停开标,待系统恢复正常后继续开标。\",
\"“电子交易平台”系统故障是指下列情形:(1)系统服务器发生故障,无法访问或无法使用系统;(2)系统的软件或数据库出现错误,不能进行正常操作;(3)系统发现有安全漏洞,有潜在的泄密危险;(4)出现断电、断网事故;(5)其他无法保证招投标过程正常进行的情形。\"
]
},
\"评标\": {
\"评标委员会\": [
\"评标由招标人依法组建的评标委员会负责。评标委员会由招标人代表以及有关技术、经济等方面的专家组成。评标委员会成员人数以及技术、经济等方面专家的确定方式见投标人须知前附表。\",
\"评标委员会成员有下列情形之一的,应当回避:(1)投标人或投标人主要负责人的近亲属;(2)项目主管部门或者行政监督部门的人员;(3)与投标人有经济利益关系,可能影响对投标公正评审的;(4)曾因在招标、评标以及其他与招标投标有关活动中从事违法行为而受过行政处罚或刑事处罚的。\",
\"定标会招标人原则上应当在定标候选人公示结束后5个工作日内召开定标会,如有特殊情况,最迟应当在定标候选人公示结束后10个工作日内召开定标会,定标会进入公共资源交易中心进行。\",
\"定标流程(1)签到,宣读定标委员会成员名单;(2)监督小组监督员宣读定标纪律;(3)招标人代表或招标代理机构人员向定标委员会介绍定标项目相关情况;(4)定标委员成员会有疑问的,可以向招标人代表进行提问;(5)阅相关资料;(6)投票;(7)招标人代表或招标代理机构人员进行统计;(8)定标委员会组长宣读得分结果和定标结果;定标委员会成员签署定标报告,会议结束。\",
\"定标原则(1)组建定标委员会:由招标人组建定标委员会负责定标工作,按照定标委员会定标法进行定标。定标委员会成员数量为5人,招标人的法定代表人或其授权代表(为领导班子成员之一)应当参加定标会,并推荐担任定标会组长主持定标会,定标委员会其他成员从招标人组建的定标成员库中随机抽取确定。定标委员会成员与定标候选人有利害关系的,应当回避。所有参加定标会的定标委员会成员的意见应有书面记录,并由所有定标委员会成员签字确认。(2)组建定标监督小组:由招标人组建定标监督小组,对定标委员会的定标活动全过程进行监督,定标监督小组由2人组成,一般为招标人本单位或上级单位纪检监察人员,也可由招标人的法定代表人或主要负责人指定骨干成员参加。定标监督小组有权就定标委员会违反定标规则的行为进行质询。评估是否符合内控机制及价值取向,确保定标过程公正、公平。定标前,招标人或者招标代理机构在定标前可以介绍项目情况、招标情况、对投标人或者项目负责人的考察、质询情况;招标人可以邀请评标专家代表介绍评标情况、专家评审意见及评标结论、提醒注意事项。定标委员会成员有疑问的,可以向招标人或者招标代理机构、评标专家提问。\",
\"定标办法(1)定标会成员根据评标委员会提出书面评标报告,结合定标候选人的投标报价、商务标、技术标、市场信誉等,招标人应当按照充分竞争、合理低价的原则,集体讨论后,采用简单多数原则进行票决,在进入投票范围的定标候选人中,以每人投票支持一个定标候选人的方式,得票最多且过半数的定标候选人为中标人。当没有定标候选人得票超过半数,但有2个定标候选人得票较多时,选择得票较多的2个定标候选人(按上一轮得票多少的顺序选择,在选择第2个定标候选人时出现同票的投标人时,所有同票定标候选人一并纳入下一轮的投票范围)作为二次投票的范围,直至出现得票过半数的定标候选人为止。如果没有2个定标候选人得票较多时,重新投票。(2)定标会由招标人或代理机构的工作人员发放选票、定标会成员填写选票(须说明推荐理由并署名),定标过程公开、公平、公正。定标会成员按有关规定及招标文件约定的定标方法确定一名中标人。投票定标选票招标项目名称:支持的投标人支持理由定标委员签名:时间:本项目采用“评定分离”方法实施招投标活动。本项目定标办法详见第三章附件定标办法。\",
\"定标标准(1)择优要素。招标人在定标前应对评标委员会评审结果与实际情况进行实质性审查核实,重点对投标人的企业实力、企业信誉、履约能力的真实性、准确性、一致性进行核实,招标人应如实记录审查核实情况并作为定标参考。在考虑价格因素时,招标人应坚持投标人投标报价和其履约能力、服务质量等与招标项目相匹配的原则。企业实力包括资质等级、近几年营业额、过往业绩(含业绩影响力、难易程度)等方面。企业信誉包括获得各种荣誉、过往业绩履约情况,同时应重点关注近几年的不良信息,包括建设行政主管部门作出的各种不良处罚以及其他失信记录。对拟派团队履约能力与履约水平考核方式,可以考察团队主要负责人类似业绩情况,也可以对拟派项目负责人进行答辩。为确保可追溯性,答辩工作在有录音、录像场所进行。各项考核动作要针对所有投标人统一进行,不宜针对部分投标人进行考核,以体现公平原则。在同等条件下,择优的相对标准有以下几个方面:1)投标报价:各定标候选人的报价结合其履约能力,服务质量等与招标项目相匹配,经综合比较,价格最合理得优;2)工程业绩:综合比较投标人投标人近五年,完成的单项合同额在2000万元以上的装饰装修项目,主要比较项目难易程度和项目造价,工程业绩总造价高且项目难度大的优于工程业绩总造价低且设计难度小的;若总体难度差异不大且造价类似的情况下,业绩数量多的优于业绩数量少的;3)技术方案:对项目理解程度高、与本项目针对性强、技术方案完善且合理性相应程度高的企业优于项目理解程度一般、技术方案基本完善且进度控制一般的企业;4)企业实力:企业财务指标良好(整体营业收入、资产负债率等)的企业优于财务指标一般得企业;以水平相同的情况下,营业收入的优劣为准;5)企业获奖:近五年(指从投标截止日往前推算五年)类似项目获得国家级奖项优于获得省级奖项;6)企业信誉:无不良行为记录企业优于有不良行为记录企业,不良行为记录较轻企业优于不良行为记录较重企业。定标会在评议时优先进行“比优”,无法比优情况下可进行“比劣”,“比劣”可参考以下等要素进行:1)有无串通投标,围标,以行贿等不正当手段谋取中标行为;2)有无挂靠,以他人名义投标,出让或者出租资格、资质证书供他人投标行为;投标人在招标人的项目中有无严重违约或重大工程质量安全问题;投标人在近一年内经查实有以上行为的不确定为中标人。\"
],
\"评标原则\": \"评标活动遵循公平、公正、科学和择优的原则。\",
\"评标\": \"评标委员会按照第三章“评标办法”规定的方法、评审因素、标准和程序对投标文件进行评审。第三章“评标办法”没有规定的方法、评审因素和标准,不作为评标依据。\",
\"评标结果公示\": \"招标人将自收到评标报告之日起3日内,在投标人须知前附表规定的媒介公示中标候选人。公示期不少于3日。投标人或者其他利害关系人对评标结果有异议的,应当在评标结果公示期间提出。招标人自收到异议之日起3日内作出答复;作出答复前,暂停招标投标活动。异议与答复应当通过“电子交易平台”在“异议与答复”菜单以书面形式进行。\",
\"履约能力的审查(如有)\": \"如果中标候选人的经营、财务状况发生较大变化或者存在违法行为,招标人认为可能影响其履约能力的,将在发出中标通知书前报行政监督部门后,召集原评标委员会按照招标文件规定的标准和方法审查确认。\"
},
\"定标\": {
\"评标结果\": \"(1)评标委员会完成评标后,应当向招标人提出书面评标报告,阐明评标委员会对各投标文件的评审和比较意见,并按照招标文件中规定的评标方法,在投标报价合格的基础上,按照最终得分(保留2位小数)由高到低推荐定标候选人。定标候选人不少于3家,不超过5家。投标人的数量少于或者等于10家时,评标委员会推荐的定标候选人数量不超过3家,经评标委员会评审,符合招标文件要求的定标候选人不足3家时,由评标委员会作出是否具备竞争性,如具备竞争性,可继续推荐定标候选人,招标人可继续定标,否则,招标人应重新招标。定标侯选人进入定标程序。(2)经评标委员会评审,符合招标文件要求的定标候选人不足3家时,由评标委员会作出是否具备竞争性,如具备竞争性,可继续推荐定标候选人,招标人可继续定标,否则,招标人应重新招标。(3)招标人应当自收到评标报告之日起3日内公示定标候选人,公示期不少于3日。对评标结果的异议的提出和处理,适用《招标投标法实施条例》第五十四条的规定。评标结果(定标候选人)公示期间,因异议或投诉导致定标候选人少于招标文件规定的数量时,招标人继续定标还是在原评标委员会评审的基础上递补定标候选人由招标人在招标文件中明确。评标结果(定标候选人)公示期间,有定标候选人因异议或投诉并查实被取消中标资格时,若有效定标候选人不少于3家的,不再递补,招标人继续定标;除评标委员会作出具备竞争性情形外,若有效定标候选人少于3家的,按投标人得分高低补足至3家。对于递补的定标候选人需在黄石市公共资源交易信息网公示不少于3日。\"
}
}
# }"
# }  <- closes the sample "opening_bid" payload quoted in the TODO comment above


def main_processing(output_folder, downloaded_file_path, file_type, unique_id, *, submit_interval=1):
    """Analyse a tender document and stream each result section as it completes.

    The input file is preprocessed once, then six independent extraction
    tasks are run on a thread pool.  As soon as a task finishes, its result is
    wrapped as ``{task_name: result}``, passed through
    ``transform_json_values`` and yielded as an SSE-style text frame
    (``"data: <json>\\n\\n"``).  A task that raises is logged and reported as
    a ``{"error": ...}`` frame instead of aborting the whole stream.

    Args:
        output_folder: Directory handed to preprocessing and to the tasks
            that write intermediate files.
        downloaded_file_path: Path of the tender document to analyse.
        file_type: File-type code forwarded unchanged to ``preprocess_files``.
        unique_id: Request identifier used to obtain the per-request logger
            and forwarded to ``preprocess_files``.
        submit_interval: Seconds to wait between two consecutive task
            submissions (default 1, the value that used to be hard-coded).
            Values <= 0 disable the pause.

    Yields:
        str: One ``"data: ...\\n\\n"`` frame per task, in completion order.
    """
    global global_logger
    global_logger = get_global_logger(unique_id)
    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)

    truncate_files = processed_data['truncate_files']
    clause_path = processed_data['clause_path']
    truncate0_jsonpath = processed_data['truncate0_jsonpath']
    knowledge_name = processed_data['knowledge_name']

    # (task name, callable, positional args).  Nothing is submitted yet: the
    # original code submitted every task while building a dict and then tried
    # to unpack that dict's *keys* as (name, func, args), which raised
    # ValueError before any result could be produced.
    tasks = [
        ('base_info', fetch_project_basic_info,
         (knowledge_name, truncate_files[0], output_folder, clause_path)),
        ('review_standards', fetch_review_standards,
         (truncate_files[1], truncate_files[4], knowledge_name, truncate0_jsonpath, clause_path)),
        ('evaluation_standards', fetch_evaluation_standards,
         (truncate_files[1],)),
        ('invalid_requirements', fetch_invalid_requirements,
         (processed_data['invalid_docpath'], output_folder, truncate0_jsonpath, clause_path,
          truncate_files[4])),
        ('bidding_documents_requirements', fetch_bidding_documents_requirements,
         (clause_path,)),
        ('opening_bid', fetch_bid_opening,
         (clause_path,)),
    ]

    try:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Submit the tasks one by one, pausing between submissions so the
            # tasks do not all start at the same instant.
            future_to_key = {}
            for position, (task_name, func, args) in enumerate(tasks):
                if position and submit_interval > 0:
                    time.sleep(submit_interval)
                future_to_key[executor.submit(func, *args)] = task_name

            # Emit results in completion order rather than submission order.
            for future in concurrent.futures.as_completed(future_to_key):
                key = future_to_key[future]
                try:
                    result = future.result()
                    modified_result = transform_json_values({key: result})
                    yield f"data: {json.dumps(modified_result)}\n\n"
                except Exception as exc:
                    # One failing section must not terminate the stream.
                    global_logger.info(f"Error processing {key}: {exc}")
                    yield f"data: {json.dumps({'error': f'Error processing {key}: {str(exc)}'})}\n\n"
    finally:
        # Always drop the temporary knowledge base, including when the
        # consumer closes the generator early or an unexpected error escapes.
        deleteKnowledge(processed_data['knowledge_index'])
+#TODO:近三年业绩可能是按照投标文件的来的/分模块返回结果/对于上传docx文件进行优化 +if __name__ == "__main__": + output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\test" + + # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf") + # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf") + # clause_path = convert_clause_to_json(truncate1, output_folder) + # truncate0_jsonpath = os.path.join(output_folder, "truncate_output3.json") + + start_time = time.time() + file_type=2 + input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\test\\zbtest12.docx" + file_path=main_processing(output_folder,input_file,1,"uuidzyzy11") + + end_time = time.time() + elapsed_time = end_time - start_time # 计算耗时 + print(f"Function execution took {elapsed_time} seconds.") + + diff --git a/flask_app/main/按页读取pdf.py b/flask_app/main/按页读取pdf.py new file mode 100644 index 0000000..21a441f --- /dev/null +++ b/flask_app/main/按页读取pdf.py @@ -0,0 +1,30 @@ +import PyPDF2 +import re # 导入正则表达式库 + +def clean_page_numbers(text): + # 使用正则表达式删除页码 + # 假设页码在文本的最开始,最结尾,或中间(如 /129 或 129) + cleaned_text = re.sub(r'^\s*\d+\s+', '', text) # 删除开头的页码 + cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text) # 删除结尾的页码 + cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text) # 删除形如 /129 的页码 + return cleaned_text + +def extract_text_by_page(file_path): + result = "" + with open(file_path, 'rb') as file: + reader = PyPDF2.PdfReader(file) + num_pages = len(reader.pages) + # print(f"Total pages: {num_pages}") + for page_num in range(num_pages): + page = reader.pages[page_num] + text = page.extract_text() + if text: + cleaned_text = clean_page_numbers(text) + result += cleaned_text + # print(f"Page {page_num + 1} Content:\n{cleaned_text}") + else: + print(f"Page {page_num + 1} is empty or text could not be extracted.") + return result +if __name__ == '__main__': + file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile_tobidders_notice_table.pdf" + extract_text_by_page(file_path) diff --git 
a/flask_app/main/提取打勾符号.py b/flask_app/main/提取打勾符号.py new file mode 100644 index 0000000..069af8e --- /dev/null +++ b/flask_app/main/提取打勾符号.py @@ -0,0 +1,83 @@ +import re +import PyPDF2 +import json + +def extract_key_value_pairs(text): + import re + # 使用正则表达式来找到通常的键值对 + pattern = r'\d+\.\d+\s*([\w\s()\u4e00-\u9fff]+)[\x01\x02☑]([\w\s\u4e00-\u9fff]+)' + matches = re.findall(pattern, text) + results = {} + + for key, value in matches: + # 移除键中的数字和点序号 + key = re.sub(r'^\d+(\.\d+)*\s+', '', key) + # 移除键中的多余空格和特殊字符 + cleaned_key = re.sub(r'\s+', '', key).replace('(', '').replace(')', '') + # 清理值并停止提取到特定标点符号为止 + cleaned_value = re.split(r'[,。、,]', value)[0].strip() + results[cleaned_key] = cleaned_value + + # 处理 '' 或 '☑' 位于行首的特殊情况 + lines = text.split('\n') + previous_lines = [] + last_serial_key = "" + + for line in lines: + if re.match(r'^[\x01\x02☑√]', line): + # 提取当前行的值,直到特定标点符号 + value = re.sub(r'^[\x01\x02☑√]\s*', '', line) + cleaned_value = re.split(r'[,。、,]', value)[0].strip() + + # 使用最后一个包含序号的行作为键名的候选 + if last_serial_key: + # 从最后一个有效序号行提取键名,并在特殊字符前停止 + key = re.sub(r'^\d+(\.\d+)*\s+', '', last_serial_key) + key = re.split(r'[\x01\x02□]', key)[0].strip() # 停止提取到特殊字符为止 + # 处理重复键名的情况 + original_key = key + count = 1 + while key in results: + key = f"{original_key}{' ' * count}" + count += 1 + results[key] = cleaned_value + else: + # 保留最近的几行作为键的候选行 + if re.search(r'\d+\.\d+', line): + last_serial_key = line # 更新最后一个包含序号的行作为键名的候选 + previous_lines.append(line) + if len(previous_lines) > 10: + previous_lines.pop(0) + + return results + + + +def read_pdf_and_judge_main(file_path, output_json_path): + with open(file_path, 'rb') as file: + reader = PyPDF2.PdfReader(file) + num_pages = len(reader.pages) + print(f"Total pages: {num_pages}") + + all_data = {} + for page_num in range(num_pages): + page = reader.pages[page_num] + text = page.extract_text() if page.extract_text() else "" + # 使用正则表达式删除每页开头的页码,格式通常为“数字+空格” + cleaned_text = re.sub(r'^\d+\s+', '', text) + + 
key_value_pairs = extract_key_value_pairs( cleaned_text) + all_data.update(key_value_pairs) + # 和有区别吗   + # 将所有数据保存为 JSON 文件 + with open(output_json_path, "w", encoding="utf-8") as json_file: + json.dump(all_data, json_file, ensure_ascii=False, indent=4) + + print(f"Data extraction complete and saved to '{output_json_path}'.") + + +if __name__ == "__main__": + # 示例调用 + file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_tobidders_notice_table.pdf' + output_json_path = 'judge_exist.json' + read_pdf_and_judge_main(file_path, output_json_path) diff --git a/flask_app/main/文件分类普通版.py b/flask_app/main/文件分类普通版.py new file mode 100644 index 0000000..6055121 --- /dev/null +++ b/flask_app/main/文件分类普通版.py @@ -0,0 +1,350 @@ +# -*- coding: utf-8 -*- +import os +import shutil +from PyPDF2 import PdfReader, PdfWriter + + +def validate_pdf(file_path): + """ 验证PDF文件是否损坏 """ + try: + with open(file_path, "rb") as file: + pdf = PdfReader(file) + return len(pdf.pages) > 0 + except Exception as e: + print(f"Error reading PDF {file_path}: {str(e)}") + return False + + +def truncate_pdf(source_path, target_path, max_pages=15): + """截取PDF文件的前15页并保存""" + try: + with open(source_path, "rb") as infile: + reader = PdfReader(infile) + writer = PdfWriter() + for i in range(min(max_pages, len(reader.pages))): + writer.add_page(reader.pages[i]) + with open(target_path, "wb") as outfile: + writer.write(outfile) + except Exception as e: + print(f"Error processing PDF {source_path}: {str(e)}") + +def copy_file(src, dest): + """ 复制单个文件 """ + os.makedirs(os.path.dirname(dest), exist_ok=True) + shutil.copy2(src, dest) + +def copy_directory(source, destination): + """复制整个目录""" + os.makedirs(destination, exist_ok=True) + for item in os.listdir(source): + src_path = os.path.join(source, item) + dest_path = os.path.join(destination, item) + if os.path.isfile(src_path): + copy_file(src_path, dest_path) + else: + copy_directory(src_path, dest_path) + +def unique_file_name(base_path, 
file_name): + counter = 1 + name_part, extension = os.path.splitext(file_name) + new_file_name = file_name + # 检查文件是否存在,若存在则修改文件名 + while os.path.exists(os.path.join(base_path, new_file_name)): + new_file_name = f"{name_part}_{counter}{extension}" + counter += 1 + return new_file_name + +def process_pdf_folders(source_dir, target_dir): + """ 处理源目录中的PDF文件,并基于条件选择目标文件夹 """ + for root, dirs, files in os.walk(source_dir, topdown=False): + for file in files: + if file.lower().endswith('.pdf'): + source_file_path = os.path.join(root, file) + if validate_pdf(source_file_path): + relative_path = os.path.relpath(root, source_dir) + target_file_dir = os.path.join(target_dir, relative_path) + target_file_path = os.path.join(target_file_dir, file) + copy_file(source_file_path, target_file_path) + else: + print(f"Deleted corrupt file: {source_file_path}") + # 清除空目录 + if not os.listdir(root): + os.rmdir(root) + + +def classify_folders(source_dir, target_dir, project_index): + """Classifies folders and processes files based on specific criteria.""" + temp_dir = os.path.join(source_dir, 'temp') + os.makedirs(temp_dir, exist_ok=True) + + target_project_dir = None + processed_dirs = set() # Set to track processed directories + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path): + continue + + files = [f.lower() for f in os.listdir(subdir_path)] + if 'zb.pdf' in files: + target_project_dir, project_index ,target_zb_name= process_tender_files(subdir_path, temp_dir, target_dir, project_index) + processed_dirs.add(subdir_path) # Mark this directory as processed + elif subdir.lower() == "输出文件": + process_evaluation_files(subdir_path, target_project_dir) + processed_dirs.add(subdir_path) # Mark this directory as processed + + # Process remaining folders, skipping already processed ones + process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs,target_zb_name) + #删除tmp目录 + # 
if os.path.exists(temp_dir): + # shutil.rmtree(temp_dir) + if os.path.exists(source_dir): + shutil.rmtree(source_dir) + +def process_tender_files(subdir_path, temp_dir, target_dir, project_index): + """Processes tender files and returns the target project directory and updated index.""" + zb_path = os.path.join(subdir_path, "zb.pdf") + truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf") + truncate_pdf(zb_path, truncated_zb_path, 30) # Truncate to the first 30 pages + bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1) + project_index += 1 + zb_response = extract_answer(bot_response[1], 1) + zb_response[0]=zb_response[0].replace('/', '-').replace('\\', '-') #用'-'代替'/' ,目录名不允许出现斜杠 + target_zb_name, category = zb_response[2] + ".pdf", zb_response[3] + + new_zb_path = os.path.join(subdir_path, target_zb_name) + os.rename(zb_path, new_zb_path) + + target_category_dir = os.path.join(target_dir, category) + target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1]) + copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹")) + + os.remove(truncated_zb_path) + shutil.rmtree(subdir_path) + return target_project_dir, project_index,zb_response[2] + + +def process_evaluation_files(subdir_path, target_project_dir): + """Processes evaluation folders.""" + copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹")) + shutil.rmtree(subdir_path) + + +def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name): + """Processes remaining folders containing bid files.""" + target_tb_dir = os.path.join(target_project_dir, "投标文件夹") + + for subdir in os.listdir(source_dir): + subdir_path = os.path.join(source_dir, subdir) + if not os.path.isdir(subdir_path) or subdir_path in processed_dirs: + continue # Skip processed directories + + target_tbcom_dir = None # Initialize outside the file loop + + for item in os.listdir(subdir_path): + 
item_src_path = os.path.join(subdir_path, item) + new_name = "truncate_" + item + truncated_tb_path = os.path.join(temp_dir, new_name) + truncate_pdf(item_src_path, truncated_tb_path) + bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2) + project_index += 1 + tb_response = extract_answer(bot_response[1], 2) + if not tb_response: + continue # If there's no response, skip this file + + # Initialize target_tbcom_dir only once based on the first processed file + if not target_tbcom_dir: + target_tb_name, _ = tb_response + if(target_tb_name != target_zb_name and target_tb_name != "未知"): + target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name) + print(target_tbcom_dir) + os.makedirs(target_tbcom_dir, exist_ok=True) + + tb_section = tb_response[1] + ".pdf" + new_tb_path = os.path.join(subdir_path, tb_section) + # 获取唯一文件名 + new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section)) + os.rename(item_src_path, new_tb_path) + + # Remove temporary truncated file + os.remove(truncated_tb_path) + + # Copy the whole directory at once after all files have been processed and renamed + if target_tbcom_dir: + copy_directory(subdir_path, target_tbcom_dir) + shutil.rmtree(subdir_path) # Optionally remove the original directory after copying + + + +import re + + +import re + +def extract_answer(input_string, type): + # 使用正则表达式匹配所需的信息 + # 第一种模式:项目名称、项目编号、招标人、类别 + # 在每个字段值后都添加了\s*以忽略尾随的空格 + pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + # 第二种模式:投标人、类别 + pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*" + + if type == 1: + match = re.search(pattern1, input_string) + if match: + print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}") + return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()] + else: + print("No match found for type 1.") + return [] + elif 
type == 2: + match = re.search(pattern2, input_string) + if match: + # 检查是否包含“投标函”,如果是则替换部分内容为“商务文件” + part = match.group(2).strip() + if "投标函" in part: + part = "商务文件" + return [match.group(1).strip(), part] + else: + print("No match found for type 2.") + return [] + + +from llama_index.readers.dashscope.base import DashScopeParse +from llama_index.readers.dashscope.utils import ResultType +from llama_index.indices.managed.dashscope import DashScopeCloudIndex +from dashscope import Assistants, Messages, Runs, Threads + + +def send_message(assistant, index,documents,message='百炼是什么?'): + print(f"Query: {message}") + + # create thread. + # create a thread. + thread = Threads.create() + + print(thread) + + # create a message. + message = Messages.create(thread.id, content=message) + # create run + + run = Runs.create(thread.id, assistant_id=assistant.id) + print("run:" + str(run)) + + # # get run statue + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + + # wait for run completed or requires_action + run_status = Runs.wait(run.id, thread_id=thread.id) + # print(run_status) + + # if prompt input tool result, submit tool result. + + run_status = Runs.get(run.id, thread_id=thread.id) + # print(run_status) + # verify_status_code(run_status) + + # get the thread messages. 
+ msgs = Messages.list(thread.id) + # print(msgs) + # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4)) + + ans = [] + + print("运行结果:") + for message in msgs['data'][::-1]: + ans.append(message['content'][0]['text']['value']) + print("content: ", message['content'][0]['text']['value']) + print("\n") + deleteFileFromKnowledge(index,documents) + return ans + + +def file_parse(filepath, knowledge_index, type): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + index = DashScopeCloudIndex("文件分类临时知识库") + index._insert(documents) + retriever = index.as_retriever() + pipeline_id = str(retriever.pipeline_id) + assistant = Assistants.create( + model='qwen-max', + name='smart helper', + description='智能助手,支持知识库查询和插件调用。', + instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}', + tools=[ + { + "type": "code_interpreter" + }, + { + "type": "rag", + "prompt_ra": { + "pipeline_id": pipeline_id, + "parameters": { + "type": "object", + "properties": { + "query_word": { + "type": "str", + "value": "${document1}" + } + + } + } + } + }] + ) + questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\ + 货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\ + 请按下列格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘" + questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’" + if (type == 1): + questions = questions1 + elif (type == 2): + questions = questions2 + return send_message(assistant,index,documents, message=questions) + +def deleteFileFromKnowledge(index,documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + 
for document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + index.delete_ref_doc(file_ids) + +def process_directory(base_directory, intermediate_directory, final_directory): + # 遍历base_directory下的所有子目录 + for subdir in os.listdir(base_directory): + subdir_path = os.path.join(base_directory, subdir) + # 确保是目录 + if os.path.isdir(subdir_path): + # 处理每个项目的目录 + process_pdf_folders(subdir_path, intermediate_directory) + classify_folders(intermediate_directory, final_directory, 0) + + # 清理临时目录以备下一个项目使用 + if os.path.exists(intermediate_directory): + shutil.rmtree(intermediate_directory) + +def main(base_directory, intermediate_directory, final_directory): + # 确保中间目录和最终目录存在 + os.makedirs(intermediate_directory, exist_ok=True) + os.makedirs(final_directory, exist_ok=True) + process_directory(base_directory, intermediate_directory, final_directory) + +#TODO:后期可加多线程进行处理 +if __name__ == "__main__": + base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023' + intermediate_directory = 'D:\\tmp' + final_directory = 'D:\\output' + main(base_directory, intermediate_directory, final_directory) #处理多级目录 + # process_pdf_folders(base_directory, intermediate_directory) ##处理单级目录 + # classify_folders(intermediate_directory, final_directory, 0) diff --git a/flask_app/main/无效标和废标和禁止投标整合.py b/flask_app/main/无效标和废标和禁止投标整合.py new file mode 100644 index 0000000..6ba0152 --- /dev/null +++ b/flask_app/main/无效标和废标和禁止投标整合.py @@ -0,0 +1,302 @@ +# -*- coding: utf-8 -*- +import json +import os.path +import time +import re +from json_utils import combine_json_results, nest_json_under_key +from 通义千问long import upload_file, qianwen_long +from concurrent.futures import ThreadPoolExecutor +from 禁止投标情形 import find_forbidden + +#如果当前段落有序号,则向下匹配直接遇到相同的序号样式 +#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 +def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): 
+ from collections import OrderedDict + from docx import Document + import re + + doc = Document(doc_path) + extracted_paragraphs = OrderedDict() # 使用OrderedDict保持顺序 + continue_collecting = False + current_section_pattern = None + active_key = None # 用来标记当前正在收集的文本块的键 + + def match_keywords(text, patterns): + """使用正则表达式匹配关键词。""" + return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) + + def extract_from_text(text, index): + nonlocal continue_collecting, current_section_pattern, active_key + if text == "": # Skip empty lines + return + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + active_key = None # 结束当前的收集 + else: + if active_key is not None: + extracted_paragraphs[active_key].append(text) + + if match_keywords(text, keywords): + active_key = text # 设置当前的关键词块 + extracted_paragraphs[active_key] = [text] # 初始化列表以收集文本,包括当前匹配的文本 + # 检查是否也匹配后续关键词 + if match_keywords(text, follow_up_keywords): + continue_collecting = True + # 设置跟踪模式 + section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text) + if section_number: + current_section_number = section_number.group(1) + level_count = current_section_number.count('.') + pattern = r'\b' + (r'\d+\s*\.\s*') * level_count + r'\d+\b' + current_section_pattern = re.compile(pattern) + else: + found_next_number = False + current_section_pattern = None + + for next_index in range(index + 1, len(doc.paragraphs)): + next_text = doc.paragraphs[next_index].text.strip() + if not found_next_number: + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text) + if next_section_number: + found_next_number = True + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + current_section_pattern = re.compile(dynamic_pattern) + + if current_section_pattern and re.match(current_section_pattern, next_text): + 
extracted_paragraphs[active_key].append(next_text) # 持续收集 + + for index, para in enumerate(doc.paragraphs): + extract_from_text(para.text.strip(), index) + + return extracted_paragraphs # 返回字典,键是关键词,值是相关文本列表 + +def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 + all_texts1 = [] + all_texts2=[] + # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 + split_pattern = r'(?<=[。!?\!\?])' + + for key, text_list in extracted_contents.items(): + if len(text_list) == 1: + for data in text_list: + # 检查是否包含任何需要排除的字符串 + if any(exclude in data for exclude in excludes): + continue # 如果包含任何排除字符串,跳过这个数据 + # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + data = re.sub(pattern, '', data).strip() + keyword_match = re.search(keywords, data) + if keyword_match: + # 从关键词位置开始查找结束标点符号 + start_pos = keyword_match.start() + # 截取从关键词开始到后面的内容 + substring = data[start_pos:] + # 按定义的结束标点分割 + sentences = re.split(split_pattern, substring, 1) + if len(sentences) > 0 and sentences[0]: + # 只取第一句,保留标点 + cleaned_text = data[:start_pos] + sentences[0] + else: + cleaned_text = data # 如果没有标点,使用整个字符串 + else: + # 如果没有找到关键词,保留原文本 + cleaned_text = data + all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表 + + else: + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + data = re.sub(pattern, '', text_list[0]).strip() + # 将修改后的第一个元素和剩余的元素连接起来 + text_list[0] = data # 更新列表中的第一个元素 + joined_text = "\n".join(text_list) # 如果列表中有多个元素,则连接它们 + all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中 + + return all_texts1,all_texts2 +def find_sentences_with_keywords(data, keywords, follow_up_keywords): + """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" + sentences1 = [] # 保存没有后续关键词的情况 + sentences2 = [] # 保存有后续关键词的情况 + + if isinstance(data, dict): + for value in data.values(): + result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, list): + for item 
in data: + result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, str): + # 分割句子,保证句子完整性 + split_sentences = re.split(r'(?<=[。!?\!\?])', data) + i = 0 + while i < len(split_sentences): + sentence = split_sentences[i] + if re.search(keywords, sentence, re.IGNORECASE): + follow_up_present = any( + re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) + if follow_up_present: + # 如果存在后续关键词,则从当前位置开始截取 + start_index = i + end_index = start_index + found_next_section = False + for j in range(start_index + 1, len(split_sentences)): + if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): + end_index = j + found_next_section = True + break + + if found_next_section: + full_text = ' '.join(split_sentences[start_index:end_index]).strip() + else: + full_text = ' '.join(split_sentences[start_index:]).strip() + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + data=re.sub(pattern,'',full_text) + sentences2.append(data) # 存储有后续关键词的情况 + i = end_index if found_next_section else len(split_sentences) + else: + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + data = re.sub(pattern, '', sentence).replace('\n','').strip() + sentences1.append(data) # 存储没有后续关键词的情况 + i += 1 + else: + i += 1 + + return sentences1, sentences2 # 返回两个列表 + +def extract_sentences_from_json(json_path, keywords,follow_up_keywords): + with open(json_path, 'r', encoding='utf-8') as file: + data = json.load(file) + """从JSON数据中提取包含关键词的句子。""" + return find_sentences_with_keywords(data, keywords,follow_up_keywords) + +#处理无效投标 +def extract_values_if_contains(data, includes): + """ + 递归检查字典中的值是否包含列表 'includes' 中的内容。 + 如果包含,将这些值添加到一个列表中并返回。 + + 参数: + data (dict): 字典或从 JSON 解析得到的数据。 + includes (list): 包含要检查的关键词的列表。 + + 返回: + list: 包含满足条件的值的列表。 + """ + included_values = [] # 初始化结果列表 + + # 定义递归函数来处理嵌套字典 + def recursive_search(current_data): + if 
isinstance(current_data, dict): + for key, value in current_data.items(): + if isinstance(value, dict): + # 如果值是字典,递归搜索 + recursive_search(value) + elif isinstance(value, str): + # 如果值是字符串,检查是否包含任何 includes 中的关键词 + if any(include in value for include in includes): + included_values.append(value) + elif isinstance(current_data, list): + for item in current_data: + # 如果是列表,递归每个元素 + recursive_search(item) + + # 开始递归搜索 + recursive_search(data) + + return included_values + + +#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。 +#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 +#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", + +def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): + excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] + follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] + extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果 + all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 + all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) + qianwen_txt = all_texts1 + all_tables1 + # Proceed only if there is content to write + if qianwen_txt: + with open(output_file, 'w', encoding='utf-8') as file: + # 初始化一个计数器 + counter = 1 + for content in qianwen_txt: + file.write("..............."+'\n') + # 写入内容前加上序号,后面接一个点和空格,然后是内容 + file.write(f"{counter}. 
{content}\n") + # 更新计数器,每次循环递增 + counter += 1 + file_id = upload_file(output_file) + print("starting qianwen-long...") + qianwen_ans = qianwen_long(file_id, user_query) + selected_contents = [] + num_list = json.loads(qianwen_ans) + print(num_list) + for index in num_list: + if index - 1 < len(qianwen_txt): + content = qianwen_txt[index - 1] # 转换序号为索引(假设序号从1开始) + selected_contents.append(content) + selected_contents += all_texts2 + selected_contents += all_tables2 + # 创建一个字典来保存结果 + res = {result_key: selected_contents} + # 将结果转换为JSON字符串 + # os.remove(output_file) # Remove the file after use + # print(f"Deleted temporary file: {output_file}") + else: + res = {result_key: ""} # Set the response to empty if no contents were extracted + return res + +def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,truncate4): + print("starting无效标与废标...") + queries = [ + (r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。", + os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"), + (r'废\s*标', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。", + os.path.join(output_dir, "temp2.txt"), "废标项") + ] + results = [] + + # 使用线程池来并行处理查询 + with ThreadPoolExecutor() as executor: + futures = [] + for keywords, user_query, output_file, result_key in queries: + future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords, + truncate_json_path) + futures.append(future) + time.sleep(1) # 暂停1秒后再提交下一个任务 + + for future in futures: + results.append(future.result()) + + forbidden_res = find_forbidden(truncate_json_path, clause_path, truncate4) + results.append(forbidden_res) + + combined_dict = {} + for d in results: + combined_dict.update(d) + + print("无效标与废标done...") + return 
nest_json_under_key(combined_dict, "无效标与废标项") + + +#TODO:1.运行时间约80s,如果成为短板需要优化多线程 2.没有提取评标办法前附表中的表格 3.提取表格时根据中文的句号分割 4.qianwen-long存在bug +if __name__ == '__main__': + start_time = time.time() + truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json" + clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json" + truncate4="C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件" + doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx' + results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate4) + end_time = time.time() + print("Elapsed time:", str(end_time - start_time)) + print("Results:", results) diff --git a/flask_app/main/根据条款号整合json.py b/flask_app/main/根据条款号整合json.py new file mode 100644 index 0000000..d8b6fd3 --- /dev/null +++ b/flask_app/main/根据条款号整合json.py @@ -0,0 +1,133 @@ +import json + +def load_json(file_path): + """ + 加载JSON文件,并统一其中的括号为全角括号。 + """ + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + return standardize_brackets_in_json(data) + +def standardize_brackets_in_json(data): + """ + 递归地处理JSON数据,将所有文本中的半角括号转换为全角括号。 + """ + if isinstance(data, dict): + return {k: standardize_brackets_in_json(v) for k, v in data.items()} + elif isinstance(data, list): + return [standardize_brackets_in_json(element) for element in data] + elif isinstance(data, str): + return standardize_brackets(data) + else: + return data + +def convert_dict_to_str(d): + if isinstance(d, dict): + return "\n".join(f"{k}: {v}" for k, v in d.items()) + return str(d) + + +def find_entries_in_jsons(entries, json_primary, json_secondary): + results = {} + for entry in entries: + key, value = next(iter(entry.items())) + combined_value = [] + # 先尝试在json_primary中寻找,如果找不到再在json_secondary中查找 + found_in_primary = process_json_with_subentries(json_primary, value, combined_value) + if not found_in_primary: + 
process_json_with_subentries(json_secondary, value, combined_value) + + if combined_value: + results[key] = "\n".join(combined_value) + return results + +def process_json_with_subentries(json_data, value, combined_value): + """ + 处理JSON数据,寻找指定的条目,考虑全角和半角括号。 + """ + value = standardize_brackets(value) + if "(" in value and ")" in value: + base_key, subentry_key = value.split("(") + subentry_key = "(" + subentry_key + content = json_data.get(base_key.strip()) + if content: + if isinstance(content, str): + extracted_content = extract_specific_subentry(content, subentry_key) + if extracted_content: + combined_value.append(extracted_content) + return True + else: + return False + else: + return process_json(json_data, value, combined_value) + + +def process_json(json_data, value, combined_value): + found_subentries = check_and_collect_subentries(json_data, value, combined_value) + if not found_subentries: + content = json_data.get(value, "") + if content: + combined_value.append(get_values_only(content)) + return True + return found_subentries + +def check_and_collect_subentries(json_data, value, combined_value): + found_subentries = False + subentry_index = 1 + for subkey in json_data: + if subkey.startswith(value + "."): + content = json_data[subkey] + combined_value.append(f"{subentry_index}. 
{get_values_only(content)}") + subentry_index += 1 + found_subentries = True + return found_subentries + +def extract_specific_subentry(content, subentry_key): + """ + 提取指定的子条目文本,考虑全角和半角括号。 + """ + subentry_index = subentry_key.replace("(", "").replace(")", "") + try: + idx = int(subentry_index) + bracket_pattern = f"({idx})" + parts = content.split(bracket_pattern) + if len(parts) > 1: + next_bracket_pattern = f"({idx+1})" + next_part = parts[1].split(next_bracket_pattern, 1)[0] + return next_part.strip() + except ValueError: + return "" + return "" + +def get_values_only(content): + if isinstance(content, dict): + return " / ".join(content.values()) + return content + +def standardize_brackets(value): + """ + 将输入中的所有半角括号转换为全角括号。 + """ + return value.replace('(', '(').replace(')', ')') + +def process_and_merge_entries(entries_with_numbers, primary_json_path, secondary_json_path): + primary_json_data = load_json(primary_json_path) + secondary_json_data = load_json(secondary_json_path) + combined_results = find_entries_in_jsons(entries_with_numbers, primary_json_data, secondary_json_data) + return combined_results + +if __name__ == "__main__": + # Hypothetical entries and file paths for testing + # entries_with_numbers = [{'形式评审标准.投标文件签字盖章': '3.7.3(3)'}, {'形式评审标准.多标段投标': '10.1'}, {'形式评审标准.“技术暗标”': '3.7.4(5)'}, {'响应性评审标准.投标内容': '1.3.1'}, {'响应性评审标准.工期': '1.3.2'}, {'响应性评审标准.工程质量': '1.3.3'}, {'响应性评审标准.投标有效期': '3.3.1'}, {'响应性评审标准.投标保证金': '3.4.1'}, {'响应性评审标准.分包计划': '1.11'}] + entries_with_numbers=[{'xxx': '3.7.4(5)'}] + primary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\truncate_output3.json' + secondary_json_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\clause3.json' + + # Since this is just a test block, make sure these paths point to actual JSON files with the appropriate structure + try: + combined_results = process_and_merge_entries(entries_with_numbers, primary_json_path, secondary_json_path) + print("Combined Results:", 
json.dumps(combined_results, indent=4, ensure_ascii=False)) + except FileNotFoundError: + print("One or more JSON files were not found. Please check the file paths.") + except json.JSONDecodeError: + print("One or more files could not be decoded. Please check the file content.") diff --git a/flask_app/main/知识库操作.py b/flask_app/main/知识库操作.py new file mode 100644 index 0000000..e383b02 --- /dev/null +++ b/flask_app/main/知识库操作.py @@ -0,0 +1,57 @@ +import os +import uuid + +from llama_index.readers.dashscope.base import DashScopeParse +from llama_index.readers.dashscope.utils import ResultType +from llama_index.indices.managed.dashscope import DashScopeCloudIndex +from 删除知识库 import delete_index, create_client + + +def addfileToKnowledge(filepath,knowledge_name): + parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND) + documents = parse.load_data(file_path=filepath) + index = DashScopeCloudIndex.from_documents( + documents, + knowledge_name, + verbose=True, + ) + # index = DashScopeCloudIndex(knowledge_name) + # index._insert(documents) + # return index, documents + return index + +def deleteKnowledge(index): + retriever = index.as_retriever() + index_id = str(retriever.pipeline_id) + workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID') + client = create_client() + delete_index(client,workspace_id,index_id) + + + +def deleteFileFromKnowledge(index, documents): + # 初始化一个列表来存储所有文档的 ID + file_ids = [] + # 检查documents是否为列表且不为空 + if isinstance(documents, list) and documents: + # 遍历每个文档 + for document in documents: + # 使用属性访问方式获取每个文档的 id_ + # 确保 document 对象有一个名为 id_ 的属性 + file_id = getattr(document, 'id_', None) # 使用 getattr 防止属性不存在时抛出异常 + if file_id: + file_ids.append(file_id) # 将 id 添加到列表中 + print("deleted successfully") + index.delete_ref_doc(file_ids) + + +# 示例用法 +if __name__ == "__main__": + filepath = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf" + unique_id = str(uuid.uuid4()) + knowledge_name="招标解析"+unique_id + # index = 
addfileToKnowledge(filepath,knowledge_name) + index = DashScopeCloudIndex("招标解析e8cc45f4-cd41-47cf-a5e6-2b7885debfff") + # 删除文件 + # deleteFileFromKnowledge(index, document) + deleteKnowledge(index) diff --git a/flask_app/main/禁止投标情形.py b/flask_app/main/禁止投标情形.py new file mode 100644 index 0000000..089f78c --- /dev/null +++ b/flask_app/main/禁止投标情形.py @@ -0,0 +1,151 @@ +import ast +import json +import os +import re + +from PyPDF2 import PdfWriter, PdfReader + +from 通义千问long import upload_file, qianwen_long +from json_utils import clean_json_string + +def extract_and_format_from_paths(json_paths, includes): + """ + 从多个 JSON 文件路径读取数据,提取包含特定关键词的内容,并按照格式要求合并。 + + 参数: + json_paths (list): 包含多个 JSON 文件路径的列表。 + includes (list): 包含要检查的关键词的列表。 + + 返回: + list: 包含所有文件中满足条件的格式化字符串列表。 + """ + all_formatted_results = [] + + # 遍历每个文件路径 + for path in json_paths: + try: + with open(path, 'r', encoding='utf-8') as file: + # 加载 JSON 数据 + json_data = json.load(file) + formatted_results = [] + + # 遍历 JSON 数据的每个键值对 + for key, value in json_data.items(): + if isinstance(value, dict): + # 如果值是字典,检查嵌套字典的每个键值对 + for sub_key, sub_value in value.items(): + if any(include in sub_value for include in includes): + # 如果子值包含关键词,格式化并添加到结果列表 + formatted_results.append(f"{sub_key}: {sub_value}") + elif isinstance(value, str): + # 如果值是字符串,直接检查是否包含关键词 + if any(include in value for include in includes): + # 如果值包含关键词,添加到结果列表 + formatted_results.append(value) + + # 将当前文件的结果添加到总结果列表 + all_formatted_results.extend(formatted_results) + except FileNotFoundError: + print(f"Error: The file '{path}' does not exist.") + except json.JSONDecodeError: + print(f"Error: The file '{path}' contains invalid JSON.") + + return all_formatted_results + +def extract_unique_items_from_texts(texts): + pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\))\s*') + intro_pattern = re.compile(r'^.*[::]') + punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$') + + all_results = [] + seen = set() + + for text in texts: + text = 
intro_pattern.sub('', text) + items = pattern.split(text) + + for item in items: + cleaned_item = item.strip() + if cleaned_item: + cleaned_item = pattern.sub('', cleaned_item) + cleaned_item = punctuation_pattern.sub('', cleaned_item) + cleaned_item = cleaned_item.strip() + if cleaned_item and cleaned_item not in seen: + seen.add(cleaned_item) + all_results.append(cleaned_item) + + return all_results + +def merge_pdfs(paths, output_filename): + pdf_writer = PdfWriter() + output_path = None + + for path in paths: + pdf_reader = PdfReader(path) + for page in range(len(pdf_reader.pages)): + # 将每页添加到写入对象中 + pdf_writer.add_page(pdf_reader.pages[page]) + + if output_path is None: + # 确定输出文件路径 + output_path = os.path.join(os.path.dirname(path), output_filename) + + # 写入合并的PDF到文件 + if output_path: + with open(output_path, 'wb') as out: + pdf_writer.write(out) + print(f"Merged PDF saved to {output_path}") + else: + print("No files to merge.") + return output_path + +def process_string_list(string_list): + # 使用正则表达式匹配方括号内的内容 + match = re.search(r'\[(.*?)\]', string_list) + if match: + # 获取匹配的内容,即方括号内的部分 + content_inside_brackets = match.group(1) + if content_inside_brackets: # 检查内容是否为空 + # 将每个项目用引号包裹,并确保适当的空格和逗号处理 + formatted_list = '[' + ', '.join(f"'{item.strip()}'" for item in content_inside_brackets.split(',') if item.strip()) + ']' + else: + return [] # 直接返回空列表如果内容为空 + # 使用 ast.literal_eval 来解析格式化后的字符串 + try: + actual_list = ast.literal_eval(formatted_list) + return actual_list + except SyntaxError as e: + print(f"Error parsing list: {e}") + return [] + else: + # 如果没有匹配到内容,返回空列表 + return [] +def find_forbidden(truncate_json_path,clause_path,truncate4): #投标人须知前附表 条款 评分前附表和资格审查表中 + # output_filename="merged.pdf" + # paths=[truncate1,truncate4] + # merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。 + file_id=upload_file(truncate4) + #user_query_forbidden = 
"该招标文件规定的投标人不得存在的其他情形有哪些,请按json列表格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关\"信誉要求\"的内容,若文件中未说明,请在键值中填'未知'。" + user_query_forbidden = "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。" + qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden) + print(qianwen_forbidden_str) + actual_list=process_string_list(qianwen_forbidden_str) + print(actual_list) + + includes = ["不得存在", "禁止投标"] + forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes) + processed_results = extract_unique_items_from_texts(forbidden_results) + print(processed_results) + merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results)) + forbidden_dict={'不得存在的其他情形':merged_forbidden_list} + + return forbidden_dict + + +if __name__ == '__main__': + truncate_json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json" + clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\clause.json" + truncate4 = "C:\\Users\\Administrator\\Desktop\\招标文件\\zbtest12_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件" + doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03.docx' + find_forbidden(truncate_json_path,clause_path,truncate4) \ No newline at end of file diff --git a/flask_app/main/读取docx.py b/flask_app/main/读取docx.py new file mode 100644 index 0000000..b36bf65 --- /dev/null +++ b/flask_app/main/读取docx.py @@ -0,0 +1,18 @@ +from docx import Document + +def read_docx(file_path): + # 尝试打开文档 + try: + doc = Document(file_path) + except Exception as e: + print(f"Error opening file: {e}") + return + + # 读取文档中的所有段落并打印它们 + for para in doc.paragraphs: + print(para.text) + + +if __name__ == "__main__": + file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx" + read_docx(file_path) diff --git a/flask_app/main/资格审查模块.py b/flask_app/main/资格审查模块.py new file mode 100644 index 0000000..45ea6f4 --- /dev/null +++ b/flask_app/main/资格审查模块.py @@ -0,0 +1,35 @@ +import json +import os + 
+from 投标人须知正文条款提取成json文件 import convert_clause_to_json +from json_utils import nest_json_under_key, extract_content_from_json +from 形式响应评审 import process_reviews +from 资格评审 import process_qualification +from 通义千问long import upload_file, qianwen_long + + +def combine_review_standards(truncate1,truncate4,knowledge_name,truncate0_jsonpath,clause_path): #评标办法前附表 + # 形式评审、响应评审:千问 + print("starting形式响应评审...") + file_id=upload_file(truncate1) #评标办法前附表 + user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" + results = qianwen_long(file_id, user_query_1) + original_dict_data = extract_content_from_json(results) + qualification_review = original_dict_data.pop('资格评审标准', '默认值或None') + final_qualify_json=process_qualification(qualification_review,truncate4,knowledge_name) + form_response_dict=process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_path) + print("形式响应评审done") + form_response_dict.update(final_qualify_json) + return nest_json_under_key(form_response_dict,"资格审查") + + +if __name__ == "__main__": + output_folder = "C:\\Users\\Administrator\Desktop\\fsdownload\\temp1" + truncate1 = os.path.join(output_folder, "ztbfile_tobidders_notice_table.pdf") + knowledge_name="zbfile" + truncate2=os.path.join(output_folder,"ztbfile_evaluation_method.pdf") + truncate4=os.path.join(output_folder,"ztbfile_qualification.pdf") + clause_path = convert_clause_to_json(truncate2, output_folder) + truncate1_jsonpath = os.path.join(output_folder, "truncate_output.json") + res=combine_review_standards(truncate2,truncate4, knowledge_name,truncate1_jsonpath,clause_path) + print(res) \ No newline at end of file diff --git a/flask_app/main/资格评审.py b/flask_app/main/资格评审.py new file mode 100644 index 0000000..b753641 --- /dev/null +++ b/flask_app/main/资格评审.py @@ -0,0 +1,87 @@ +#资格审查中,首先排除'联合体投标'和'不得存在的情况',有'符合'等的,加入matching_keys列表,否则保留原字典 +import re + +from json_utils import 
clean_json_string, combine_json_results, add_keys_to_json +from 多线程提问 import multi_threading, read_questions_from_file +from 通义千问long import upload_file + +def merge_dictionaries_under_common_key(dicts, common_key): + # 初始化一个空字典来保存合并的结果 + merged_dict = {common_key: {}} + + # 遍历列表中的每个字典 + for d in dicts: + if common_key in d: + # 使用字典解包来合并字典 + merged_dict[common_key].update(d[common_key]) + else: + print(f"Warning: Dictionary does not contain the key {common_key}") + + return merged_dict +def generate_qual_question(matching_keys_list): + questions=[] + # 将列表转换为单引号包裹的格式,并用逗号和空格分隔 + formatted_keys = ["'{}'".format(key) for key in matching_keys_list] + # 将格式化后的关键词列表连接成字符串 + keys_string = ", ".join(formatted_keys) + # 构造完整的问题语句 + question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string}," + "请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。") + question2="该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则以“未知”填充。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。" + questions.append(question1) + questions.append(question2) + return questions +def extract_matching_keys_qual(dict_data): + # 定义包含模式的列表 + include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"符合")] + # 初始化列表,用于存储匹配的键 + matching_keys = [] + non_matching_keys = {} + # 定义排除项 + excludes = ['联合体', '禁止投标', '不存在', '不得存在','资格','管理机构','负责人'] + # 遍历字典中的每个键值对 + for key, value in dict_data.items(): + # 检查键是否包含任何排除项 + if any(ex in key for ex in excludes): + continue # 如果包含排除项,则跳过当前键值对 + # 检查值是否符合任何一个包含模式 + if any(pattern.search(value) for pattern in include_patterns): + # 如果匹配,将键添加到列表中 + matching_keys.append(key) + else: + # 如果不匹配,将键值对添加到不匹配字典中 + non_matching_keys[key] = value + return matching_keys,non_matching_keys #matching:['资质条件', '财务状况'] non_matching_keys:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} + +def process_qualification(qualification_review,truncate4,knowledge_name): + 
# 资格评审 + matching_keys_list, non_matching_dict = extract_matching_keys_qual(qualification_review) + user_querys = generate_qual_question(matching_keys_list) # 生成提问->附件:资格审查 + file_id2 = upload_file(truncate4) + results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表 + res_list = [] + if not results2: + print("errror!") + else: + # 打印结果 + for question, response in results2: + cleaned_res = clean_json_string(response) + res_list.append(cleaned_res) # 都是问资格评审表得出的 + merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') + qualify_list = [] + # qualification_review_file_path = '../static/提示词/资格评审问题.txt' # 替换为你的txt文件路径 + qualification_review_file_path='static/提示词/资格评审问题.txt' + qualification_review_questions = read_questions_from_file(qualification_review_file_path) # 联合体投标 + results1 = multi_threading(qualification_review_questions, knowledge_name) + for _, response in results1: # _占位,代表ques;response[0]也是ques;response[1]是ans + try: + if response and len(response) > 1: # 检查response存在且有至少两个元素 + qualify_list.append(response[1]) + else: + print(f"Warning: Missing or incomplete response data for query index {_}.") + except Exception as e: + print(f"Error processing response for query index {_}: {e}") + qualify_combined_dict = combine_json_results(qualify_list) + updated_qualify_json = add_keys_to_json(merged_dict, qualify_combined_dict) # 合并字典 + final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict) + return final_qualify_json \ No newline at end of file diff --git a/flask_app/main/资格评审前判断.py b/flask_app/main/资格评审前判断.py new file mode 100644 index 0000000..9a220d5 --- /dev/null +++ b/flask_app/main/资格评审前判断.py @@ -0,0 +1,29 @@ +from 按页读取pdf import extract_text_by_page + +def check_strings_in_pdf(file_path): + judge_list=['施工机械设备', '企业信息登记'] + # Read text from PDF + text = extract_text_by_page(file_path) # Assuming this returns all text from the PDF + full_text = ''.join(text).replace('\n', '').replace(' ', '') # Clean up the text + 
+ # Initialize the questions list + ques_list = [] + + # Check for each string in the judge_list and construct questions accordingly + if judge_list[0] in full_text: + ques_list.append(f"该招标文件对于'{judge_list[0]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[0]}',若存在未知信息,在对应的键值中填'未知'。") + if len(judge_list) > 1 and judge_list[1] in full_text: + ques_list.append(f"该招标文件对于'{judge_list[1]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[1]}',若存在未知信息,在对应的键值中填'未知'。") + + if not ques_list: + return None + return ques_list + +# Test cases or example usage +if __name__ == '__main__': + file_path = 'C:/Users/Administrator/Desktop/zbtest18_39-45.pdf' # Replace with your actual PDF file path + judge_list = ['施工机械设备', '企业信息登记'] # List of strings to check in the PDF + + questions = check_strings_in_pdf(file_path, judge_list) + for question in questions: + print(question) diff --git a/flask_app/main/转化格式/__init__.py b/flask_app/main/转化格式/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flask_app/main/转化格式/check_status.py b/flask_app/main/转化格式/check_status.py new file mode 100644 index 0000000..7a90c7a --- /dev/null +++ b/flask_app/main/转化格式/check_status.py @@ -0,0 +1,30 @@ +# filename: check_status.py +import hashlib +import email.utils +import http.client +import json + +def get_download_url(task_id): + app_id = 'SX20240723LAKILA' + app_key = 'mIwDAgJZIZEUsOZatRrCvhtMkaxGdWbq' + get_uri = f"/api/developer/v1/tasks/convert/to/docx/{task_id}" + current_time = email.utils.formatdate(usegmt=True) + content_md5 = hashlib.md5(get_uri.encode('utf-8')).hexdigest() + data = app_key + content_md5 + "application/json" + current_time + sha1_hex = hashlib.sha1(data.encode('utf-8')).hexdigest() + authorization_header = f"WPS-2:{app_id}:{sha1_hex}" + conn = http.client.HTTPSConnection("solution.wps.cn") + headers = { + 'Date': current_time, + 'Content-Md5': content_md5, + 'Content-Type': "application/json", + 'Authorization': authorization_header + } + conn.request("GET", get_uri, 
headers=headers) + res = conn.getresponse() + data = res.read() + response_json = json.loads(data.decode("utf-8")) + return response_json['data']['download_url'] + + + diff --git a/flask_app/main/转化格式/download.py b/flask_app/main/转化格式/download.py new file mode 100644 index 0000000..9e5a789 --- /dev/null +++ b/flask_app/main/转化格式/download.py @@ -0,0 +1,41 @@ +import requests +import mimetypes +import os + +def download_file(url, local_filename): + try: + with requests.get(url, stream=True) as response: + response.raise_for_status() # 确保请求成功,否则抛出异常 + + # 获取文件类型并设置适当的文件扩展名 + content_type = response.headers.get('Content-Type') + extension = mimetypes.guess_extension(content_type, strict=False) + if not extension: + # 如果无法猜测扩展名,默认使用 .bin + extension = '.bin' + full_filename = local_filename + extension # 追加扩展名 + + with open(full_filename, 'wb') as file: + for chunk in response.iter_content(chunk_size=8192): + file.write(chunk) + + print(f"File downloaded successfully and saved as {full_filename}") + return full_filename # 返回文件的完整路径 + except requests.HTTPError as e: + print(f"HTTP Error: {e}") + return None + except requests.RequestException as e: + print(f"Error downloading the file: {e}") + return None + except Exception as e: + print(f"An error occurred: {e}") + return None + +if __name__ == '__main__': + # 测试下载的URL + test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/28f7c0af7c7041bbbdf88ce6848e8a38.pdf?Expires=1722165340&OSSAccessKeyId=TMP.3KfNYFQchGtZWbjd2M1jR6y7PPqYTq1QLZ4pzbfEwkz3LwGLepVvr9371bndcRoMhHFhohaUJxrhiL63jKoAZk6VWQfwh4&Signature=RmktXAOwEbP1BBrkSfARfHtuXh8%3D" + # 基本的本地文件名,不包括扩展名 + local_file_name = 'C:\\Users\\zhangsan\\Desktop\\temp\\downloaded_file' + file_path = download_file(test_url, local_file_name) + if file_path: + print(f"Downloaded file path: {file_path}") diff --git a/flask_app/main/转化格式/main_pdf_to_docx.py b/flask_app/main/转化格式/main_pdf_to_docx.py new file mode 100644 index 0000000..a8ab7f9 --- /dev/null +++ 
b/flask_app/main/转化格式/main_pdf_to_docx.py @@ -0,0 +1,39 @@ +import time + +from submit_conversion import submit_conversion_task +from check_status import get_download_url +from download import download_file + +def download_pdf_convert_docx(url, downloaded_filename): + """ + Download a PDF from a URL, convert it to a DOCX, and save it locally. + + Args: + url (str): The URL of the PDF to be downloaded. + downloaded_filename (str): The filename to save the converted DOCX as. + """ + # 提交转换任务并获取task_id + task_id = submit_conversion_task(url) + if task_id: + download_url = None + # 使用while循环进行每秒的查询 + while not download_url: + time.sleep(0.5) + download_url = get_download_url(task_id) + + # 如果得到下载链接 + if download_url: + # 下载文件 + download_file(download_url, downloaded_filename) + print(f'File downloaded and saved as {downloaded_filename}') + else: + print("Failed to get download URL.") + else: + print("Failed to submit conversion task.") + +if __name__ == "__main__": + # PDF文件URL + pdf_url = "https://temp-pdf2docx.oss-cn-wuhan-lr.aliyuncs.com/pdf/02cf0a7a8cda432a8ba7a929862510eb.pdf?Expires=1724035295&OSSAccessKeyId=TMP.3Kj9nRWk3bspYRpZJJeKSSDjuoiSsd1SYBnHtac62JciczGbftutcSUcM5RpLTQNQXeANRNbdSxK2VnX9cQZ9bUgR3dWDv&Signature=MJfXEZe1fy5CEIoJ1IxhliSv0Ss%3D" + # pdf_url="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile.pdf" + downloaded_filename = 'downloaded_document.docx' + download_pdf_convert_docx(pdf_url, downloaded_filename) diff --git a/flask_app/main/转化格式/pdf2doc.py b/flask_app/main/转化格式/pdf2doc.py new file mode 100644 index 0000000..7073049 --- /dev/null +++ b/flask_app/main/转化格式/pdf2doc.py @@ -0,0 +1,48 @@ +import requests +import os + + +def convert_pdf_to_word(file_path, output_dir, output_format='docx'): + """ + Converts a PDF file to a Word document using a specified API. + + :param file_path: Path to the PDF file to convert. + :param output_dir: Directory to save the converted Word document. 
+ :param output_format: Format of the output Word document ('docx' or 'doc'). + :return: None + """ + url = 'http://192.168.0.40:5000/api/v1/convert/pdf/word' + + # Prepare the files and data dictionary for the multipart/form-data request + with open(file_path, 'rb') as file_handle: + files = { + 'fileInput': (os.path.basename(file_path), file_handle, 'application/pdf') + } + data = { + 'outputFormat': output_format + } + + # Make the POST request + response = requests.post(url, files=files, data=data) + + # Check the response + if response.status_code == 200: + print("Request was successful.") + + # Determine the output filename based on the input filename but with the new extension + output_filename = os.path.splitext(os.path.basename(file_path))[0] + '.' + output_format + output_path = os.path.join(output_dir, output_filename) + + # Save the output to a new file + with open(output_path, 'wb') as f: + f.write(response.content) + print(f"Output file saved to: {output_path}") + else: + print("Failed to make request:", response.status_code, response.text) + + +# Example usage: +if __name__ == '__main__': + file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\招标文件" + convert_pdf_to_word(file_path, output_dir, 'doc') diff --git a/flask_app/main/转化格式/submit_conversion.py b/flask_app/main/转化格式/submit_conversion.py new file mode 100644 index 0000000..10f3062 --- /dev/null +++ b/flask_app/main/转化格式/submit_conversion.py @@ -0,0 +1,37 @@ +# filename: submit_conversion.py +import http.client +import email.utils +import hashlib +import json + +def submit_conversion_task(file_url): + app_id = 'SX20240723LAKILA' + app_key = 'mIwDAgJZIZEUsOZatRrCvhtMkaxGdWbq' + current_time = email.utils.formatdate(usegmt=True) + payload = json.dumps({"url": file_url}) + md5_hasher = hashlib.md5() + md5_hasher.update(payload.encode('utf-8')) + content_md5 = md5_hasher.hexdigest() + content_type = "application/json" + 
signing_string = app_key + content_md5 + content_type + current_time + hasher = hashlib.sha1() + hasher.update(signing_string.encode('utf-8')) + signature = hasher.hexdigest() + authorization_header = f"WPS-2:{app_id}:{signature}" + conn = http.client.HTTPSConnection("solution.wps.cn") + headers = { + 'Date': current_time, + 'Content-Md5': content_md5, + 'Content-Type': content_type, + 'Authorization': authorization_header + } + conn.request("POST", "/api/developer/v1/office/pdf/convert/to/docx", payload, headers) + res = conn.getresponse() + data = res.read() + response_json = json.loads(data.decode("utf-8")) + return response_json['data']['task_id'] + +if __name__ == "__main__": + file_url="C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\zbfile.pdf" + submit_conversion_task(file_url) + diff --git a/flask_app/main/通义千问.py b/flask_app/main/通义千问.py new file mode 100644 index 0000000..cfe7165 --- /dev/null +++ b/flask_app/main/通义千问.py @@ -0,0 +1,48 @@ +import json +import random +from http import HTTPStatus +from dashscope import Generation + +def call_with_messages(messages): + response = Generation.call(model="qwen-max", + messages=messages, + seed=random.randint(1, 10000), + temperature=0.5, + top_p=0.5, + top_k=50, + result_format='message') + if response.status_code == HTTPStatus.OK: + content = response.output['choices'][0]['message']['content'] + return content + else: + raise Exception(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}') + +def prepare_question_from_json(json_path, prompt): + with open(json_path, 'r', encoding='utf-8') as file: + json_data = json.load(file) + question = json.dumps(json_data, ensure_ascii=False) + prompt + return question + +#专用于判断是否 +def qianwen_ask(json_path, prompt): + messages = [] + question = prepare_question_from_json(json_path, prompt) + messages.append({'role': 'system', 'content': 'You are a helpful assistant.'}) + 
messages.append({'role': 'user', 'content': question}) + return call_with_messages(messages) + +#通用问题 +def qianwen_ask2(questions): + messages = [] + messages.append({'role': 'system', 'content': 'You are a helpful assistant.'}) + messages.append({'role': 'user', 'content': questions}) + return call_with_messages(messages) + +if __name__ == '__main__': + json_path = 'judge_exist.json' + prompt = "请你依据以上信息回答,是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否需要递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在未知或者矛盾信息,请回答'未知'。" + try: + content = qianwen_ask(json_path, prompt) + print(content) + except Exception as e: + print(f"An error occurred: {e}") diff --git a/flask_app/main/通义千问long.py b/flask_app/main/通义千问long.py new file mode 100644 index 0000000..ecdd8ab --- /dev/null +++ b/flask_app/main/通义千问long.py @@ -0,0 +1,67 @@ +import time +from pathlib import Path +from openai import OpenAI +import os + +def upload_file(file_path): + """ + Uploads a file to DashScope and returns the file ID. + """ + client = OpenAI( + api_key=os.getenv("DASHSCOPE_API_KEY"), + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + ) + file = client.files.create(file=Path(file_path), purpose="file-extract") + return file.id + +def qianwen_long(file_id, user_query): + print("call qianwen-long...") + """ + Uses a previously uploaded file to generate a response based on a user query. 
+ """ + client = OpenAI( + api_key=os.getenv("DASHSCOPE_API_KEY"), + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1" + ) + + # Generate a response based on the file ID + completion = client.chat.completions.create( + model="qwen-long", + top_p=0.5, + temperature=0.5, + messages=[ + { + 'role': 'system', + 'content': f'fileid://{file_id}' + }, + { + 'role': 'user', + 'content': user_query + } + ], + stream=False + ) + + # Return the response content + return completion.choices[0].message.content + +if __name__ == "__main__": + # Example file path - replace with your actual file path + + file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\output1\\ztb_evaluation_method.pdf" + file_id = upload_file(file_path) + + user_query1 = ("根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在嵌套键如'技术标'中新增键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容") + user_query2 = ("请提供文件中关于资格审查的具体内容和标准。") + start_time=time.time() + # First query + print("starting qianwen-long...") + result1 = qianwen_long(file_id, user_query1) + print("First Query Result:", result1) + + # # Second query + # print("starting qianwen-long...") + # result2 = qianwen_long(file_id, user_query2) + # print("Second Query Result:", result2) + # end_time=time.time() + # print("elapsed time:"+str(end_time-start_time)) diff --git a/flask_app/static/提示词/前两章提问总结.txt b/flask_app/static/提示词/前两章提问总结.txt new file mode 100644 index 0000000..ab917c1 --- /dev/null +++ b/flask_app/static/提示词/前两章提问总结.txt @@ -0,0 +1,31 @@ +1.该招标文件的工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?请按json格式给我提供信息,键名分别是'工程名称','招标编号','招标人','招标代理机构',若存在未知信息,在对应的键值中填'未知'。 + +#该招标文件的工程概况(或项目概况)是?招标范围是?招标控制价(可指代投标限价、投资概算金额、工程概算金额、合同估算价,但非监理费用)是?该项目的计划工期(监理服务期)是?该项目是否接受联合体投标?请按json格式给我提供信息,键名分别为'工程概况','招标范围','招标控制价','计划工期','是否接受联合体投标',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知','是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。 
+2.该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。 + +3.该招标文件的招标控制价(可指代投标限价、投资概算金额、工程概算金额、合同估算价,但非监理费用)是?该项目是否接受联合体投标?请按json格式给我提供信息,键名分别为'招标控制价','是否接受联合体投标',若存在未知信息,在对应的键值中填'未知','是否接受联合体投标'的键值仅限于'是'、'否'、'未知'。 + +4.投标文件递交截止日期是?递交方式是?请按json格式给我提供信息,键名分别是'投标文件递交截止日期','递交方式',若存在未知信息,在对应的键值中填'未知'。 + +5.招标人和招标代理机构的联系方式是?请按json格式给我提供信息,键名分别是'招标人联系方式','招标代理机构联系方式',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。 + +##8.该项目的开标时间和地点是?请按json格式给我提供信息,键名为'开标时间'和'开标地点',若存在未知信息,在对应的键值中填'未知'。 + +##(三个问题分开问)根据第二章投标人须知的内容,该招标文件是否允许分包? 是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?你需要留意☑后的内容。请按json格式给我提供信息,键名分别为'是否允许分包','是否递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知'。可以一起问,设置摘取分段为8,仍存在问题:pdf转word文件打勾符号可能会无法正常显示,解决思路1:根据原pdf进行提取 + +6.该招标文件的评标结果(定标候选人)公示媒介在哪?请按json格式给我提供信息,键名是'评标结果公示媒介',若存在未知信息,在对应的键值中填'未知'。 + +7.该招标文件的投标竞争下浮率是多少?请按json格式给我提供信息,键名是'投标竞争下浮率',若存在未知信息,在对应的键值中填'未知'。 + +#11.该招标文件的投标竞争下浮率是多少?请按json格式给我提供信息,键名是'投标竞争下浮率',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。 + +8.该项目的投标有效期是什么?请按json格式给我提供信息,键名是'投标有效期',若存在未知信息,在对应的键值中填'未知'。 +#该招标中对于实质性要求(废标项)的内容有哪些?规定投标人不得存在的情形有哪些?文件中提及的否决和无效投标情形有哪些?请以json格式返回结果,键名分别为'实质性要求','不得存在的情形','否决和无效投标情形',若存在未知信息,请在对应键值中填'未知',你的回答一切以原文内容为准,不可改动。 + +#8.该招标文件的电子招标文件获取方式是?请按原文段落全部完整内容回答,以json的格式给我提供信息,键名是'电子招标文件获取方式',若存在未知信息,在对应的键值中填'未知'。 + +9.该招标文件对投标人准备和参加投标活动发生的费用是如何规定的?请以json的格式给我提供信息,键名是'费用承担',若存在未知信息,在对应的键值中填'未知'。 + +10.投标人要求澄清招标文件的截止时间是?请以json的格式给我提供信息,键名是'投标人要求澄清招标文件的截止时间',若存在未知信息,在对应的键值中填'未知'。 + +11.该文档要求扣留的质量保证金百分比是多少,请以json格式给我提供信息,键名为'质量保证金',如果没有则以'未知'填充。 diff --git a/flask_app/static/提示词/是否相关问题.txt b/flask_app/static/提示词/是否相关问题.txt new file mode 100644 index 0000000..71d5c11 --- /dev/null +++ b/flask_app/static/提示词/是否相关问题.txt @@ -0,0 +1,10 @@ +#pdf提取之后的提示词,调用普通通义千问: +#请你依据以上信息回答,是否允许分包?
是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否允许分包','是否递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知'。 +1.该招标文件对于分包的要求是怎样的?请按json格式给我提供信息,键名为'分包'。 +2.根据招标文件第二章投标人须知,该项目投标保证金需要缴纳金额是多少?到账截止时间是?缴纳形式是?请按json格式给我提供信息,外层键名为'投标保证金',嵌套键名分别为'缴纳金额','到账截止时间','缴纳形式',若存在多种缴纳形式,则在'缴纳形式'下以各种缴纳形式作为嵌套键名,再在对应的缴纳形式下嵌套写出缴纳步骤或要求或账户信息,请详细回答,不要遗漏原文信息。 +3.该招标文件对于投标保证金的退还相关的规章办法是怎样的?请按json格式给我提供信息,键名为'退还投标保证金',若存在嵌套信息,嵌套内容键名以文档中对应字段命名。 +4.根据投标人须知前附表,该项目对于履约保证金(担保)的要求中,它的履约担保形式是怎样的?它的履约担保金额是多少?请按json格式给我提供信息,外层键名为'履约保证金',嵌套键名分别是'履约担保形式','担保金额',若存在多种履约担保形式,则在'履约担保形式'下以各种履约担保形式作为嵌套键名,若存在未知信息,在对应的键值中填'未知'。 +5.本项目的招标代理服务费由谁支付?支付标准是什么?支付方式是什么?支付时间是什么?请按json格式给我提供信息,外层键名为'招标代理服务费',嵌套键名分别是'支付人','支付标准','支付方式','支付时间',若存在未知信息,在对应的键值中填'未知'。 +6.该招标文件对于踏勘现场是怎样的,踏勘时间和踏勘集中地点是?请以json格式给我提供信息,外层键名为'踏勘现场',嵌套键名分别是'踏勘时间','踏勘地点',若存在其他信息,新增嵌套键名'备注',填入其中,若存在未知信息,在对应的键值中填'未知'。 +7.该招标文件对于投标预备会内容是怎样的,召开时间和召开地点是?请以json格式给我提供信息,外层键名为'投标预备会',嵌套键名分别是'召开时间','召开地方',若存在其他信息,新增嵌套键名'备注',填入其中,若存在未知信息,在对应的键值中填'未知'。 +8.本项目可偏离的项目和范围是怎样的?请以json格式给我提供信息,外层键名为'偏离'。 \ No newline at end of file diff --git a/flask_app/static/提示词/第三章提示词.txt b/flask_app/static/提示词/第三章提示词.txt new file mode 100644 index 0000000..2fd7454 --- /dev/null +++ b/flask_app/static/提示词/第三章提示词.txt @@ -0,0 +1,111 @@ +资格评审-》商务文件: +资质要求、财务要求、业绩要求、主要人员要求、信誉要求 + + + +3. 
投标人资格要求 +3.1 资格要求: +(1)本次招标要求投标申请人必须是在中华人民共和国境内注册,并具有独立法人资格的有 +效营业执照、组织机构代码证、税务登记证(或多证合一)。 +(2)投标人必须具备国家行政主管部门核发的工程监理综合资质或房屋建筑工程监理资质甲 +级资质。 +(3)投标人拟派总监理工程师须具备房屋建筑专业注册监理工程师执业资格、工程类相关专 +业的高级及以上职称,并在本单位注册,须提供无在监项目承诺函。 +3.2 业绩要求 +(1) 投标人 2017 年 12 月 1 日至今至少承接过 1 项施工合同投资金额达 1 亿元及以上的房 +屋建筑工程监理业绩 ,并在人员、设备、资金等方面具有相应的监理能力。 +(2)拟派项目总监理工程师 2017 年 12 月 1 日至今至少承接 1 项施工合同投资金额达 1 亿元 +及以上的房屋建筑工程监理业绩。 +3.3 财务能力要求 +(1)提供近三年(2019、2020、2021 年)财务审计报告(新成立的公司需提供自成立之日起 +相应年度的财务审计报告),且近 3 年均无亏损(新成立的公司自成立之日起年度起)。 +3.5 本次招标(接受或不接受)联合体投标:不接受。 + +资质条件:见本章附件 +主要人员要求:见本章附件 +信誉要求:见本章附件 +其他要求:见本章附件 + + +1.4 投标人资格要求(适用于未进行资格预审的) +1.4.1 投标人应具备承担本标段监理的资质条件、能力和信誉。 +(1)资质条件:见投标人须知前附表; +(2)主要人员要求:见投标人须知前附表; +(3)信誉要求:见投标人须知前附表; +(4)其他要求:见投标人须知前附表。 + + + +基本信息: +工程名称(项目名称)是?招标编号是?招标人是?招标代理机构是?投标文件(递交)截止日期是?招标范围是?招标限价(工程、投资概算)是多少?投标保证金是多少? +请以键值对的形式给出回答,要求简洁准确,不多余 + + +##资格评审:(在第二章附件:投标人资质条件、能力和信誉(资格审查标准)表中) + +营业执照:具备有效的营业执照 + +安全生产许可证:具备有效的安全生产许可证 + +资质要求: +该招标文件对于投标人的资质条件(等级)是怎样的,要求给出完整资质要求内容、需要提交的证明材料,并按json格式给我提供信息,外层键名为'资质条件',对于各个'资质条件',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答有关业绩要求、财务要求、主要人员要求、信誉要求的内容。 + +业绩要求: +该招标文件对于投标人的业绩要求是怎样的,至少需要包含业绩时间范围、业绩内容(如有)、金额要求(如有)、需要提交的证明材料、备注(其他关于业绩要求的内容,如有),请按json格式给我提供信息,最外层键名为'业绩要求',若相关要求不存在,则在对应的键值中填'未知'。 + +财务要求: +该招标文件对于投标人的财务要求是怎样的,要求给出财务报告的时间范围、营收(若利润)要求、需要提交的证明材料、备注(其他关于财务要求的内容,如有),请按json格式给我提供信息,最外层键名为'财务要求',若相关要求不存在,则在对应的键值中填'未知'。 + +信誉要求: +该招标文件对于投标人的信誉要求是怎样的。请按json格式给我提供信息,键名为'信誉要求',对于各个'信誉要求',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答无关信誉要求的内容。 + +(存在问题)主要人员要求: +该招标文件对于投标人的项目经理(监理)和技术负责人的要求是怎样的,请依次给出需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于项目经理(监理)和技术负责人要求的信息),以json的形式给出,若相关要求不存在,则以“未知”填充。 + +该招标文件对于项目管理机构中除项目经理和技术负责人外的其他人员要求是怎样的,请依次给出所需的岗位、对应的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于人员要求的信息),以json的形式给出,若相关要求不存在,则以“未知”填充。 + +是否有'施工机械设备'和'企业信息登记' +该招标文件对于'施工机械设备'和'企业信息登记'的要求是怎样的,请按json格式给我提供信息,若存在未知信息,在对应的键值中填'未知'。 + + +(需要与第一章对应)联合体投标: +该招标文件是否接受联合体投标? 
+该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求'。 + +(需跳转)禁止投标的情形: +在资格审查中,该招标文件规定的投标人不得存在的情形有哪些,请按json格式给我提供信息,键名为'禁止投标的情形'。 + +根据该招标文件的第三章评标办法前附表,对投标文件的评分分值构成是?请以json的格式返回。如果没有评分分值构成,请回答我该文件对投标文件评分标准是?这里的标准不需要具体展开。 + + +##形式评审: +该招标文件的形式评审标准是怎样的?以n行3列表格形式给出,表头为序号、评审因素和评审标准,不要回答有关资格评审标准、响应性评审标准相关的内容,尽量用原文内容进行表述。 + + +## 响应性评审 +该招标文件的响应性评审标准是怎样的?以n行3列表格形式给出,表头为序号、评审因素和评审标准,不要回答有关资格评审标准、形式评审标准相关的内容,尽量用原文内容进行表述。 + +prompt=""" +# 角色 +你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 + +## 技能 +### 技能 1:文档解析与摘要 +- 深入理解并分析${documents}的内容,提取关键信息。 +- 根据需求生成简洁明了的摘要,保持原文核心意义不变。 + +### 技能 2:信息检索与关联 +- 在${documents}中高效检索特定信息或关键词。 +- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。 + +## 限制 +- 所有操作均需基于${documents}的内容,不可超出此范围创造信息。 +- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。 +- 确保所有生成或改编的内容逻辑连贯,无误导性信息。 + +请注意,上述技能执行时将直接利用并参考${documents}的具体内容,以确保所有产出紧密相关且高质量。 +""" + + +投标内容==招标范围、监理服务期、监理工作范围、投标有效期、投标保证金、算术错误修正、投标价格(报价)、其他、工期、工程质量==(质量,质量标准)、权利义务、已标价工程量清单、技术标准和要求、招标人不能接受的条件、分包计划、重大偏差 + diff --git a/flask_app/static/提示词/资格评审问题.txt b/flask_app/static/提示词/资格评审问题.txt new file mode 100644 index 0000000..076813f --- /dev/null +++ b/flask_app/static/提示词/资格评审问题.txt @@ -0,0 +1,35 @@ +#资质要求: +#1.该招标文件对于投标人的资质条件(等级)是怎样的,要求给出完整资质要求内容、需要提交的证明材料,并按json格式给我提供信息,外层键名为'资质条件',对于各个'资质条件',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答有关业绩要求、财务要求、主要人员要求、信誉要求的内容。 + +#业绩要求: +#2.该招标文件对于投标人的业绩要求是怎样的,至少需要包含业绩时间范围、业绩内容(如有)、金额要求(如有)、需要提交的证明材料、备注(其他关于业绩要求的内容,如有),请按json格式给我提供信息,最外层键名为'业绩要求',若相关要求不存在,则在对应的键值中填'未知'。 + +#财务要求: +#3.该招标文件对于投标人的财务要求是怎样的,要求给出财务报告的时间范围、营收(若利润)要求、需要提交的证明材料、备注(其他关于财务要求的内容,如有),请按json格式给我提供信息,最外层键名为'财务要求',若相关要求不存在,则在对应的键值中填'未知'。 + +#信誉要求: +#4.该招标文件对于投标人的信誉要求是怎样的,请按json格式给我提供信息,键名为'信誉要求',对于各个'信誉要求',嵌套键名为'内容'和'证明材料',若存在未知信息,在对应的键值中填'未知',不要回答无关信誉要求的内容。 + +#(存在问题)主要人员要求: +#5.该招标文件对于投标人的项目经理(监理)和技术负责人的要求是怎样的,请依次给出需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于项目经理(监理)和技术负责人要求的信息),以json的形式给出,键名分别是"项目经理"和"技术负责人",若相关要求不存在,则以“未知”填充。 + 
+#6.该招标文件对于项目管理机构中除项目经理和技术负责人外的其他人员要求是怎样的,请依次给出所需的岗位、对应的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注(除上述要求外文档中提及的其他关于人员要求的信息),以json的形式给出,最外层键名为"其他人员",嵌套的键名为为具体的岗位名称,若相关要求不存在,则以“未知”填充。 + +#(需要与第一章对应)联合体投标: +#该招标文件是否接受联合体投标? +7.该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)'。 + +#(需跳转)禁止投标的情形: +#8.该招标文件规定的投标人不得存在的其他情形有哪些,请按json格式给我提供信息,键名为'不得存在的其他情形',请你不要回答有关"信誉要求"的内容。 + +#9.在资格评审标准中,除了'资质要求','业绩要求','财务要求','信誉要求','人员要求','联合体投标要求','禁止投标的情形',还有其他要求吗,请按json格式给我提供信息,最外层键名为'其他要求',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。注意不要回答'形式评审标准'和'响应性评审标准'的内容。 + +#施工机械设备、企业信息登录 +#9.该招标文件对于'施工机械设备'的要求是怎样的,请按json格式给我提供信息,键名为'施工机械设备',若存在未知信息,在对应的键值中填'未知'。 +#10.该招标文件对于'企业信息登录'的要求是怎样的,请按json格式给我提供信息,键名为'企业信息登录',若存在未知信息,在对应的键值中填'未知'。 + +#该招标文件中资格评审的内容是怎样的?具体内容包括'资质条件','财务状况','类似业绩','信誉','施工机械设备','其他要求',请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。 + +#该招标文件中资格评审中有关'项目经理资格','设计负责人资格','施工负责人资格','项目管理机构及人员'的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则以“未知”填充。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。 + +#该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则以“未知”填充。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。 diff --git a/flask_app/货物标/__init__.py b/flask_app/货物标/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flask_app/货物标/extract_procurement_requirements.py b/flask_app/货物标/extract_procurement_requirements.py new file mode 100644 index 0000000..88a6b04 --- /dev/null +++ b/flask_app/货物标/extract_procurement_requirements.py @@ -0,0 +1,36 @@ +import os +import sys +from 货物标截取pdf import truncate_pdf_main +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +# from ..main.format_change import docx2pdf +# from ..main.多线程提问 import multi_threading +from ..main.通义千问long import upload_file,qianwen_long +from ..main.json_utils import clean_json_string + +def 
# --- flask_app/货物标/extract_procurement_requirements.py ---

def generate_key_paths(data, parent_key=''):
    """Return the dotted paths of all leaf keys in a nested dict.

    Args:
        data: A dict whose values may themselves be dicts.
        parent_key: Path prefix accumulated by the recursion; callers
            normally leave it empty.

    Returns:
        A list with one entry per non-dict value, in iteration order, e.g.
        ``{'a': {'b': 1}, 'c': 2}`` -> ``['a.b', 'c']``. An empty nested
        dict contributes no path.
    """
    key_paths = []
    for key, value in data.items():
        current_key = f"{parent_key}.{key}" if parent_key else key
        if isinstance(value, dict):
            # Nested dict: recurse with the extended prefix.
            key_paths.extend(generate_key_paths(value, current_key))
        else:
            # Leaf reached: record the full path.
            key_paths.append(current_key)
    return key_paths


# Fetch the procurement list
def fetch_purchasing_list(file_path, output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output"):
    """Extract the procurement chapter of a tender PDF and list its items.

    The PDF is truncated with ``truncate_pdf_main(..., 1)``, the resulting
    file is uploaded, the model is asked for a JSON hierarchy under the key
    ``"采购需求"``, and that hierarchy is flattened into dotted key paths.

    Args:
        file_path: PDF file (or folder of PDFs) to process.
        output_folder: Where the truncated PDF is written. Defaults to the
            folder that used to be hard-coded in this function.

    Returns:
        The list of key paths (also printed), or ``None`` when no truncated
        PDF could be produced.
    """
    # file_path = docx2pdf(file_path)
    truncated_files = truncate_pdf_main(file_path, output_folder, 1)
    if not truncated_files:
        print(f"未能从 {file_path} 截取采购要求部分,已跳过。")
        return None
    # truncate_pdf_main returns a *list* of paths; upload_file needs a single
    # path, so passing the list through (as before) could not work.
    # NOTE(review): for a folder input only the first result is used.
    truncate_path = truncated_files[0]
    user_query="这是一份货物标中采购要求部分的内容,你需要摘取出需要采购的系统(货物),一个大系统(大项)中可能包含多个小系统(小项),你需要保留这种层次关系,给出货物名称,请以json格式返回,外层键名为\"采购需求\",嵌套键名为对应的系统名称或货物名称,无需给出采购数量和单位,如有未知内容,在对应键值处填\"未知\"。"
    file_id=upload_file(truncate_path)
    res=qianwen_long(file_id,user_query)
    # NOTE(review): assumes clean_json_string returns a dict containing
    # the key '采购需求' - confirm against its definition.
    cleaned_res=clean_json_string(res)
    keys_list=generate_key_paths(cleaned_res['采购需求'])
    print(keys_list)
    return keys_list


if __name__ == "__main__":
    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\招招招标文件(一中多媒体报告厅教学设备)_20240829101650_tobidders_notice_table.pdf"
    fetch_purchasing_list(file_path)


# --- flask_app/货物标/货物标截取pdf.py ---
import re  # regular expressions
import os  # file and folder operations


def clean_page_numbers(text):
    """Strip page-number artefacts from the text of one PDF page.

    Three substitutions are applied in order:

    1. a leading integer that is followed by a non-digit character;
    2. a trailing integer preceded by whitespace;
    3. every ``/<digits>`` occurrence (e.g. the ``/129`` of ``3/129``).

    NOTE(review): step 3 is not anchored, so it also removes ``/08`` from
    text such as ``2024/08``; step 1 also removes a leading number that is
    real content (``2024年`` -> ``年``).

    Args:
        text: Raw text of a page.

    Returns:
        The text with those patterns removed.
    """
    # Leading page number, only when directly followed by a non-digit.
    cleaned_text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
    # Trailing page number.
    cleaned_text = re.sub(r'\s+\d+\s*$', '', cleaned_text)
    # Page numbers of the form "/129".
    cleaned_text = re.sub(r'\s*\/\s*\d+\s*', '', cleaned_text)
    return cleaned_text


def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Copy the pages between a start marker and an end marker to a new PDF.

    Pages are scanned in order. A page whose cleaned text matches
    ``begin_pattern`` and whose 0-based index is greater than ``begin_page``
    becomes the start page; a later match moves the start forward again.
    The scan stops at the first page that matches ``end_pattern`` and lies
    more than one page after the current start.

    Args:
        pdf_path: Source PDF.
        output_folder: Existing folder for the output file.
        begin_pattern: Regex (string or compiled) marking the first page.
        begin_page: Matches at or before this 0-based index are ignored.
        end_pattern: Regex marking the last page (inclusive).
        output_suffix: Appended to the source base name as
            ``<name>_<suffix>.pdf``.

    Returns:
        The path of the written PDF, or ``None`` when either marker was not
        found.
    """
    # Imported lazily so the pure-text helpers in this module stay usable
    # (and importable) on machines without PyPDF2.
    from PyPDF2 import PdfReader, PdfWriter

    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None
    # Walk every page looking for the start and end markers.
    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_numbers(text)
        if re.search(begin_pattern, cleaned_text) and i > begin_page:
            start_page = i
        if start_page is not None and re.search(end_pattern, cleaned_text) and i > (start_page+1):
            end_page = i
            break
    # Both markers are required.
    if start_page is None or end_page is None:
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
        return None

    # Build the output file name from the source base name (no extension).
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()

    # Copy start_page..end_page, both inclusive.
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)

    print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")

    return output_pdf_path


def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Run :func:`extract_pages` on one PDF or on every PDF in a folder.

    The output folder is created if missing.

    Args:
        input_path: A PDF file or a folder containing PDF files.
        output_folder: Destination folder for the extracted PDFs.
        begin_pattern, begin_page, end_pattern, output_suffix: Forwarded
            unchanged to :func:`extract_pages`.

    Returns:
        A list of the files that were written. The list is empty when the
        path is neither a folder nor a PDF, or when nothing could be
        extracted; previously a failed single-file extraction fell through
        and returned ``None``, which crashed callers that extend a list
        with the result.
    """
    # exist_ok avoids the check-then-create race of the old two-step form.
    os.makedirs(output_folder, exist_ok=True)

    if os.path.isdir(input_path):
        # Only files directly inside the folder are considered (no recursion).
        candidates = [
            os.path.join(input_path, file)
            for file in os.listdir(input_path)
            if file.lower().endswith(".pdf")
        ]
    elif os.path.isfile(input_path) and input_path.lower().endswith(".pdf"):
        candidates = [input_path]
    else:
        print("提供的路径既不是文件夹也不是PDF文件。")
        return []

    generated_files = []
    for pdf_path in candidates:
        output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        if output_pdf_path and os.path.isfile(output_pdf_path):
            generated_files.append(output_pdf_path)
    return generated_files


def truncate_pdf_main(input_path, output_folder, selection):
    """Extract one predefined section from the given PDF(s).

    Args:
        input_path: A PDF file or a folder of PDF files.
        output_folder: Destination folder (created if missing).
        selection: Which section to extract. Only ``1`` is implemented.

    Returns:
        The list returned by :func:`process_input`, or ``None`` (after
        printing a message) when ``selection`` is not supported.
    """
    if selection == 1:
        # From the chapter whose title mentions 项目/服务/商务 ... 要求 up to
        # the chapter on 资格审查 / 评标方法 / 评审办法. The output suffix is
        # kept as "tobidders_notice_table" because callers may rely on the
        # resulting file name.
        begin_pattern = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:项目|服务|商务).*?要求')
        begin_page = 5
        end_pattern = re.compile(r'第[一二三四五六七八九十百千]+章\s*(资格审查|评标方法|评审办法)')
        output_suffix = "tobidders_notice_table"

    else:
        print("无效的选择")
        return None

    # Process the selected input
    return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)


def truncate_pdf_multiple(input_path, output_folder):
    """Run every implemented selection and collect all generated files.

    Args:
        input_path: A PDF file or a folder of PDF files.
        output_folder: Destination folder.

    Returns:
        A flat list of the generated file paths (possibly empty).
    """
    truncate_files = []
    for selection in range(1, 2):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # truncate_pdf_main may return None; extending with None would raise.
        if files:
            truncate_files.extend(files)
    return truncate_files


if __name__ == "__main__":
    input_path = "C:\\Users\\Administrator\\WPSDrive\\292826853\\WPS云盘\\应用\\输出为PDF\\公安局音视频监控系统设备采购项目(定稿)_20240829133603.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    # truncate_pdf_multiple(input_path,output_folder)
    # e.g. 1 - bidder-notice front table, 2 - evaluation method,
    # 3 - bidder-notice body, 4 - tender announcement up to contract terms
    selection = 1
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("生成的文件:", generated_files)