From 906476ad2c4d2bd1fb3b7783eccd4867ba9e8d77 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Mon, 11 Nov 2024 17:12:38 +0800 Subject: [PATCH] =?UTF-8?q?11.8=20=E8=AF=84=E6=A0=87=E4=BF=AE=E6=94=B9=20?= =?UTF-8?q?=E6=8A=80=E6=9C=AF=E5=8F=82=E6=95=B0=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../general/投标人须知正文提取指定内容.py | 2 +- .../投标人须知正文条款提取成json文件.py} | 261 ++-------------- flask_app/general/清除file_id.py | 2 + flask_app/main/table_content_extraction.py | 2 +- flask_app/main/工程标解析main.py | 21 +- flask_app/main/形式响应评审.py | 4 +- flask_app/main/截取pdf.py | 14 +- flask_app/main/投标人须知正文提取指定内容.py | 94 +++--- .../main/投标人须知正文条款提取成json文件.py | 287 ------------------ flask_app/main/提取json工程标版.py | 100 ++++++ flask_app/main/无效标和废标和禁止投标整合.py | 16 +- flask_app/main/解析old.py | 2 +- flask_app/main/资格审查模块.py | 167 ++++++++-- flask_app/main/资格评审.py | 14 +- flask_app/main/资格评审new.py | 120 ++++++++ flask_app/old_version/形式响应评审old.py | 2 +- flask_app/old_version/招标文件解析.py | 2 +- flask_app/old_version/资格审查模块old.py | 2 +- flask_app/货物标/技术参数要求提取.py | 6 +- .../投标人须知正文提取指定内容货物标版.py | 1 - flask_app/货物标/提取json货物标版.py | 194 ++++++++++++ flask_app/货物标/货物标解析main.py | 2 +- flask_app/货物标/资格审查main.py | 19 +- 23 files changed, 692 insertions(+), 642 deletions(-) rename flask_app/{货物标/投标人须知正文条款提取成json文件货物标版.py => general/投标人须知正文条款提取成json文件.py} (61%) delete mode 100644 flask_app/main/投标人须知正文条款提取成json文件.py create mode 100644 flask_app/main/提取json工程标版.py create mode 100644 flask_app/main/资格评审new.py create mode 100644 flask_app/货物标/提取json货物标版.py diff --git a/flask_app/general/投标人须知正文提取指定内容.py b/flask_app/general/投标人须知正文提取指定内容.py index 67292bb..44cb48f 100644 --- a/flask_app/general/投标人须知正文提取指定内容.py +++ b/flask_app/general/投标人须知正文提取指定内容.py @@ -199,7 +199,7 @@ def concatenate_keys_values(section_content): concatenated.append(f"{key} {value}") return concatenated -#生成无结构的数据工程标 +#生成无结构的数据工程标,对提取出的若干键值对,生成外键为target_value,值为列表的新键值对 def extract_sections(data, target_values): """ Extracts sections from the input dictionary where the top-level keys' values diff --git a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py b/flask_app/general/投标人须知正文条款提取成json文件.py similarity index 61% rename from flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py rename to flask_app/general/投标人须知正文条款提取成json文件.py index ffb11be..fd64a94 100644 --- a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -1,90 +1,8 @@ -import json -import docx import re -import os -import fitz from PyPDF2 import PdfReader + from flask_app.货物标.截取pdf货物标版 import clean_page_content,extract_common_header - -def extract_text_from_docx(file_path): - doc = docx.Document(file_path) - return '\n'.join([para.text for para in doc.paragraphs]) - -#PYPDF2版本 -def extract_text_from_pdf(file_path, start_word, end_pattern): - # 从PDF文件中提取文本 - common_header = extract_common_header(file_path) - pdf_document = PdfReader(file_path) - exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') - all_pages_text = [] - start_index = None - # 处理所有页面 - for i, page in enumerate(pdf_document.pages): - page_text = page.extract_text() if page.extract_text() else "" - cleaned_text = clean_page_content(page_text, common_header) - # print(cleaned_text) - # 在第一页查找开始位置 - if i == 0 and start_index is None: - start_match = re.search(start_word, cleaned_text, re.MULTILINE) - if start_match: - start_index = start_match.start() - cleaned_text = cleaned_text[start_index:] - - # 在最后一页查找结束位置 - if i == len(pdf_document.pages) - 1: - matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE)) - if matches: - end_index = matches[-1].start() - cleaned_text = cleaned_text[:end_index] - - all_pages_text.append(cleaned_text) - - # 合并所有页面的文本 - full_text = "\n".join(all_pages_text) - return full_text - -#fitz库版本 -# def extract_text_from_pdf(file_path, start_word, end_pattern): -# # 从PDF文件中提取文本 -# common_header = extract_common_header(file_path) -# doc = fitz.open(file_path) -# all_pages_text = [] -# start_index = None -# -# # 处理所有页面 -# for i in range(len(doc)): -# page = doc[i] -# page_text = page.get_text() -# cleaned_text = clean_page_content(page_text, common_header) -# print(cleaned_text) -# print("yes") -# # 在第一页查找开始位置 -# if i == 0 and start_index is None: -# start_match = re.search(start_word, cleaned_text, re.MULTILINE) -# if start_match: -# start_index = start_match.start() -# cleaned_text = cleaned_text[start_index:] -# -# # 在最后一页查找结束位置 -# if i == len(doc) - 1: -# for pattern in end_pattern: -# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE)) -# if matches: -# end_index = matches[-1].start() -# cleaned_text = cleaned_text[:end_index] -# break -# -# all_pages_text.append(cleaned_text) -# -# # 合并所有页面的文本 -# full_text = "\n".join(all_pages_text) -# # 关闭文档 -# doc.close() -# -# return full_text - - def compare_headings(current, new): # 使用过滤来确保只处理非空且为数字的部分 current_nums = [int(num) for num in current.split('.') if num.isdigit()] @@ -115,16 +33,6 @@ def handle_content_append(current_content, line_content, append_newline, keyword current_content.append('\n') return append_newline -""" -保存换行符的具体逻辑: - -对于二级标题(如 1.1),如果其后的内容包含关键词或内容较短(<=20字符),会在内容前添加一个换行符。 -这个换行符会被保留在 current_content 列表中。 -当处理下一个标题时,之前的内容(包括可能存在的换行符)会被合并并保存到 data 字典中。 - -解决了'一''二'这类标题出现在正文中的情况。但是目前的逻辑是如果'一'已有了,就加入正文,否则'一'作为新的标题。 -""" -#提取json主函数 def parse_text_by_heading(text): keywords = ['包含', '以下'] data = {} @@ -369,145 +277,34 @@ def parse_text_by_heading(text): return data -#type=2时提取货物标的第一章招标公告时采用该逻辑 -def parse_text_to_dict(text): - """ - 解析文本,根据大标题划分内容,生成字典。 +def extract_text_from_pdf(file_path, start_word, end_pattern): + # 从PDF文件中提取文本 + common_header = extract_common_header(file_path) + pdf_document = PdfReader(file_path) + exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') #仅匹配第一页和最后一页,不需要exclusion_pattern + all_pages_text = [] + start_index = None + # 处理所有页面 + for i, page in enumerate(pdf_document.pages): + page_text = page.extract_text() if page.extract_text() else "" + cleaned_text = clean_page_content(page_text, common_header) + # print(cleaned_text) + # 在第一页查找开始位置 + if i == 0 and start_index is None: + start_match = re.search(start_word, cleaned_text, re.MULTILINE) + if start_match: + start_index = start_match.start() + cleaned_text = cleaned_text[start_index:] - 参数: - text (str): 要解析的文本。 + # 在最后一页查找结束位置 + if i == len(pdf_document.pages) - 1: + matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE)) + if matches: + end_index = matches[-1].start() + cleaned_text = cleaned_text[:end_index] - 返回: - dict: 大标题作为键,内容作为值的字典。 - """ - # 正则表达式模式:匹配以一至十的汉字数字开头,后跟顿号和任意字符,且位于行首 - pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE) + all_pages_text.append(cleaned_text) - # 使用 re.finditer 找到所有大标题的位置 - matches = list(pattern.finditer(text)) - - result = {} - for i, match in enumerate(matches): - title = match.group(1).strip() # 获取大标题文本 - start = match.end() # 内容的起始位置 - - if i + 1 < len(matches): - end = matches[i + 1].start() # 下一个大标题的起始位置 - else: - end = len(text) # 最后一个大标题,内容到文本末尾 - - content = text[start:end].strip() # 获取内容并去除前后空白 - # 规范化换行符,并移除每行开头和结尾的空白 - content = content.replace('\r\n', '\n') # 统一换行符 - content = re.sub(r'[ \t]+\n', '\n', content) # 移除行尾空白 - content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE) # 移除行首和行尾空白 - content = clean_content(content) # 处理内容中的换行符 - result[title] = content - - return result -def clean_content(content): - """ - 处理内容中的换行符: - - 保留在子项编号前的换行符。 - - 保留在冒号 ':' 或全角冒号 ':' 前的第一个换行符。 - - 移除其他位置的换行符,不留下额外的空格。 - - 参数: - content (str): 要处理的内容字符串。 - - 返回: - str: 处理后的内容字符串。 - """ - # 定义子项编号的正则模式,包括: - # - 数字+点号+数字(如 1.1 或 1.1) - # - 数字+顿号(如 2、) - # - 点号+数字(如 .3 或 .3) - # - 数字+右括号(如 1) 或 1)) - # - 圆括号包围的数字(如 (5)) - # - 全角圆括号包围的数字(如 (5)) - # - 数字+点号(如 1. 或 1.) - numbering_pattern = r'(?:\d+[..]\d+(?:[..]\d+)*|\d+、|[..]\d+|\d+[))]|\(\d+\)|(\d+)|\d+[..])' - - # 定义需要保留换行符的情况: - # 1. 换行符前面是子项编号 - # 2. 换行符前面是任意非空白字符,且后面跟着 ':' 或 ':' - pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^:\n\r\f\v]+[::]))' - - # 定义占位符,用于暂时替换需要保留的换行符 - placeholder = "___PLACEHOLDER___" - - # Step 1: 将需要保留的换行符替换为占位符 - content_with_placeholder = re.sub(pattern_keep, placeholder, content) - - # Step 2: 移除所有剩余的换行符 - content_no_newlines = content_with_placeholder.replace('\n', '') - - # Step 3: 将占位符替换回换行符 - cleaned_content = content_no_newlines.replace(placeholder, '\n') - - return cleaned_content - -#如果file_path为空,返回"" -def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"): - if not os.path.exists(file_path): - print(f"The specified file does not exist: {file_path}") - return "" - if type == 1: - start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)' - end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$' - text = extract_text_from_pdf(file_path, start_word, end_pattern) - result = parse_text_by_heading(text) - else: - start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$' - end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' - text = extract_text_from_pdf(file_path, start_word, end_pattern) - result=parse_text_to_dict(text) - # result = convert_to_json(input_path, start_word, end_pattern) - # 检查输出文件夹是否存在,如果不存在则创建 - if not os.path.exists(output_folder): - os.makedirs(output_folder) - print(f"Created output folder: {output_folder}") - file_name = "clause1.json" if type == 1 else "clause2.json" - # file_name = f"clause{suffix_counter}.json" - output_path = os.path.join(output_folder, file_name) - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(result, f, indent=4, ensure_ascii=False) - print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.") - return output_path - - -def process_folder(input_folder, output_folder): - # 获取输入文件夹中的所有文件,过滤掉不以 'part2' 结尾的文件 - files = [f for f in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, f)) and f.endswith('part2.pdf')] - - # 遍历文件并对每个文件进行处理 - for file_name in files: - file_path = os.path.join(input_folder, file_name) - # 去掉文件扩展名 - file_name_without_extension = os.path.splitext(file_name)[0] - try: - # 调用 convert_clause_to_json,传递文件路径、输出文件夹和递增的后缀 - output_path = convert_clause_to_json(file_path, output_folder, 1, file_name_without_extension) - print(f"Processed file: {file_name}, JSON saved to: {output_path}") - except ValueError as e: - print(f"Error processing {file_name}: {e}") - -#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏 -#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理 - -if __name__ == "__main__": - # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' - file_path=r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf' - # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' - output_folder = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\tmp' - try: - output_path = convert_clause_to_json(file_path,output_folder,1) - print(f"Final JSON result saved to: {output_path}") - except ValueError as e: - print("Error:", e) - - # input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4' - # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1' - # - # # 调用 process_folder 来处理整个文件夹中的所有文件 - # process_folder(input_folder, output_folder) + # 合并所有页面的文本 + full_text = "\n".join(all_pages_text) + return full_text \ No newline at end of file diff --git a/flask_app/general/清除file_id.py b/flask_app/general/清除file_id.py index ebe8988..cab29ba 100644 --- a/flask_app/general/清除file_id.py +++ b/flask_app/general/清除file_id.py @@ -17,6 +17,8 @@ file_data = json.loads(file_stk.model_dump_json()) # 提取所有文件的 id file_ids = [file["id"] for file in file_data["data"]] +# num=len(file_ids) +# print(num) # 循环删除每个文件 for file_id in file_ids: file_object = client.files.delete(file_id) diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py index 1effed1..f5a2071 100644 --- a/flask_app/main/table_content_extraction.py +++ b/flask_app/main/table_content_extraction.py @@ -130,7 +130,7 @@ def process_all_part1_pdfs(folder_path, output_folder): extract_tables_main(file_path, subfolder_path) if __name__ == "__main__": - path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_tobidders_notice_table.docx' + path =r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_evaluation_method.pdf" output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" # 前附表json文件 res=extract_tables_main("", output_folder) print(res) diff --git a/flask_app/main/工程标解析main.py b/flask_app/main/工程标解析main.py index e4dc8df..098b018 100644 --- a/flask_app/main/工程标解析main.py +++ b/flask_app/main/工程标解析main.py @@ -9,7 +9,7 @@ from flask_app.general.投标人须知正文提取指定内容 import get_requir from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.general.merge_pdfs import merge_pdfs from flask_app.main.table_content_extraction import extract_tables_main -from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice @@ -54,9 +54,8 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): # 处理各个部分 tobidders_notice_table=truncate_files[0] - truncate0_docpath = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx - - truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json + # tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx + # truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_folder) # 投标人须知前附表docx->json tobidders_notice = truncate_files[1] #投标人须知正文 @@ -86,7 +85,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id): 'tobidders_notice': tobidders_notice, 'evaluation_method':evaluation_method, 'qualification': qualification, - 'truncate0_jsonpath': truncate_jsonpath, 'merged_baseinfo_path':merged_baseinfo_path, 'merged_baseinfo_path_more':merged_baseinfo_path_more, 'clause_path': clause_path, @@ -112,14 +110,14 @@ def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo # 形式、响应、资格评审 -def fetch_qualification_review(evaluation_method, qualification, output_folder, truncate0_jsonpath, clause_path, invalid_path, merged_baseinfo_path): +def fetch_qualification_review(evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_path, merged_baseinfo_path): logger.info("starting 资格审查...") start_time = time.time() if not evaluation_method: evaluation_method = invalid_path if not merged_baseinfo_path: merged_baseinfo_path = invalid_path - review_standards_res = combine_review_standards(evaluation_method, qualification, output_folder, truncate0_jsonpath, clause_path, invalid_path, merged_baseinfo_path) + review_standards_res = combine_review_standards(evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_path, merged_baseinfo_path) end_time = time.time() logger.info(f"资格审查 done,耗时:{end_time - start_time:.2f} 秒") return review_standards_res @@ -143,10 +141,10 @@ def fetch_evaluation_standards(invalid_path, evaluation_method): # 无效、废标项解析 -def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, qualification): +def fetch_invalid_requirements(invalid_docpath, output_folder, tobidders_notice_table, clause_path, qualification): logger.info("starting 无效标与废标...") start_time = time.time() - find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, qualification) + find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, tobidders_notice_table, clause_path, qualification) end_time = time.time() logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒") return find_invalid_res @@ -194,12 +192,12 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_ processed_data['tobidders_notice'], processed_data['clause_path']), 'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'], processed_data['qualification'], output_folder, - processed_data['truncate0_jsonpath'], + processed_data['tobidders_notice_table'], processed_data['clause_path'], processed_data['invalid_path'], processed_data['merged_baseinfo_path']), 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['invalid_path'],processed_data['evaluation_method']), 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], - output_folder, processed_data['truncate0_jsonpath'], + output_folder, processed_data['tobidders_notice_table'], processed_data['clause_path'], processed_data['qualification']), 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path']), 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path']) @@ -228,6 +226,7 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_ logger.error(f"Error processing {key}: {exc}") yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False) +#TODO:废标项,针对新文件作优化,统一成货物标的处理逻辑 if __name__ == "__main__": start_time = time.time() output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1" diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py index 7c7b2a1..a72c78c 100644 --- a/flask_app/main/形式响应评审.py +++ b/flask_app/main/形式响应评审.py @@ -7,7 +7,7 @@ import time from flask_app.general.多线程提问 import multi_threading from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.general.json_utils import clean_json_string -from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.general.通义千问long import upload_file from flask_app.general.merge_pdfs import merge_pdfs prompt = """ @@ -312,7 +312,7 @@ def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, claus formatted_questions = formatted_questions1 + formatted_questions2 if formatted_questions: - output_path = fetch_specific_pdf(output_folder) + output_path = fetch_specific_pdf(output_folder) #合并merged_info if output_path: file_id = upload_file(output_path) results = multi_threading(formatted_questions, "", file_id, 2) diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index bc6fdcc..33e94af 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -202,7 +202,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) - if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" + # if is_secondary_match and re.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" + if re.search(exclusion_pattern, cleaned_text): continue if re.search(begin_pattern, cleaned_text) and i >= begin_page: if start_page and (output_suffix == "notice" or output_suffix == "invalid"): @@ -233,8 +234,9 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l try: pdf_document = PdfReader(pdf_path) if output_suffix == "qualification": + print("twice:qualificaiton!") # 动态设置 include_keys - include_keys = ["资格", "资质", "能力", "信誉"] + include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查","能力","信誉"] # 定义起始匹配模式,仅匹配“附录”、“附件”或“附表” begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])' # 定义结束匹配模式 - 附录、附件、附表等(移除负向前瞻) @@ -586,18 +588,18 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu # 投标人须知前附表改为货物标一样的 if __name__ == "__main__": start_time = time.time() - # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf" + # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹" - output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp" + output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\output3" # files=truncate_pdf_multiple(input_path,output_folder) # selections = [4, 1] # 仅处理 selection 4、1 # files=truncate_pdf_specific_engineering(input_path,output_folder,selections) # print(files) - selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 + selection = 3 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 generated_files = truncate_pdf_main(input_path, output_folder, selection) - # print(generated_files) + print(generated_files) # print("生成的文件:", generated_files) end_time = time.time() print("耗时:" + str(end_time - start_time)) diff --git a/flask_app/main/投标人须知正文提取指定内容.py b/flask_app/main/投标人须知正文提取指定内容.py index a20858e..3359b01 100644 --- a/flask_app/main/投标人须知正文提取指定内容.py +++ b/flask_app/main/投标人须知正文提取指定内容.py @@ -1,6 +1,7 @@ import json import re -from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt,extract_sections +from flask_app.general.投标人须知正文提取指定内容 import process_nested_data, transform_json, get_requirements_with_gpt, \ + extract_sections, concatenate_keys_values # 对于每个target_value元素,如果有完美匹配json_data中的键,那就加入这个完美匹配的键名,否则,把全部模糊匹配到的键名都加入 @@ -17,40 +18,7 @@ def find_keys_with_prefix(key_prefix, json_data): subheadings = [k for k in json_data if k.startswith(key_prefix)] return subheadings - -# 从完整的json文件中读取所需数据,eg:投标、评标 -# def extract_json(data, target_values): -# results = {} -# -# # 遍历所有目标值 -# for target_value in target_values: -# # 找到所有与目标值匹配的键 -# matched_keys = find_keys_by_value(target_value, data) -# -# for key in matched_keys: -# # 查找所有以该键为前缀的子键,限制只提取直接子项 -# key_and_subheadings = find_keys_with_prefix(key, data) -# -# for subkey in key_and_subheadings: -# # 如果子键有多级结构(比如 '7.2.1'),并且是直接子项 -# if "." in subkey: -# parent_key = subkey.rsplit('.', 1)[0] -# top_level_key = parent_key.split('.')[0] + '.' -# -# # 确保顶级键不会重复添加 -# if top_level_key not in results: -# results[top_level_key] = data[top_level_key] -# -# # 添加或更新父级键 -# if parent_key not in results: -# if parent_key in data: -# results[parent_key] = data[parent_key] -# -# # 添加当前子键和它的值 -# if subkey in data: -# results[subkey] = data[subkey] -# -# return results +#将 top_level_key 的值设为 target_value。 def extract_json(data, target_values): results = {} for target_value in target_values: @@ -72,6 +40,39 @@ def extract_json(data, target_values): results[subkey] = data[subkey] return results +def extract_between_sections(data, target_values): + target_found = False + extracted_data = {} + current_section_title = "" + section_pattern = re.compile(r'^[一二三四五六七八九十]+$') # 匹配 "一", "二", "三" 等大标题 + current_block = {} + + # 遍历所有键值对 + for key, value in data.items(): + # 只匹配形如 "一": "竞争性磋商响应文件" 的章节标题 + if section_pattern.match(key): + if target_found: + # 如果已经找到了符合的章节,并且遇到了另一个章节 + # 保存当前块并重置 + if current_block: + extracted_data[current_section_title] = current_block + current_block = {} + target_found = False + + # 检查当前标题是否包含 target_values 中的任意关键词 + if any(tv in value for tv in target_values): + target_found = True # 找到了目标章节,开始捕获后续内容 + current_section_title = value # 保存章节标题内容 + + elif target_found: # 只捕获目标值之后的内容 + current_block[key] = value + + # 保存最后一个块(如果有的话) + if current_block: + extracted_data[current_section_title] = current_block + + return extracted_data + def sort_clean_data_keys(data): # 预处理:删除键名中的空格 def preprocess_key(key): @@ -114,13 +115,20 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type): "Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'") with open(clause_path, 'r', encoding='utf-8') as file: data = json.load(file) - extracted_data = extract_json(data, target_values) # 读取json + extracted_data = extract_between_sections(data, target_values) if not extracted_data: - final_result = get_requirements_with_gpt(merged_baseinfo_path, type) # 万一没用正则匹配到,那就调用大模型 + extracted_data = extract_json(data, target_values) # 提取需要的数据 + if not extracted_data: + final_result = get_requirements_with_gpt(merged_baseinfo_path, type) # 万一没用正则匹配到,那就调用大模型 + return final_result + final_result=extract_sections(extracted_data,target_values) #后处理,生成键名 return final_result - # print(json.dumps(extracted_data,ensure_ascii=False,indent=4)) - final_result=extract_sections(extracted_data,target_values) - return final_result + else: + extracted_data_concatenated = { + section: concatenate_keys_values(content) + for section, content in extracted_data.items() + } + return extracted_data_concatenated # print(json.dumps(res, ensure_ascii=False, indent=4)) # sorted_data = sort_clean_data_keys(extracted_data) # 对输入的字典 data 的键进行预处理和排序 # transformed_data = transform_json(sorted_data) @@ -130,10 +138,10 @@ def extract_from_notice(merged_baseinfo_path,clause_path, type): if __name__ == "__main__": # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json' - merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\4e5bc6c2-c2b8-4c0b-8e57-81a498b982f6\\ztbfile_tobidders_notice.pdf" - clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\4e5bc6c2-c2b8-4c0b-8e57-81a498b982f6\\clause1.json" + merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\merged_baseinfo_path_more.pdf" + clause_path=r"D:\flask_project\flask_app\static\output\output1\ea4c3d02-1198-48ab-b841-e62959b3668c\tmp\clause1.json" try: - res = extract_from_notice(merged_baseinfo_path,clause_path, 1) # 可以改变此处的 type 参数测试不同的场景 + res = extract_from_notice(merged_baseinfo_path,clause_path, 3) # 可以改变此处的 type 参数测试不同的场景 res2 = json.dumps(res, ensure_ascii=False, indent=4) print(res2) except ValueError as e: diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py deleted file mode 100644 index 293618b..0000000 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ /dev/null @@ -1,287 +0,0 @@ -import json -import docx -import re -import os -from PyPDF2 import PdfReader -from flask_app.main.截取pdf import clean_page_content,extract_common_header - -def extract_text_from_docx(file_path): - doc = docx.Document(file_path) - return '\n'.join([para.text for para in doc.paragraphs]) - - -# def extract_text_from_pdf(file_path): -# # 从PDF文件中提取文本 -# common_header = extract_common_header(file_path) -# pdf_document = PdfReader(file_path) -# text = "" -# # 遍历每一页 -# for page in pdf_document.pages: -# # 提取当前页面的文本 -# page_text = page.extract_text() if page.extract_text() else "" -# # 清洗页面文本 -# page_text = clean_page_content(page_text, common_header) -# # 将清洗后的文本添加到总文本中 -# text += page_text+"\n" -# return text -def extract_text_from_pdf(file_path, start_word, end_pattern): - # 从PDF文件中提取文本 - common_header = extract_common_header(file_path) - pdf_document = PdfReader(file_path) - all_pages_text = [] - start_index = None - - # 处理所有页面 - for i, page in enumerate(pdf_document.pages): - page_text = page.extract_text() if page.extract_text() else "" - cleaned_text = clean_page_content(page_text, common_header) - - # 在第一页查找开始位置 - if i == 0 and start_index is None: - for pattern in (start_word if isinstance(start_word, list) else [start_word]): - start_match = re.search(pattern, cleaned_text) - if start_match: - start_index = start_match.start() - cleaned_text = cleaned_text[start_index:] - break # 找到一个匹配后跳出循环 - - # 在最后一页查找结束位置 - if i == len(pdf_document.pages) - 1: - matches = list(re.finditer(end_pattern, cleaned_text)) - if matches: - end_index = matches[-1].start() - cleaned_text = cleaned_text[:end_index] - - all_pages_text.append(cleaned_text) - - # 合并所有页面的文本 - full_text = "\n".join(all_pages_text) - return full_text -def extract_section(text, start_pattern, end_phrases): - # 查找开始模式 - start_match = re.search(start_pattern, text) - if not start_match: - return "" # 如果没有找到匹配的开始模式,返回空字符串 - start_index = start_match.end() # 从匹配的结束位置开始 - - # 初始化结束索引为文本总长度 - end_index = len(text) - - # 遍历所有结束短语,查找第一个出现的结束短语 - for phrase in end_phrases: - match = re.search(phrase, text[start_index:], flags=re.MULTILINE) - if match: - end_index = start_index + match.start() # 更新结束索引为匹配到的开始位置 - break # 找到第一个匹配后立即停止搜索 - - # 提取并返回从开始模式后到结束模式前的内容 - return text[start_index:end_index] - -def compare_headings(current, new): - # 使用过滤来确保只处理非空且为数字的部分 - current_nums = [int(num) for num in current.split('.') if num.isdigit()] - new_nums = [int(num) for num in new.split('.') if num.isdigit()] - - # 比较数字序列以确定标题的层次关系 - for c, n in zip(current_nums, new_nums): - if n > c: - return True - elif n < c: - return False - - # 如果新标题有更多层次,认为是新的子章节 - return len(new_nums) > len(current_nums) - - -def should_add_newline(content, keywords, max_length=20): - content_str = ''.join(content).strip() - return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length - -def handle_content_append(current_content, line_content, append_newline, keywords): - if append_newline: - if should_add_newline(current_content, keywords): - current_content.append('\n') # 添加换行符 - append_newline = False - current_content.append(line_content) - return append_newline - -#对二级标题如x.x进行额外处理:如果当前处理内容包含keywords中的内容,则必须保留换行符/如果当前内容字数大于20,不保留换行。 -def parse_text_by_heading(text): - keywords = ['包含', '以下'] - data = {} - current_key = None - current_content = [] - append_newline = False - lines = text.split('\n') - - def check_year_pattern(line): - line_no_spaces = line.replace(' ', '') - return re.match(r'^\d{4}', line_no_spaces) and line_no_spaces[4:5] == '年' - for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n' - line_stripped = line.strip().replace('.', '.') - # 检查是否以年份开头,如果是,属于上一个标题内容 - if check_year_pattern(line_stripped) and current_key is not None: - current_content.append(line_stripped) - continue - # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 - match = re.match(r'^(?json queries = [ ( r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', @@ -392,7 +398,7 @@ def combine_find_invalid(invalid_docpath, output_dir, truncate_json_path, clause output_file, result_key, keywords, - truncate_json_path + truncate_jsonpath ) futures.append((future, result_key)) time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 @@ -408,7 +414,7 @@ def combine_find_invalid(invalid_docpath, output_dir, truncate_json_path, clause # 禁止投标(find_forbidden)部分 try: # print("starting不得存在的情形...") - forbidden_res = find_forbidden(truncate_json_path, clause_path, qualification) + forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification) except Exception as e: print(f"find_forbidden 处理时出错: {e}") forbidden_res = {'不得存在的其他情形': ""} @@ -422,13 +428,13 @@ def combine_find_invalid(invalid_docpath, output_dir, truncate_json_path, clause if __name__ == '__main__': start_time = time.time() - truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" + tobidders_notice_table="" clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" qualification="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' - results = combine_find_invalid(invalid_docpath, output_dir,truncate_json_path,clause_path,qualification) + results = combine_find_invalid(invalid_docpath, output_dir,tobidders_notice_table,clause_path,qualification) end_time = time.time() print("Elapsed time:", str(end_time - start_time)) print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/main/解析old.py b/flask_app/main/解析old.py index e424955..3ba3b10 100644 --- a/flask_app/main/解析old.py +++ b/flask_app/main/解析old.py @@ -5,7 +5,7 @@ import time from concurrent.futures import ThreadPoolExecutor from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.table_content_extraction import extract_tables_main -from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice diff --git a/flask_app/main/资格审查模块.py b/flask_app/main/资格审查模块.py index 08860d3..1acda8f 100644 --- a/flask_app/main/资格审查模块.py +++ b/flask_app/main/资格审查模块.py @@ -1,42 +1,151 @@ import json +import os.path import time -from flask_app.general.json_utils import extract_content_from_json +from flask_app.general.format_change import pdf2docx +from flask_app.general.json_utils import extract_content_from_json, clean_json_string +from flask_app.main.table_content_extraction import extract_tables_main from flask_app.main.形式响应评审 import process_reviews from flask_app.main.资格评审 import process_qualification from flask_app.general.通义千问long import upload_file, qianwen_long from concurrent.futures import ThreadPoolExecutor +from flask_app.货物标.资格审查main import combine_qualification_review +from flask_app.general.merge_pdfs import merge_pdfs +def process_notice(notice_path): + print("call notice_path") + try: + # 上传通知文件并获取文件ID + file_id1 = upload_file(notice_path) + + # 定义用户查询,提取申请人资格要求 + user_query1 = """ + 第一章招标公告(投标邀请书)中说明的申请人资格要求是怎样的?请以json格式给出回答,外键为'申请人资格要求',键值为字符串列表,其中每个字符串对应原文中的一条要求,你的回答与原文内容一致,不要擅自总结删减。输出格式示例如下: + { + "申请人资格要求":[ + "1.满足《中华人民共和国政府采购法》第二十二条规定;", + "1.1 法人或者其他组织的营业执照等证明文件,如供应商是自然人的提供身份证明材料;", + "2.未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、重大税收违法案件当事人名单;" + ] + } + """ + # 执行查询并清洗结果 + res1 = clean_json_string(qianwen_long(file_id1, user_query1)) + # 提取申请人资格要求 + requirements = res1.get("申请人资格要求", "未找到相关内容") + return {"申请人资格要求": requirements} + except Exception as e: + print(f"处理申请人资格要求时出错: {e}") + return {"申请人资格要求": "处理失败"} -def combine_review_standards(evaluation_method,qualification,output_folder,truncate0_jsonpath,clause_path,invalid_path,merged_baseinfo_path): #评标办法前附表 - # 形式评审、响应评审:千问 - file_id=upload_file(evaluation_method) #评标办法前附表 - user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。" - results = qianwen_long(file_id, user_query_1) - original_dict_data = extract_content_from_json(results) - qualification_review = original_dict_data.pop('资格评审标准', {}) #qianwen-long有关资格评审的内容 - with ThreadPoolExecutor() as executor: - # 创建Future对象 - future_qualification = executor.submit(process_qualification, qualification_review, qualification,invalid_path,merged_baseinfo_path) - future_form_response = executor.submit(process_reviews, original_dict_data,output_folder, truncate0_jsonpath, - clause_path) - # 等待执行结果 - final_qualify_json = future_qualification.result() - form_response_dict = future_form_response.result() - form_response_dict.update(final_qualify_json) - return {"资格审查":form_response_dict} +def combine_review_standards(evaluation_method, qualification_path, output_folder, tobidders_notice_table, clause_path, + invalid_path, merged_baseinfo_path): + """ + 结合评审标准,包括形式评审、响应评审、资格评审及申请人资格要求。 + + 参数: + evaluation_method (str): 评标办法文件路径。 + qualification (str): 资格文件路径。 + output_folder (str): 输出文件夹路径。 + tobidders_notice_table (str): JSON截断路径。 + clause_path (str): 条款路径。 + invalid_path (str): 无效文件路径。 + merged_baseinfo_path (str): 合并基础信息路径。 + notice_path (str): 通知文件路径。 + + 返回: + dict: 包含资格审查和申请人资格要求的合并结果。 + """ + # 上传评标办法前附表并获取文件ID + file_id = upload_file(evaluation_method) # 评标办法前附表 + + first_query=""" + 该文档中是否说明了符合性审查标准?说明了就回答'是',否则回答'否',请以json格式给我返回结果,键名分别是'符合性审查',键值仅限于'是','否'。注意:它与形式、响应性评审是对立的,也就是说只要文档中描述了形式、响应性评审,那么符合性审查的键值一定是'否'。以下为输出示例: + { + "符合性审查":"是" + } + """ + first_res=clean_json_string(qianwen_long(file_id,first_query)) + if first_res.get("符合性审查") == "是": + print("new 资格审查") + paths=[qualification_path,evaluation_method] + output_path=os.path.join(output_folder,"merged_qualification.pdf") + merge_pdfs(paths,output_path) + final_result=combine_qualification_review(invalid_path,output_path,merged_baseinfo_path) + else: + tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx + truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_folder) # 投标人须知前附表docx->json + # 定义用户查询,提取形式评审标准、响应性评审标准和资格评审标准 + user_query_1 = """ + 根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。 + """ + # 执行查询并提取内容 + results = qianwen_long(file_id, user_query_1) + original_dict_data = extract_content_from_json(results) + + # 提取资格评审标准 + qualification_review = original_dict_data.pop('资格评审标准', {}) # qianwen_long有关资格评审的内容 + + # 初始化 ThreadPoolExecutor,设定最多三个线程以处理三个任务 + with ThreadPoolExecutor(max_workers=3) as executor: + # 提交任务并建立任务名到Future的映射 + futures = { + "资格审查": executor.submit( + process_qualification, + qualification_review, + qualification_path, + invalid_path, + merged_baseinfo_path + ), + "形式及响应性审查": executor.submit( + process_reviews, + original_dict_data, + output_folder, + truncate_jsonpath, + clause_path + ), + "申请人资格要求": executor.submit( + process_notice, + merged_baseinfo_path + ) + } + # 定义所需的顺序 + desired_order = ["申请人资格要求", "资格审查", "形式及响应性审查"] + # 初始化结果字典 + combined_results = {} + # 按指定顺序收集结果 + for key in desired_order: + future = futures.get(key) + if future: + try: + result = future.result() + if isinstance(result, dict): + combined_results.update(result) + else: + combined_results[key] = result + except Exception as e: + print(f"处理 '{key}' 时出错: {e}") + combined_results[key] = "处理失败" + else: + combined_results[key] = "未提交任务" + # 将各部分结果合并到最终的资格审查字典中 + final_result = {"资格审查": combined_results} + return final_result + if __name__ == "__main__": - start_time=time.time() - evaluation_method = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_evaluation_method.pdf" - qualification="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" - output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21" + start_time = time.time() + evaluation_method = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_evaluation_method.pdf" + qualification_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_qualification.pdf" + output_folder = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp" # knowledge_name="zbtest20" clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\clause1.json" - truncate0_jsonpath = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\truncate_output.json" - invalid_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" - merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" - res=combine_review_standards(evaluation_method,qualification, output_folder,truncate0_jsonpath,clause_path,invalid_path,merged_baseinfo_path) - print(json.dumps(res,ensure_ascii=False,indent=4)) - end_time=time.time() - print("elapsed time:"+str(end_time-start_time)) \ No newline at end of file + tobidders_notice_table = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_tobidders_notice_table.pdf" + + invalid_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_invalid.pdf" + merged_baseinfo_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标\tmp\\HBDL-2024-0017-001-招标文件_merged_baseinfo.pdf" + res = combine_review_standards(evaluation_method, qualification_path, output_folder, tobidders_notice_table, clause_path, + invalid_path, merged_baseinfo_path) + print(json.dumps(res, ensure_ascii=False, indent=4)) + end_time = time.time() + print("elapsed time:" + str(end_time - start_time)) diff --git a/flask_app/main/资格评审.py b/flask_app/main/资格评审.py index 4e3f03c..029e07e 100644 --- a/flask_app/main/资格评审.py +++ b/flask_app/main/资格评审.py @@ -123,17 +123,17 @@ def get_all_dict(invalid_path, ques=None): return {'资格评审': qualification_combined_res} -def process_qualification(qualification_review, qualification, invalid_path, merged_baseinfo_path): +def process_qualification(qualification_review, qualification_path, invalid_path, merged_baseinfo_path): # 资格评审 matching_keys_list, non_matching_dict = extract_matching_keys_qual( qualification_review) # matching_keys_list:['资质条件', '财务状况'] non_matching_dict:{'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'} if not matching_keys_list: if not non_matching_dict: # 古法提取 non_matching_dict和matching_keys_list都为空 - if qualification != "": # 提取到资格审查附件的情况 + if qualification_path != "": # 提取到资格审查附件的情况 print("资格评审: type1") matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"] ques = generate_qual_question(matching_keys_list) - file_id2 = upload_file(qualification) + file_id2 = upload_file(qualification_path) results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long res_list = [clean_json_string(res) for _, res in results2] if results2 else [] if res_list: @@ -161,7 +161,7 @@ def process_qualification(qualification_review, qualification, invalid_path, mer else: return new_non_matching_json or {"资格评审": ""} - elif matching_keys_list and qualification == "": # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' + elif matching_keys_list and qualification_path == "": # 这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表' # print("资格评审: type4") # target=["资质","业绩","财务","信誉","人员","项目经理","负责人","联合体"] # question_template="该招标文件中{key}的内容是怎样的?请你以json格式返回结果,键名为{key},若存在嵌套内容,嵌套键名为你对相应要求的总结,而对应键值需要完全与原文保持一致,不要擅自总结、删减。" @@ -190,7 +190,7 @@ def process_qualification(qualification_review, qualification, invalid_path, mer else: # 大多数情况 print("资格评审: type5") user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’ - file_id2 = upload_file(qualification) + file_id2 = upload_file(qualification_path) results2 = multi_threading(user_querys, "", file_id2, 2) # 资格评审表,调用qianwen-long res_list = [] if not results2: @@ -212,12 +212,12 @@ if __name__ == "__main__": # qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年(2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年(2017 年至今)须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'} qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证', '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'} - qualification= "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" + qualification_path= "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf" # output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test" invalid_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_invalid.pdf" merged_baseinfo_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_merged_baseinfo.pdf" # knowledge_name = "招标解析word13" - res = process_qualification(qualification_review, qualification, invalid_path, merged_baseinfo_path) + res = process_qualification(qualification_review, qualification_path, invalid_path, merged_baseinfo_path) print(json.dumps(res, ensure_ascii=False, indent=4)) # 该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。 diff --git a/flask_app/main/资格评审new.py b/flask_app/main/资格评审new.py new file mode 100644 index 0000000..218d1f8 --- /dev/null +++ b/flask_app/main/资格评审new.py @@ -0,0 +1,120 @@ +from concurrent.futures import ThreadPoolExecutor, as_completed +from flask_app.general.通义千问long import upload_file, qianwen_long +from flask_app.general.json_utils import clean_json_string +def combine_qualification_new(invalid_path, qualification_path,notice_path): + detailed_res = {} + # 初始化无效文件ID + invalid_file_id = None + if qualification_path: + # 上传资格文件并获取文件ID + qualification_file_id = upload_file(qualification_path) + + # 定义第一个查询,用于检查资格性审查是否存在 + first_query = """ + 该文档中是否有关于资格性审查标准的具体内容?请以json格式给出回答,外键为'资格性审查',键值仅限于'是','否',输出格式示例如下: + { + "资格性审查":"是" + } + """ + + # 执行第一个查询并清洗返回的JSON字符串 + print("call first_query") + first_res = clean_json_string(qianwen_long(qualification_file_id, first_query)) + # 判断是否存在资格性审查 + zige_file_id = qualification_file_id if first_res.get("资格性审查") == "是" else None + + # 如果需要,上传无效文件 + if zige_file_id is None: + if invalid_file_id is None: + invalid_file_id = upload_file(invalid_path) + zige_file_id = invalid_file_id + + else: + # 如果 qualification_path 为空,直接使用无效文件 + zige_file_id = upload_file(invalid_path) + + # 定义第二组查询,仅包含资格性审查 + second_query = [ + { + "key": "资格性审查", + "query": "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性审查的内容。" + } + ] + + # 定义任务函数 + def process_second_query(key, query, file_id): + print("call second_query") + try: + res = qianwen_long(file_id, query) + cleaned_res = clean_json_string(res) + return key, cleaned_res.get(key, "未找到相关内容") + except Exception as e: + print(f"执行查询 '{key}' 时出错: {e}") + return key, "查询失败" + + def process_notice(notice_path): + print("call notice_path") + try: + # 上传通知文件并获取文件ID + file_id1 = upload_file(notice_path) + + # 定义用户查询,提取申请人资格要求 + user_query1 = """ + 第一章招标公告(投标邀请书)中说明的申请人资格要求是怎样的?请以json格式给出回答,外键为'申请人资格要求',键值为字符串列表,其中每个字符串对应原文中的一条要求,你的回答与原文内容一致,不要擅自总结删减。输出格式示例如下: + { + "申请人资格要求":[ + "1.满足《中华人民共和国政府采购法》第二十二条规定;", + "1.1 法人或者其他组织的营业执照等证明文件,如供应商是自然人的提供身份证明材料;", + "2.未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、重大税收违法案件当事人名单;" + ] + } + """ + + # 执行查询并清洗结果 + res1 = clean_json_string(qianwen_long(file_id1, user_query1)) + + # 提取申请人资格要求 + requirements = res1.get("申请人资格要求", "未找到相关内容") + return "申请人资格要求", requirements + except Exception as e: + print(f"处理申请人资格要求时出错: {e}") + return "申请人资格要求", "处理失败" + + # 初始化 ThreadPoolExecutor + with ThreadPoolExecutor(max_workers=2) as executor: + future_to_key = {} + + # 提交第二组查询 + for query_info in second_query: + key = query_info["key"] + query = query_info["query"] + current_file_id = zige_file_id + future = executor.submit(process_second_query, key, query, current_file_id) + future_to_key[future] = key + + # 有条件地提交通知处理 + if notice_path: + future = executor.submit(process_notice, notice_path) + future_to_key[future] = "申请人资格要求" + else: + future = executor.submit(process_notice, invalid_path) + future_to_key[future] = "申请人资格要求" + + # 收集结果(按完成顺序) + for future in as_completed(future_to_key): + key, result = future.result() + detailed_res[key] = result + + # 定义所需的顺序 + desired_order = ["申请人资格要求", "资格性审查"] + # print(json.dumps(detailed_res,ensure_ascii=False,indent=4)) + # 创建一个新的有序字典 + ordered_res = {} + for key in desired_order: + if key in detailed_res: + ordered_res[key] = detailed_res[key] + + # 最终处理结果,例如打印或保存 + return {"资格审查": ordered_res} + + diff --git a/flask_app/old_version/形式响应评审old.py b/flask_app/old_version/形式响应评审old.py index 46e1979..193dc78 100644 --- a/flask_app/old_version/形式响应评审old.py +++ b/flask_app/old_version/形式响应评审old.py @@ -7,7 +7,7 @@ from flask_app.general.多线程提问 import multi_threading from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.general.json_utils import extract_content_from_json from flask_app.main.截取pdf import truncate_pdf_main -from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.提取json工程标版 import convert_clause_to_json prompt = """ # 角色 你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 diff --git a/flask_app/old_version/招标文件解析.py b/flask_app/old_version/招标文件解析.py index 5ae1c2b..5b1eee0 100644 --- a/flask_app/old_version/招标文件解析.py +++ b/flask_app/old_version/招标文件解析.py @@ -6,7 +6,7 @@ from concurrent.futures import ThreadPoolExecutor from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.main.table_content_extraction import extract_tables_main from flask_app.old_version.文档理解大模型版知识库处理.知识库操作 import addfileToKnowledge, deleteKnowledge -from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice diff --git a/flask_app/old_version/资格审查模块old.py b/flask_app/old_version/资格审查模块old.py index e30cac4..4f640f2 100644 --- a/flask_app/old_version/资格审查模块old.py +++ b/flask_app/old_version/资格审查模块old.py @@ -1,6 +1,6 @@ import os -from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json +from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import extract_content_from_json from flask_app.old_version.形式响应评审old import process_reviews from flask_app.old_version.资格评审old import process_qualification diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index 2c8d496..b921d02 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -240,12 +240,12 @@ def get_technical_requirements(file_id,invalid_path): """根据所有键是否已添加处理技术要求""" # 更新原始采购需求字典 - update_res=combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res) + final_res=combine_and_update_results(cleaned_res['采购需求'], technical_requirements_combined_res) # final_res = postprocess(cleaned_res) - update_res["货物列表"] = good_list + final_res["货物列表"] = good_list # 输出最终的 JSON 字符串 - return {"采购需求": update_res} + return {"采购需求": final_res} def test_all_files_in_folder(input_folder, output_folder): # 确保输出文件夹存在 diff --git a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py index f5ba1b2..ee357fa 100644 --- a/flask_app/货物标/投标人须知正文提取指定内容货物标版.py +++ b/flask_app/货物标/投标人须知正文提取指定内容货物标版.py @@ -99,7 +99,6 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type): if clause_path: with open(clause_path, 'r', encoding='utf-8') as file: data = json.load(file) - # 提取目标部分 extracted_data = extract_between_sections(data, target_values) # 读取json,截取大标题之间的内容 diff --git a/flask_app/货物标/提取json货物标版.py b/flask_app/货物标/提取json货物标版.py new file mode 100644 index 0000000..3beec2c --- /dev/null +++ b/flask_app/货物标/提取json货物标版.py @@ -0,0 +1,194 @@ +import json +import docx +import re +import os +from flask_app.general.投标人须知正文条款提取成json文件 import parse_text_by_heading, extract_text_from_pdf + +#fitz库版本 +# def extract_text_from_pdf(file_path, start_word, end_pattern): +# # 从PDF文件中提取文本 +# common_header = extract_common_header(file_path) +# doc = fitz.open(file_path) +# all_pages_text = [] +# start_index = None +# +# # 处理所有页面 +# for i in range(len(doc)): +# page = doc[i] +# page_text = page.get_text() +# cleaned_text = clean_page_content(page_text, common_header) +# print(cleaned_text) +# print("yes") +# # 在第一页查找开始位置 +# if i == 0 and start_index is None: +# start_match = re.search(start_word, cleaned_text, re.MULTILINE) +# if start_match: +# start_index = start_match.start() +# cleaned_text = cleaned_text[start_index:] +# +# # 在最后一页查找结束位置 +# if i == len(doc) - 1: +# for pattern in end_pattern: +# matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE)) +# if matches: +# end_index = matches[-1].start() +# cleaned_text = cleaned_text[:end_index] +# break +# +# all_pages_text.append(cleaned_text) +# +# # 合并所有页面的文本 +# full_text = "\n".join(all_pages_text) +# # 关闭文档 +# doc.close() +# +# return full_text + + + + + + +#type=2时提取货物标的第一章招标公告时采用该逻辑 +def parse_text_to_dict(text): + """ + 解析文本,根据大标题划分内容,生成字典。 + + 参数: + text (str): 要解析的文本。 + + 返回: + dict: 大标题作为键,内容作为值的字典。 + """ + # 正则表达式模式:匹配以一至十的汉字数字开头,后跟顿号和任意字符,且位于行首 + pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE) + + # 使用 re.finditer 找到所有大标题的位置 + matches = list(pattern.finditer(text)) + + result = {} + for i, match in enumerate(matches): + title = match.group(1).strip() # 获取大标题文本 + start = match.end() # 内容的起始位置 + + if i + 1 < len(matches): + end = matches[i + 1].start() # 下一个大标题的起始位置 + else: + end = len(text) # 最后一个大标题,内容到文本末尾 + + content = text[start:end].strip() # 获取内容并去除前后空白 + # 规范化换行符,并移除每行开头和结尾的空白 + content = content.replace('\r\n', '\n') # 统一换行符 + content = re.sub(r'[ \t]+\n', '\n', content) # 移除行尾空白 + content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE) # 移除行首和行尾空白 + content = clean_content(content) # 处理内容中的换行符 + result[title] = content + + return result +def clean_content(content): + """ + 处理内容中的换行符: + - 保留在子项编号前的换行符。 + - 保留在冒号 ':' 或全角冒号 ':' 前的第一个换行符。 + - 移除其他位置的换行符,不留下额外的空格。 + + 参数: + content (str): 要处理的内容字符串。 + + 返回: + str: 处理后的内容字符串。 + """ + # 定义子项编号的正则模式,包括: + # - 数字+点号+数字(如 1.1 或 1.1) + # - 数字+顿号(如 2、) + # - 点号+数字(如 .3 或 .3) + # - 数字+右括号(如 1) 或 1)) + # - 圆括号包围的数字(如 (5)) + # - 全角圆括号包围的数字(如 (5)) + # - 数字+点号(如 1. 或 1.) + numbering_pattern = r'(?:\d+[..]\d+(?:[..]\d+)*|\d+、|[..]\d+|\d+[))]|\(\d+\)|(\d+)|\d+[..])' + + # 定义需要保留换行符的情况: + # 1. 换行符前面是子项编号 + # 2. 换行符前面是任意非空白字符,且后面跟着 ':' 或 ':' + pattern_keep = r'\n(?=(?:' + numbering_pattern + r'|[^:\n\r\f\v]+[::]))' + + # 定义占位符,用于暂时替换需要保留的换行符 + placeholder = "___PLACEHOLDER___" + + # Step 1: 将需要保留的换行符替换为占位符 + content_with_placeholder = re.sub(pattern_keep, placeholder, content) + + # Step 2: 移除所有剩余的换行符 + content_no_newlines = content_with_placeholder.replace('\n', '') + + # Step 3: 将占位符替换回换行符 + cleaned_content = content_no_newlines.replace(placeholder, '\n') + + return cleaned_content + +#如果file_path为空,返回"" +def convert_clause_to_json(file_path,output_folder,type=1,suffix_counter="1.json"): + if not os.path.exists(file_path): + print(f"The specified file does not exist: {file_path}") + return "" + if type == 1: + start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)' + end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$' + else: + start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$' + end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' + if file_path.endswith('.pdf'): + text = extract_text_from_pdf(file_path, start_word, end_pattern) + else: + raise ValueError("Unsupported file format") + parsed_data = parse_text_by_heading(text) + # result = convert_to_json(input_path, start_word, end_pattern) + # 检查输出文件夹是否存在,如果不存在则创建 + if not os.path.exists(output_folder): + os.makedirs(output_folder) + print(f"Created output folder: {output_folder}") + file_name = "clause1.json" if type == 1 else "clause2.json" + # file_name = f"clause{suffix_counter}.json" + output_path = os.path.join(output_folder, file_name) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(parsed_data, f, indent=4, ensure_ascii=False) + print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.") + return output_path + +#批量转换,测试时使用 +def process_folder(input_folder, output_folder): + # 获取输入文件夹中的所有文件,过滤掉不以 'part2' 结尾的文件 + files = [f for f in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, f)) and f.endswith('part2.pdf')] + + # 遍历文件并对每个文件进行处理 + for file_name in files: + file_path = os.path.join(input_folder, file_name) + # 去掉文件扩展名 + file_name_without_extension = os.path.splitext(file_name)[0] + try: + # 调用 convert_clause_to_json,传递文件路径、输出文件夹和递增的后缀 + output_path = convert_clause_to_json(file_path, output_folder, 1, file_name_without_extension) + print(f"Processed file: {file_name}, JSON saved to: {output_path}") + except ValueError as e: + print(f"Error processing {file_name}: {e}") + +#TODO:'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf' PYPDF2库读取有遗漏 +#TODO: 投标人须知正文这块,序号可能是乱序的,或许可以删除判断序号大小的逻辑,只要出现在开头的序号就作为新的键 eg:2-招标文件。目前将这种情况当特殊处理 + +if __name__ == "__main__": + # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' + file_path=r'C:\Users\Administrator\Desktop\招标文件\output3\zb_tobidders_notice.pdf' + # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' + output_folder = r'C:\Users\Administrator\Desktop\招标文件\output3\tmp' + try: + output_path = convert_clause_to_json(file_path,output_folder,1) + print(f"Final JSON result saved to: {output_path}") + except ValueError as e: + print("Error:", e) + + # input_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4' + # output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1' + # + # # 调用 process_folder 来处理整个文件夹中的所有文件 + # process_folder(input_folder, output_folder) diff --git a/flask_app/货物标/货物标解析main.py b/flask_app/货物标/货物标解析main.py index 1b3aa7f..2442e87 100644 --- a/flask_app/货物标/货物标解析main.py +++ b/flask_app/货物标/货物标解析main.py @@ -8,7 +8,7 @@ from flask_app.货物标.投标人须知正文提取指定内容货物标版 imp from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple from concurrent.futures import ThreadPoolExecutor import concurrent.futures -from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json +from flask_app.货物标.提取json货物标版 import convert_clause_to_json from flask_app.货物标.无效标和废标和禁止投标整合main import combine_find_invalid from flask_app.货物标.资格审查main import combine_qualification_review from flask_app.货物标.评分标准提取main import combine_evaluation_standards diff --git a/flask_app/货物标/资格审查main.py b/flask_app/货物标/资格审查main.py index e868769..87ddff0 100644 --- a/flask_app/货物标/资格审查main.py +++ b/flask_app/货物标/资格审查main.py @@ -8,7 +8,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from flask_app.general.通义千问long import upload_file, qianwen_long from flask_app.general.多线程提问 import multi_threading from flask_app.general.json_utils import clean_json_string -from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json +from flask_app.货物标.提取json货物标版 import convert_clause_to_json import copy import concurrent.futures # 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据 @@ -445,7 +445,7 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path): # 定义用户查询,提取申请人资格要求 user_query1 = """ - 该文档中说明的申请人资格要求是怎样的?请以json格式给出回答,外键为'申请人资格要求',键值为字符串列表,其中每个字符串对应原文中的一条要求,你的回答与原文内容一致,不要擅自总结删减。输出格式示例如下: + 第一章招标公告(投标邀请书)中说明的申请人资格要求是怎样的?请以json格式给出回答,外键为'申请人资格要求',键值为字符串列表,其中每个字符串对应原文中的一条要求,你的回答与原文内容一致,不要擅自总结删减。输出格式示例如下: { "申请人资格要求":[ "1.满足《中华人民共和国政府采购法》第二十二条规定;", @@ -454,10 +454,8 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path): ] } """ - # 执行查询并清洗结果 res1 = clean_json_string(qianwen_long(file_id1, user_query1)) - # 提取申请人资格要求 requirements = res1.get("申请人资格要求", "未找到相关内容") return "申请人资格要求", requirements @@ -478,9 +476,12 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path): future_to_key[future] = key # 有条件地提交通知处理 - if qualification_path and notice_path and first_res.get("资格性审查") == "是": + if notice_path: future = executor.submit(process_notice, notice_path) future_to_key[future] = "申请人资格要求" + else: + future=executor.submit(process_notice,invalid_path) + future_to_key[future] = "申请人资格要求" # 收集结果(按完成顺序) for future in as_completed(future_to_key): @@ -585,17 +586,17 @@ if __name__ == "__main__": start_time=time.time() # qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf" # output_folder = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89" - output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5" + output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp" # qualification_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_qualification1.pdf" # qualification_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_qualification2.pdf" - qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf" + qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件(统计局智能终端二次招标)_qualification1.pdf" # notice_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_notice.pdf" # notice_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_notice.pdf" - notice_path="C:\\Users\\Administrator\\Desktop\\货物标\\output5\\6.2定版视频会议磋商文件_notice.pdf" + notice_path="C:\\Users\\Administrator\\Desktop\\货物标\\output5\\2-招标文件(统计局智能终端二次招标)_notice.pdf" # knowledge_name = "6.2视频会议docx" # invalid_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf" # invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile.pdf" - invalid_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf" + invalid_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件(统计局智能终端二次招标).pdf" res = combine_qualification_review(invalid_path, qualification_path, notice_path) print(json.dumps(res, ensure_ascii=False, indent=4)) end_time=time.time()