From c53fd32b859f20fb8cdf8846a37aba48effe55f0 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 10 Oct 2024 11:52:37 +0800 Subject: [PATCH] =?UTF-8?q?10.10=E6=97=A0=E6=95=88=E6=8A=95=E6=A0=87bug?= =?UTF-8?q?=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/main/docx截取docx.py | 4 +- flask_app/main/table_content_extraction.py | 32 +- flask_app/main/test.py | 235 +++++------- flask_app/main/商务标技术标整合.py | 3 +- flask_app/main/无效标和废标和禁止投标整合.py | 20 +- flask_app/main/禁止投标情形.py | 2 +- flask_app/main/读取文件/split_test2.py | 98 +++++ .../无效标和废标和禁止投标整合货物标版.py | 354 ++++++++++++++++++ 8 files changed, 581 insertions(+), 167 deletions(-) create mode 100644 flask_app/main/读取文件/split_test2.py create mode 100644 flask_app/货物标/无效标和废标和禁止投标整合货物标版.py diff --git a/flask_app/main/docx截取docx.py b/flask_app/main/docx截取docx.py index 92a73bc..cf8ea8a 100644 --- a/flask_app/main/docx截取docx.py +++ b/flask_app/main/docx截取docx.py @@ -44,9 +44,11 @@ def copy_docx(source_path): new_doc.save(destination_path) # 保存新文档 print("docx截取docx成功!") + return destination_path # 调用函数 if __name__ == '__main__': source_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\ztbfile.docx" - copy_docx(source_path) + res=copy_docx(source_path) + print(res) \ No newline at end of file diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py index 104681b..fecf0a5 100644 --- a/flask_app/main/table_content_extraction.py +++ b/flask_app/main/table_content_extraction.py @@ -106,7 +106,35 @@ def extract_tables_main(path, output_folder): return save_data_to_json(flattened_data, output_folder) +#货物标批量生成truncate_output.json +def process_all_part1_pdfs(folder_path, output_folder): + """遍历指定文件夹中的所有以part1.docx结尾的文件,执行extract_tables_main操作并创建子文件夹.""" + if not os.path.exists(folder_path): + print(f"指定的文件夹不存在: {folder_path}") + return + + for root, dirs, files in os.walk(folder_path): + for file in files: + if file.endswith("part1.docx"): + file_path = os.path.join(root, file) + print(f"正在处理文件: {file_path}") + + # 获取文件名(去除扩展名) + base_filename = os.path.splitext(file)[0] + + # 在output_folder中创建对应的子文件夹 + subfolder_path = os.path.join(output_folder, base_filename) + os.makedirs(subfolder_path, exist_ok=True) + + # 调用extract_tables_main,将子文件夹作为output_folder传递 + extract_tables_main(file_path, subfolder_path) + if __name__ == "__main__": - path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part1.docx' + path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part1.docx' output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp" # 前附表json文件 - extract_tables_main(path, output_folder) + res=extract_tables_main(path, output_folder) + print(res) + # + # folder_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4' + # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2" + # process_all_part1_pdfs(folder_path, output_folder) \ No newline at end of file diff --git a/flask_app/main/test.py b/flask_app/main/test.py index 122f29c..7d9d898 100644 --- a/flask_app/main/test.py +++ b/flask_app/main/test.py @@ -1,166 +1,99 @@ +def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): + from collections import OrderedDict + from docx import Document + import re -import json + if isinstance(keywords, str): + keywords = [keywords] -data={ - "技术评分": { - "工程概况": { - "评分": "1分", - "要求": "工程主要情况、各专业设计简介、工程施工条件描述准确清晰 1分;基本准确 0.5-0.9分,不准确得 0分。" - }, - "施工部署": { - "评分": "3分", - "要求": "工程施工目标、主要施工内容、施工流水段划分、施工的重点和难点分析、工程管理的组织机构形式、项目经理部的工作岗位设置及其职责划分、新技术、新工艺部署及其技术和管理要求(如有)、主要分包工程施工单位的选择要求及管理方式(如有)等各项安排科学、合理、针对性强 2.8-3分;合理、可行 2.1-2.7分;欠合理,基本可行 1-2.0分;不可行,不能满足工程需要得 0分。" - }, - "施工进度计划": { - "评分": "3分", - "要求": "网络图或横道图安排的进度计划科学、合理、针对性强 2.8-3分;合理、可行 1.6-2.7分;欠合理,基本可行 1-1.5分;不可行,不能满足招标文件要求 0分。" - }, - "施工准备与资源配置计划": { - "评分": "2分", - "要求": "技术准备、现场准备和资金准备;资源配置计划包括劳动力配置计划、主要工程材料和设备配置计划、主要周转材料和施工机具配置计划等内容完备,合理、针对性强 1.9-2分;内容完备,可行 1.3-1.8分;内容欠完备,基本可行 0.8-1.2分;不可行 0分。" - }, - "主要施工方案": { - "评分": "5分", - "要求": "主要分部、分项工程施工方案合理、可行;脚手架工程、起重吊装工程、临时用水用电工程、季节性施工等专项工程的施工方案(如有)有必要的验算和说明;对易发生质量通病、易出现安全问题、施工难度大、技术含量高的分项工程(工序)等有重点说明。科学、合理、针对性强 4.6-5分;合理、可行 3.1-4.5;欠合理,基本可行 2.0-3.0;不可行,不能满足工程需要 0分。" - }, - "施工现场平面布置": { - "评分": "2分", - "要求": "拟建建(构)筑物的位置轮廓、尺寸、层数;加工设施、存储设施、办公和生活用房的位置和面积;布置垂直运输设施、供电设施、供水、供热设施、排水排污设施和临时施工道路;必备的安全、消防、保卫和环境保护设施;相邻的地上地下既有建(构)筑物及相关环境等要素齐全,现场布置合理 1.9-2分;现场布置可行 1.3-1.8分;现场布置基本可行 0.8-1.2分;现场布置不可行 0分。" - }, - "主要施工管理计划": { - "评分": "4分", - "要求": "进度管理计划、质量管理计划、安全管理计划、环境管理计划、成本管理计划、治安保卫管理计划、合同管理计划,组织协调管理计划、成品保护管理计划、质量保修管理计划、人力资源、施工机具、材料设备等管理计划内容完备,合理、针对性强 3.5-4分;内容基本完备,可行 2.7-3.4分;内容欠完备,基本可行 2.0-2.6分;不可行 0分。" - } - }, - "商务评分": { - "项目经理资格": { - "学历": { - "评分": "1分", - "要求": "大学专科及以上 1分;中专 0.5分;其他 0分。" - } - }, - "技术负责人资格": { - "职称": { - "评分": "1分", - "要求": "相关专业中级职称(工程师) 1分;其他 0分。" - }, - "学历": { - "评分": "1分", - "要求": "大学专科及以上 1分;中专 0.5分;其他 0分。" - } - }, - "其他主要人员": { - "评分": "2分", - "要求": "人员配备合理,满足需要 1.6-2分;人员配备基本合理,基本满足需要 0.1-1.5分;人员配备不合理 0分。" - }, - "认证体系": { - "评分": "3分", - "要求": "投标人具有有效的 IS09001(GB/T19001)质量管理体系认证证书、IS014001(GB/T24001)环境管理体系认证证书、OHSAD18001(GB/T28001)职业健康安全管理体系认证证书的,一项得 1分,三个得 3分(须提供认证证书的扫描件或复印件及在“全国认证认可信息公共服务平台”查询结果截图,未提供或不符合要求不得分。)" - }, - "工程奖项": { - "评分": "3分", - "要求": "投标人企业近三年(2018-2020)连续获得过县级及以上行政主管部门表彰的得 3分;其他不得分。" - }, - "信用": { - "评分": "1分", - "要求": "在信用中国查询未受到行政监督部门行政处罚的得 1分,被行政监督部门作出行政处罚的得 0分。提供网上查询截图(以开标当天“信用中国”网站查询为准),否则不得分。(行政处罚是指通过在“信用中国”查询投标人有行政处罚记录信息且在公示期内)" - } - }, - "投标报价评审标准": { - "有效投标报价确定": { - "评分": "未知", - "要求": "低于招标控制价。" - }, - "有效投标报价的排序": { - "评分": "未知", - "要求": "有效投标报价由低到高排序。" - }, - "投标文件基础数据分析和整理": { - "评分": "未知", - "要求": "清标报告。" - }, - "投标报价成本分析": { - "评分": "未知", - "要求": "清标报告。" - }, - "投标竞争下浮率": { - "评分": "未知", - "要求": "取值:1%(投标竞争下浮率的值为整数,具体由招标人结合项目特点和需要在招标文件中明确)。" - }, - "投标报价最终排序": { - "评分": "未知", - "要求": "有效投标报价由低到高排序。" - }, - "投标报价不合格的情形": { - "评分": "未知", - "要求": "投标报价不合格的情形仅限于投标报价高于最高投标限价或低于成本的情形。投标报价文件评定为“合格”的,评标委员会应列出报价组成不合理、不平衡报价、签订合同前应注意和澄清的事项。评标委员会在不改变投标人投标文件实质性内容的前提下,对投标文件已标价工程量清单进行符合性检查、算术错误修正,对其报价进行合理性分析,投标报价一致或呈规律性差异,评标委员会应当进行比较分析,判定应当否决投标,并形成清标报告。" - } - } -} + doc = Document(doc_path) + extracted_paragraphs = OrderedDict() + continue_collecting = False + current_section_pattern = None + active_key = None + def match_keywords(text, patterns): + return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) -def remove_unknown_scores(data): - if isinstance(data, dict): - return { - k: remove_unknown_scores(v) - for k, v in data.items() - if not (k == "评分" and v in ["未知", "/", ""]) - } - elif isinstance(data, list): - return [remove_unknown_scores(item) for item in data] - else: - return data + def extract_from_text(text, current_index): + nonlocal continue_collecting, current_section_pattern, active_key + if text == "": + return current_index + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + active_key = None + else: + if active_key is not None: + extracted_paragraphs[active_key].append(text) + return current_index -def combine_technical_and_business(data, target_values): - data = remove_unknown_scores(data) - extracted_data = {} # 根级别存储所有数据 - technical_found = False - business_found = False + if match_keywords(text, keywords): + active_key = text + extracted_paragraphs[active_key] = [text] + if match_keywords(text, follow_up_keywords): + continue_collecting = True + section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text) + if section_number: + current_section_number = section_number.group(1) + level_count = current_section_number.count('.') - def extract_nested(data, parent_key='', is_technical=False, is_business=False): - nonlocal technical_found, business_found - if isinstance(data, dict): - for key, value in data.items(): - current_key = f"{parent_key}.{key}" if parent_key else key + # Pattern to match current level, e.g., 3.4.5 + pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+' - # 检查是否为技术标的内容 - if any(target in key for target in target_values): - if not is_technical: - extracted_data[key] = value - technical_found = True - continue + # Generate patterns for next section at same level and parent level + parts = current_section_number.split('.') + matched_patterns = [pattern] # start with the full pattern - # 默认其他所有内容都归为商务标 - else: - if not is_business: - if '商务评分' not in extracted_data: - extracted_data['商务评分'] = {} - extracted_data['商务评分'][key] = value - business_found = True - continue + # Next section at same level + parts[-1] = str(int(parts[-1]) + 1) + next_pattern = r'^' + r'\s*\.\s*'.join(parts) + matched_patterns.append(next_pattern) - if isinstance(value, dict) or isinstance(value, list): - extract_nested(value, current_key, is_technical, is_business) + # Parent section (if applicable) + if len(parts) > 1: + parent_section_parts = parts[:-1] + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) + parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) + matched_patterns.append(parent_pattern) - elif isinstance(data, list): - for index, item in enumerate(data): - extract_nested(item, f"{parent_key}[{index}]", is_technical, is_business) + # Combine the patterns + combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' + current_section_pattern = re.compile(combined_pattern) - extract_nested(data) + else: + found_next_number = False + current_section_pattern = None - if not technical_found: - extracted_data['技术评分'] = '' - if not business_found: - extracted_data['商务评分'] = '' + while current_index < len(doc.paragraphs) - 1: + current_index += 1 + next_text = doc.paragraphs[current_index].text.strip() + if not found_next_number: + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) + if next_section_number: + found_next_number = True + if next_section_number.group(1): + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' + r'\.'.join( + [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + elif next_section_number.group(2): + dynamic_pattern = r'^[\(\(]\d+[\)\)]' + current_section_pattern = re.compile(dynamic_pattern) + if current_section_pattern and re.match(current_section_pattern, next_text): + extracted_paragraphs[active_key].append(next_text) + else: + continue_collecting = False + active_key = None + break - return extracted_data + return current_index -target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"] -evaluation_standards_res=combine_technical_and_business(data,target_values1) -# 从结果中提取"商务标"和"技术标" -technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} -commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} - # 返回技术标和商务标 -print(json.dumps(technical_standards,ensure_ascii=False,indent=4)) -print(json.dumps(commercial_standards, ensure_ascii=False, indent=4)) \ No newline at end of file + index = 0 + while index < len(doc.paragraphs): + index = extract_from_text(doc.paragraphs[index].text.strip(), index) + index += 1 + + return extracted_paragraphs + +doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).docx' diff --git a/flask_app/main/商务标技术标整合.py b/flask_app/main/商务标技术标整合.py index 608957c..1d1ede1 100644 --- a/flask_app/main/商务标技术标整合.py +++ b/flask_app/main/商务标技术标整合.py @@ -125,7 +125,6 @@ def combine_evaluation_standards(truncate2): ) evaluation_res = qianwen_long(file_id, user_query_2) - print(evaluation_res) target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"] # target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉'] # update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2) @@ -134,7 +133,7 @@ def combine_evaluation_standards(truncate2): # return evaluation_combined_res return update_json #商务标技术标整合 if __name__ == "__main__": - truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\test4\\招标02_evaluation_method.pdf" + truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1_evaluation_method.pdf" evaluation_standards_res=combine_evaluation_standards(truncate2) # 从结果中提取"商务标"和"技术标" technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} diff --git a/flask_app/main/无效标和废标和禁止投标整合.py b/flask_app/main/无效标和废标和禁止投标整合.py index 87e539b..77a4b65 100644 --- a/flask_app/main/无效标和废标和禁止投标整合.py +++ b/flask_app/main/无效标和废标和禁止投标整合.py @@ -154,9 +154,9 @@ def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达 all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表 else: - print(text_list) + # print(text_list) new_text_list=preprocess_text_list(text_list) - print(new_text_list) + # print(new_text_list) pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)' data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 # 将修改后的第一个元素和剩余的元素连接起来 @@ -268,10 +268,10 @@ def extract_values_if_contains(data, includes): def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] - extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果 + extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #提取正文(除表格) # print(extracted_contents) all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 - all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) + all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) #提取表格数据(json_data) qianwen_txt = all_texts1 + all_tables1 # Proceed only if there is content to write if qianwen_txt: @@ -343,13 +343,13 @@ def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path,t if __name__ == '__main__': start_time = time.time() - truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\bed51bd0-a9d5-4ceb-9b6d-170da2aac3bd\\truncate_output.json" - clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\bed51bd0-a9d5-4ceb-9b6d-170da2aac3bd\\clause1.json" - truncate3="C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\bed51bd0-a9d5-4ceb-9b6d-170da2aac3bd\\ztbfile_qualification.pdf" - output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\zbout" + truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json" + clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json" + truncate3="C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf" + output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout" # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' - doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\bed51bd0-a9d5-4ceb-9b6d-170da2aac3bd\\ztbfile_invalid111.docx' + doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx' results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path,truncate3) end_time = time.time() print("Elapsed time:", str(end_time - start_time)) - print("Results:", results) + print("Results:", json.dumps(results,ensure_ascii=False,indent=4)) diff --git a/flask_app/main/禁止投标情形.py b/flask_app/main/禁止投标情形.py index 8644d82..e8af613 100644 --- a/flask_app/main/禁止投标情形.py +++ b/flask_app/main/禁止投标情形.py @@ -157,7 +157,7 @@ def process_string_list(string_list): else: # 如果没有匹配到内容,返回空列表 return [] -def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须知前附表 条款 评分前附表和资格审查表中 +def find_forbidden(truncate_json_path,clause_path,truncate3=""): #投标人须知前附表 条款 评分前附表和资格审查表中 # output_filename="merged.pdf" # paths=[truncate1,truncate4] # merged_filepath=merge_pdfs(paths,output_filename) #暂时废弃,评分前附表中的在'否决投标'中摘录了。 diff --git a/flask_app/main/读取文件/split_test2.py b/flask_app/main/读取文件/split_test2.py new file mode 100644 index 0000000..7ba052c --- /dev/null +++ b/flask_app/main/读取文件/split_test2.py @@ -0,0 +1,98 @@ +import os +import docx +from spire.doc import * +from spire.doc.common import * + + +def split_and_clean_docx(input_file, output_folder): + """ + 拆分指定的Word文档为多个节,并去除文档中的水印。 + + 参数: + input_file (str): 需要拆分的源Word文档路径。 + output_folder (str): 拆分后的文档输出目录。 + """ + + # 检查输出目录是否存在,如果不存在则创建 + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + # 加载源文档 + with Document() as document: + document.LoadFromFile(input_file) + + # 遍历文档中的所有节 + for sec_index in range(document.Sections.Count): + # 访问当前节 + section = document.Sections[sec_index] + + # 为当前节创建一个新文档 + with Document() as new_document: + # 将当前节复制到新文档 + new_document.Sections.Add(section.Clone()) + + # 复制源文档的主题和样式到新文档以确保格式一致 + document.CloneThemesTo(new_document) + document.CloneDefaultStyleTo(new_document) + + # 将新文档保存为单独的文件 + output_file = os.path.join(output_folder, f"节{sec_index + 1}.docx") + new_document.SaveToFile(output_file, FileFormat.Docx2016) + + # 去除水印 + remove_watermark(output_folder) + + print("文档拆分并去除水印完成!") + + +def remove_watermark(output_folder): + """ + 去除指定目录下所有docx文件中的水印。 + + 参数: + output_folder (str): 需要去除水印的文档所在的文件夹。 + """ + bookmark_name = "Evaluation Warning: The document was created with Spire.Doc for Python." + + for file_name in os.listdir(output_folder): + if file_name.endswith(".docx"): + doc_path = os.path.join(output_folder, file_name) + doc = docx.Document(doc_path) + # 查找并替换水印内容 + for paragraph in doc.paragraphs: + if bookmark_name in paragraph.text: + # 替换水印文本为空 + inline = paragraph.runs + for i in range(len(inline)): + if bookmark_name in inline[i].text: + inline[i].text = inline[i].text.replace(bookmark_name, "") + break + # 保存文档 + doc.save(doc_path) + +import os + +def test_split_and_clean_docx(): + """ + 测试文档拆分和水印去除功能。 + """ + input_file = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx" # 替换为你的源文档路径 + output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub" # 指定输出文件夹 + + # 执行拆分和去除水印 + split_and_clean_docx(input_file, output_folder) + + # 验证输出文件是否生成 + if os.path.exists(output_folder): + print("测试通过: 输出文件夹已生成。") + output_files = os.listdir(output_folder) + if output_files: + print(f"测试通过: 生成了 {len(output_files)} 个文档。") + else: + print("测试失败: 没有生成任何文档。") + else: + print("测试失败: 输出文件夹未生成。") + + +if __name__ == "__main__": + test_split_and_clean_docx() diff --git a/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py b/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py new file mode 100644 index 0000000..24cca7d --- /dev/null +++ b/flask_app/货物标/无效标和废标和禁止投标整合货物标版.py @@ -0,0 +1,354 @@ +# -*- coding: utf-8 -*- +import json +import os.path +import time +import re +from flask_app.main.json_utils import combine_json_results, nest_json_under_key +from flask_app.main.通义千问long import upload_file, qianwen_long +from concurrent.futures import ThreadPoolExecutor +from flask_app.main.禁止投标情形 import find_forbidden, process_string_list + + +#如果当前段落有序号,则向下匹配直接遇到相同的序号样式 +#如果当前段落无序号,则向下匹配序号,把若干同类的序号都摘出来。 + +def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): + from collections import OrderedDict + from docx import Document + import re + + if isinstance(keywords, str): + keywords = [keywords] + + doc = Document(doc_path) + extracted_paragraphs = OrderedDict() + continue_collecting = False + current_section_pattern = None + active_key = None + + def match_keywords(text, patterns): + return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns) + + def extract_from_text(text, current_index): + nonlocal continue_collecting, current_section_pattern, active_key + if text == "": + return current_index + + if continue_collecting: + if current_section_pattern and re.match(current_section_pattern, text): + continue_collecting = False + active_key = None + else: + if active_key is not None: + extracted_paragraphs[active_key].append(text) + return current_index + + if match_keywords(text, keywords): + active_key = text + extracted_paragraphs[active_key] = [text] + if match_keywords(text, follow_up_keywords): + continue_collecting = True + section_number = re.match(r'(\d+(\s*\.\s*\d+)*)', text) + if section_number: + current_section_number = section_number.group(1) + level_count = current_section_number.count('.') + + # Pattern to match current level, e.g., 3.4.5 + pattern = r'^' + (r'\d+\s*\.\s*') * level_count + r'\d+' + + # Generate patterns for next section at same level and parent level + parts = current_section_number.split('.') + matched_patterns = [pattern] # start with the full pattern + + # Next section at same level + parts[-1] = str(int(parts[-1]) + 1) + next_pattern = r'^' + r'\s*\.\s*'.join(parts) + matched_patterns.append(next_pattern) + + # Parent section (if applicable) + if len(parts) > 1: + parent_section_parts = parts[:-1] + parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1) + parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts) + matched_patterns.append(parent_pattern) + + # Combine the patterns + combined_pattern = r'(' + r')|('.join(matched_patterns) + r')' + current_section_pattern = re.compile(combined_pattern) + + else: + found_next_number = False + current_section_pattern = None + + while current_index < len(doc.paragraphs) - 1: + current_index += 1 + next_text = doc.paragraphs[current_index].text.strip() + if not found_next_number: + next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text) + if next_section_number: + found_next_number = True + if next_section_number.group(1): + section_parts = next_section_number.group(1).split('.') + dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b' + elif next_section_number.group(2): + dynamic_pattern = r'^[\(\(]\d+[\)\)]' + current_section_pattern = re.compile(dynamic_pattern) + if current_section_pattern and re.match(current_section_pattern, next_text): + extracted_paragraphs[active_key].append(next_text) + else: + continue_collecting = False + active_key=None + break + + return current_index + + index = 0 + while index < len(doc.paragraphs): + index = extract_from_text(doc.paragraphs[index].text.strip(), index) + index += 1 + + return extracted_paragraphs + +def preprocess_text_list(text_list): + new_text_list = [] + # 正则表达式匹配中文字符或标点后的空格,该空格后紧跟字母、数字或带括号的数字 + split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))') + for text in text_list: + # 使用正则表达式检查并拆分元素 + parts = split_pattern.split(text) + new_text_list.extend(part.strip() for part in parts if part.strip()) # 添加非空字符串检查 + + return new_text_list + +def clean_dict_datas(extracted_contents, keywords,excludes): #让正则表达式提取到的东西格式化 + all_texts1 = [] + all_texts2=[] + # 定义用于分割句子的正则表达式,包括中文和西文的结束标点 + split_pattern = r'(?<=[。!?\!\?])' + + for key, text_list in extracted_contents.items(): + if len(text_list) == 1: + for data in text_list: + # 检查是否包含任何需要排除的字符串 + if any(exclude in data for exclude in excludes): + continue # 如果包含任何排除字符串,跳过这个数据 + # 去掉开头的序号,包括字母+数字的格式 以及括号+数字 + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)' + data = re.sub(pattern, '', data).strip() + keyword_match = re.search(keywords, data) + if keyword_match: + # 从关键词位置开始查找结束标点符号 + start_pos = keyword_match.start() + # 截取从关键词开始到后面的内容 + substring = data[start_pos:] + # 按定义的结束标点分割 + sentences = re.split(split_pattern, substring, 1) + if len(sentences) > 0 and sentences[0]: + # 只取第一句,保留标点 + cleaned_text = data[:start_pos] + sentences[0] + else: + cleaned_text = data # 如果没有标点,使用整个字符串 + else: + # 如果没有找到关键词,保留原文本 + cleaned_text = data + all_texts1.append(cleaned_text) # 将处理后的文本添加到结果列表 + + else: + print(text_list) + new_text_list=preprocess_text_list(text_list) + print(new_text_list) + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、)?)' + data = re.sub(pattern, '', new_text_list[0]).strip() #去除序号 + # 将修改后的第一个元素和剩余的元素连接起来 + new_text_list[0] = data # 更新列表中的第一个元素 + joined_text = "\n".join(new_text_list) # 如果列表中有多个元素,则连接它们 + all_texts2.append(joined_text) # 将每个列表的内容添加到 all_texts 中 + + return all_texts1,all_texts2 #all_texts1要额外用gpt all_text2直接返回结果 +def find_sentences_with_keywords(data, keywords, follow_up_keywords): + """递归查找并返回包含关键词的句子列表,并根据是否存在后续关键词分别存储到两个列表中。""" + sentences1 = [] # 保存没有后续关键词的情况 + sentences2 = [] # 保存有后续关键词的情况 + + if isinstance(data, dict): + for value in data.values(): + result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, list): + for item in data: + result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords) + sentences1.extend(result1) + sentences2.extend(result2) + elif isinstance(data, str): + # 分割句子,保证句子完整性 + split_sentences = re.split(r'(?<=[。!?\!\?])', data) + i = 0 + while i < len(split_sentences): + sentence = split_sentences[i] + if re.search(keywords, sentence, re.IGNORECASE): + follow_up_present = any( + re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords) + if follow_up_present: + # 如果存在后续关键词,则从当前位置开始截取 + start_index = i + end_index = start_index + found_next_section = False + for j in range(start_index + 1, len(split_sentences)): + if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()): + end_index = j + found_next_section = True + break + + if found_next_section: + full_text = ' '.join(split_sentences[start_index:end_index]).strip() + else: + full_text = ' '.join(split_sentences[start_index:]).strip() + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + data=re.sub(pattern,'',full_text) + sentences2.append(data) # 存储有后续关键词的情况 + i = end_index if found_next_section else len(split_sentences) + else: + pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+(\.\d+)*(\s|\.|、)?)' + data = re.sub(pattern, '', sentence).replace('\n','').strip() + sentences1.append(data) # 存储没有后续关键词的情况 + i += 1 + else: + i += 1 + + return sentences1, sentences2 # 返回两个列表 + +def extract_sentences_from_json(json_path, keywords,follow_up_keywords): + with open(json_path, 'r', encoding='utf-8') as file: + data = json.load(file) + """从JSON数据中提取包含关键词的句子。""" + return find_sentences_with_keywords(data, keywords,follow_up_keywords) + +#处理无效投标 +def extract_values_if_contains(data, includes): + """ + 递归检查字典中的值是否包含列表 'includes' 中的内容。 + 如果包含,将这些值添加到一个列表中并返回。 + + 参数: + data (dict): 字典或从 JSON 解析得到的数据。 + includes (list): 包含要检查的关键词的列表。 + + 返回: + list: 包含满足条件的值的列表。 + """ + included_values = [] # 初始化结果列表 + + # 定义递归函数来处理嵌套字典 + def recursive_search(current_data): + if isinstance(current_data, dict): + for key, value in current_data.items(): + if isinstance(value, dict): + # 如果值是字典,递归搜索 + recursive_search(value) + elif isinstance(value, str): + # 如果值是字符串,检查是否包含任何 includes 中的关键词 + if any(include in value for include in includes): + included_values.append(value) + elif isinstance(current_data, list): + for item in current_data: + # 如果是列表,递归每个元素 + recursive_search(item) + + # 开始递归搜索 + recursive_search(data) + + return included_values + + +#你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。 +#以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。 +#以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。", + +def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path): + excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"] + follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下'] + extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) #字典结果 + # print(extracted_contents) + all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 + all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords) + qianwen_txt = all_texts1 + all_tables1 + # Proceed only if there is content to write + if qianwen_txt: + with open(output_file, 'w', encoding='utf-8') as file: + # 初始化一个计数器 + counter = 1 + for content in qianwen_txt: + file.write("..............."+'\n') + # 写入内容前加上序号,后面接一个点和空格,然后是内容 + file.write(f"{counter}. {content}\n") + # 更新计数器,每次循环递增 + counter += 1 + file_id = upload_file(output_file) + qianwen_ans = qianwen_long(file_id, user_query) + selected_contents = [] + num_list = process_string_list(qianwen_ans) + print(num_list) + + for index in num_list: + if index - 1 < len(qianwen_txt): + content = qianwen_txt[index - 1] # 转换序号为索引(假设序号从1开始) + selected_contents.append(content) + selected_contents += all_texts2 + selected_contents += all_tables2 + # 创建一个字典来保存结果 + res = {result_key: selected_contents} + # 将结果转换为JSON字符串 + # os.remove(output_file) # Remove the file after use + # print(f"Deleted temporary file: {output_file}") + else: + res = {result_key: ""} # Set the response to empty if no contents were extracted + return res + +def combine_find_invalid(file_path, output_dir, truncate_json_path,clause_path): + queries = [ + (r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形"), + (r'废\s*标', + "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。", + os.path.join(output_dir, "temp2.txt"), "废标项") + ] + results = [] + + # 使用线程池来并行处理查询 + with ThreadPoolExecutor() as executor: + futures = [] + for keywords, user_query, output_file, result_key in queries: + future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords, + truncate_json_path) + futures.append(future) + time.sleep(1) # 暂停1秒后再提交下一个任务 + + for future in futures: + results.append(future.result()) + + #禁止投标 + print("starting不得存在的情形...") + forbidden_res = find_forbidden(truncate_json_path, clause_path) + results.append(forbidden_res) + + combined_dict = {} + for d in results: + combined_dict.update(d) + + print("无效标与废标done...") + # return nest_json_under_key(combined_dict, "无效标与废标项") + return {"无效标与废标项":combined_dict} + +if __name__ == '__main__': + start_time = time.time() + truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part1\\truncate_output.json" + clause_path="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.json" + output_dir = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\invalid" + # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx' + doc_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).docx' + results = combine_find_invalid(doc_path, output_dir,truncate_json_path,clause_path) + end_time = time.time() + print("Elapsed time:", str(end_time - start_time)) + print("Results:", json.dumps(results,ensure_ascii=False,indent=4))