diff --git a/flask_app/main/商务标技术标整合.py b/flask_app/main/商务标技术标整合.py index 36f1067..37f36ee 100644 --- a/flask_app/main/商务标技术标整合.py +++ b/flask_app/main/商务标技术标整合.py @@ -74,9 +74,9 @@ def combine_technical_and_business(data, target_values): # 默认其他所有内容都归为商务标 else: if not is_business: - if '商务标' not in extracted_data: - extracted_data['商务标'] = {} - extracted_data['商务标'][key] = value + if '商务评分' not in extracted_data: + extracted_data['商务评分'] = {} + extracted_data['商务评分'][key] = value business_found = True continue @@ -90,18 +90,18 @@ def combine_technical_and_business(data, target_values): extract_nested(data) if not technical_found: - extracted_data['技术标'] = '' + extracted_data['技术评分'] = '' if not business_found: - extracted_data['商务标'] = '' + extracted_data['商务评分'] = '' return extracted_data def combine_evaluation_standards(truncate2): # 商务标、技术标评分项:千问 file_id = upload_file(truncate2) user_query_2 = ( - "根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容") + "根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容") evaluation_res = qianwen_long(file_id, user_query_2) - target_values1 = ['技术标','技术部分','设计', '实施'] + target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"] # target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉'] # update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2) update_json = combine_technical_and_business(clean_json_string(evaluation_res), target_values1) @@ -112,8 +112,8 @@ if __name__ == "__main__": truncate2="C:\\Users\\Administrator\\Desktop\\fsdownload\\0883895c-e61f-4a99-9308-697fca1d4b77\\ztbfile_evaluation_method.pdf" evaluation_standards_res=combine_evaluation_standards(truncate2) # 从结果中提取"商务标"和"技术标" - technical_standards = {"技术标": 
import re
from unittest.mock import Mock


def clean_page_content(text, common_header):
    """Stand-in for the real page cleaner.

    Removes every occurrence of ``common_header`` from ``text`` and returns
    the result.
    """
    return text.replace(common_header, '')


def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Locate the start, middle and end page indices of a section.

    Args:
        pdf_document: object exposing ``pages``; each page must provide
            ``extract_text()``.
        begin_pattern: regex passed to ``re.search`` to find the first page.
        end_pattern: regex passed to ``re.search`` to find the last page.
        begin_page: the start page must have an index strictly greater
            than this value.
        common_header: text removed from every page before matching.

    Returns:
        ``(start_page, mid_page, end_page)``; any element is ``None`` when
        the corresponding page was not found. The end page is only looked
        for once a middle page has been found.
    """
    start_page = None
    mid_page = None
    end_page = None
    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
            start_page = i
        # No re.MULTILINE here, so '^' only matches at the very beginning
        # of the cleaned page text.
        if start_page is not None and mid_page is None and re.search(r'^一\s*[、.]\s*(说\s*明|总\s*则)', cleaned_text):
            mid_page = i
        if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
            end_page = i
            break
    return start_page, mid_page, end_page


def create_mock_pdf(page_contents):
    """Build a mock PDF document whose pages return the given texts.

    Each page gets its own ``extract_text`` return value. A
    ``lambda: content`` created inside a comprehension would be bound late
    and make every page return the text of the LAST page.
    """
    pages = []
    for content in page_contents:
        page = Mock()
        page.extract_text.return_value = content
        pages.append(page)
    pdf_mock = Mock()
    pdf_mock.pages = pages
    return pdf_mock


def test_extract_pages():
    """Print the page indices found for four hand-written documents."""
    # Case 1: the normal situation
    pdf1 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一、说明",
        "更多内容",
        "结束标记"
    ])
    result1 = extract_pages_tobidders_notice(pdf1, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 1:", result1)  # expected output: (1, 3, 5)

    # Case 2: no middle page
    pdf2 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "结束标记"
    ])
    result2 = extract_pages_tobidders_notice(pdf2, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    # The end page is only accepted after a middle page was found, so the
    # end marker on page 3 is ignored here.
    print("Test Case 2:", result2)  # expected output: (1, None, None)

    # Case 3: the heading uses "总则" instead of "说明"
    pdf3 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一、总则",
        "更多内容",
        "结束标记"
    ])
    result3 = extract_pages_tobidders_notice(pdf3, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 3:", result3)  # expected output: (1, 3, 5)

    # Case 4: the middle heading is formatted differently
    pdf4 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一.说 明",
        "更多内容",
        "结束标记"
    ])
    result4 = extract_pages_tobidders_notice(pdf4, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 4:", result4)  # expected output: (1, 3, 5)


# Run the checks
test_extract_pages()
output_suffix == "tobidders_notice": + return result # 返回包含两个路径的元组 + elif output_suffix == "qualification1": + merge_and_cleanup(result, "qualification3") + return result return None def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): @@ -142,11 +144,17 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt if is_pdf_or_doc(file_path): result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) if result: - generated_files.append(result) + if isinstance(result, tuple): + generated_files.extend(result) + else: + generated_files.append(result) elif os.path.isfile(input_path) and is_pdf_or_doc(input_path): result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) if result: - generated_files.append(result) + if isinstance(result, tuple): + generated_files.extend(result) + else: + generated_files.append(result) else: print("提供的路径既不是文件夹也不是PDF文件。") @@ -174,20 +182,49 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter common_header = extract_common_header(pdf_path) pdf_document = PdfReader(pdf_path) exclusion_pattern = None - if output_suffix=="qualification1": - exclusion_pattern=re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') - start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, - common_header,exclusion_pattern) - if start_page is None or end_page is None: - print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") - return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header) - elif output_suffix=="qualification1": - truncate_pdf_main(pdf_path,output_folder,2,"qualification3") - return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) + if output_suffix == "tobidders_notice": + start_page, mid_page, end_page = 
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Find the start, middle and end page indices of a section.

    Pages are scanned in order. Each page's text has ``common_header``
    cleaned out via ``clean_page_content`` before matching.

    Args:
        pdf_document: object exposing ``pages``; each page must provide
            ``extract_text()``.
        begin_pattern: regex handed to ``re.search`` for the first page.
        end_pattern: regex handed to ``re.search`` for the last page.
        begin_page: the first page must have an index strictly greater
            than this value.
        common_header: header text passed on to ``clean_page_content``.

    Returns:
        ``(start_page, mid_page, end_page)``; an element stays ``None``
        when no matching page was found. The scan stops at the end page.
    """
    # Heading that opens the second part: optional brackets around "一"
    # or "1", optional separator, then "说明" or "总则" at a line start.
    heading_regex = r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)'
    first = None
    middle = None
    last = None
    for index, page in enumerate(pdf_document.pages):
        raw = page.extract_text() or ""
        body = clean_page_content(raw, common_header)

        if first is None:
            if re.search(begin_pattern, body) and index > begin_page:
                first = index
        if first is None:
            # Nothing else is looked for before the section has started.
            continue

        if middle is None:
            if re.search(heading_regex, body, re.MULTILINE):
                middle = index
        if middle is None:
            # The end page is only accepted after the middle page.
            continue

        if re.search(end_pattern, body) and index > middle:
            last = index
            break
    return first, middle, last
def truncate_pdf_multiple(input_path, output_folder):
    """Run ``truncate_pdf_main`` for selections 1 to 4 and gather the results.

    Args:
        input_path: forwarded unchanged to ``truncate_pdf_main``.
        output_folder: forwarded unchanged to ``truncate_pdf_main``.

    Returns:
        One flat list holding the items returned for each selection, in
        selection order.
    """
    collected = []
    for choice in (1, 2, 3, 4):
        collected += truncate_pdf_main(input_path, output_folder, choice)
    return collected