9.27

2024-09-27 15:00:30 +08:00 · 2024-09-27 15:00:30 +08:00 · d8247c95f4
commit d8247c95f4
parent e6cedef2c2
5 changed files with 153 additions and 70 deletions
--- a/flask_app/main/商务标技术标整合.py
+++ b/flask_app/main/商务标技术标整合.py
@ -74,9 +74,9 @@ def combine_technical_and_business(data, target_values):
        # 默认其他所有内容都归为商务标
        else:
          if not is_business:
-            if '商务标' not in extracted_data:
-              extracted_data['商务标'] = {}
-            extracted_data['商务标'][key] = value
+            if '商务评分' not in extracted_data:
+              extracted_data['商务评分'] = {}
+            extracted_data['商务评分'][key] = value
            business_found = True
            continue

@ -90,18 +90,18 @@ def combine_technical_and_business(data, target_values):
  extract_nested(data)

  if not technical_found:
-    extracted_data['技术标'] = ''
+    extracted_data['技术评分'] = ''
  if not business_found:
-    extracted_data['商务标'] = ''
+    extracted_data['商务评分'] = ''

  return extracted_data
 def combine_evaluation_standards(truncate2):
    # 商务标、技术标评分项：千问
    file_id = upload_file(truncate2)
    user_query_2 = (
-        "根据该文档中的评标办法前附表，请你列出该文件的技术标，商务标，投标报价评审标准以及它们对应的具体评分要求，若对应内容中存在其他信息，在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个，则返回文档中给定的评分内容以及它的评分要求，都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
+        "根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，若对应内容中存在其他信息，在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容不是这3个，则返回文档中给定的评分内容以及它的评分要求，都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
    evaluation_res = qianwen_long(file_id, user_query_2)
-    target_values1 = ['技术标','技术部分','设计', '实施']
+    target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"]
    # target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
    # update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2)
    update_json = combine_technical_and_business(clean_json_string(evaluation_res), target_values1)
@ -112,8 +112,8 @@ if __name__ == "__main__":
    truncate2="C:\\Users\\Administrator\\Desktop\\fsdownload\\0883895c-e61f-4a99-9308-697fca1d4b77\\ztbfile_evaluation_method.pdf"
    evaluation_standards_res=combine_evaluation_standards(truncate2)
    # 从结果中提取"商务标"和"技术标"
-    technical_standards = {"技术标": evaluation_standards_res.get("技术标", {})}
-    commercial_standards = {"商务标": evaluation_standards_res.get("商务标", {})}
+    technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
+    commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
    # 返回技术标和商务标
    print(json.dumps(technical_standards,ensure_ascii=False,indent=4))
    print(json.dumps(commercial_standards, ensure_ascii=False, indent=4))
--- a/flask_app/main/招标文件解析.py
+++ b/flask_app/main/招标文件解析.py
@ -119,8 +119,8 @@ def fetch_evaluation_standards(truncate1):  # 评标办法前附表
    evaluation_standards_res = combine_evaluation_standards(truncate1)

    # 获取技术标和商务标
-    technical_standards = {"技术标": evaluation_standards_res.get("技术标", {})}
-    commercial_standards = {"商务标": evaluation_standards_res.get("商务标", {})}
+    technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
+    commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}

    logger.info("商务标和技术标 done")

--- a/flask_app/main/读取文件/按页读取pdf.py
+++ b/flask_app/main/读取文件/按页读取pdf.py
@ -72,6 +72,6 @@ def extract_text_by_page(file_path):
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
        return result
 if __name__ == '__main__':
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
+    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\中国电信股份有限公司随州分公司广水市雪亮工程信息化项目-招标文件（定稿）.pdf"
    res=extract_text_by_page(file_path)
    # print(res)
--- a/flask_app/货物标/test3.py
+++ b/flask_app/货物标/test3.py
@ -1,42 +1,79 @@
 import re
+from unittest.mock import Mock

-def contains_number_or_index(key, value):
-    return (isinstance(value, (int, float)) or
-            (isinstance(value, str) and value.isdigit()) or
-            '序号' in key or
-            (isinstance(value, str) and re.search(r'\d+', value)))
+def clean_page_content(text, common_header):
+    """Stand-in for the real page cleaner: strip the shared header from a page.
+
+    Args:
+        text: Raw text of a single page.
+        common_header: Header string to remove; an empty header leaves the
+            text untouched.
+
+    Returns:
+        ``text`` with every occurrence of ``common_header`` removed.
+    """
+    if not common_header:
+        return text
+    # Splitting on the header and re-joining drops each occurrence.
+    return ''.join(text.split(common_header))

-def preprocess_dict(data):
-    if isinstance(data, dict):
-        if len(data) > 1:
-            # 检查是否所有值都是 "" 或 "/"
-            if all(v == "" or v == "/" for v in data.values()):
-                return list(data.keys())
-            else:
-                processed = {}
-                for k, v in data.items():
-                    if not contains_number_or_index(k, v):
-                        processed_v = preprocess_dict(v)
-                        # 即使 processed_v 是空字符串，也保留键
-                        processed[k] = processed_v
-                return processed
-        else:
-            return {k: preprocess_dict(v) for k, v in data.items()}
-    elif isinstance(data, list):
-        return [preprocess_dict(item) for item in data]
-    else:
-        return data
+def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
+    """Locate the start, middle and end page indices of the bidder-notice section.
+
+    Args:
+        pdf_document: Object whose ``pages`` attribute is iterable; each page
+            must provide ``extract_text()``.
+        begin_pattern: Regex (string or compiled) that marks the first page.
+        end_pattern: Regex (string or compiled) that marks the last page.
+        begin_page: Only pages whose index is strictly greater than this value
+            may be chosen as the start page.
+        common_header: Header text handed to ``clean_page_content``.
+
+    Returns:
+        ``(start_page, mid_page, end_page)`` as 0-based page indices; an entry
+        is ``None`` when the corresponding page was not found.
+    """
+    # Heading such as "一、说明", "（一）总则" or "1. 说明" that opens the body text.
+    # re.MULTILINE makes "^" match at every line start, so the heading is
+    # found even when it is not the very first line of the page.
+    mid_pattern = re.compile(
+        r'^\s*[（(]?\s*[一1]\s*[)）]?\s*[、.]*\s*(说\s*明|总\s*则)', re.MULTILINE)
+    start_page = None
+    mid_page = None
+    end_page = None
+    for i, page in enumerate(pdf_document.pages):
+        text = page.extract_text() or ""
+        cleaned_text = clean_page_content(text, common_header)
+        if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
+            start_page = i
+        # The middle heading may sit on the same page as the start marker.
+        if start_page is not None and mid_page is None and mid_pattern.search(cleaned_text):
+            mid_page = i
+        # The end marker is only accepted on a page after the middle page.
+        if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
+            end_page = i
+            break
+    return start_page, mid_page, end_page

-input_data = {
-    "具备法律、行政法规规定的其他条件的证明材料": {
-        "国家对生产和销售相关产品或提供相关服务有专门法律、行政法规规定的": "则 必须提供取得国家有关主管部门行政许可的证明材料",
-        "未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、 重大税收违法案件当事人名单、政府采购严重违法失信行为记录名单和“中国 政府采购”网站（www.ccgp.gov.cn）政府采购严重违法失信行为记录名单": "并 提供网页截图以证明",
-        "招标文件第一章“投标人资格要求”中有特殊要求的": "投标人应提供其符合特 殊要求的证明材料或者情况说明",
-        "不符合联合体投标相关规定和要求的": "",
-        "投标人认为需提供的其它相关资格证明材料": "",
-        "资格证明文件正本应为清晰彩色影印件且加盖单位公章": ""
-    }
-}
+# 创建模拟的PDF文档
+def create_mock_pdf(page_contents):
+    """Build a mock PDF whose pages return the given texts from ``extract_text()``.
+
+    Args:
+        page_contents: Iterable of strings, one per page, in page order.
+
+    Returns:
+        A ``Mock`` with a ``pages`` list; ``pages[i].extract_text()`` returns
+        the i-th string of ``page_contents``.
+    """
+    pdf_mock = Mock()
+    # Bind each page's text through ``return_value``. A ``lambda: content``
+    # created inside the comprehension would look ``content`` up only when
+    # called, so every page would return the text of the last page.
+    pdf_mock.pages = [Mock(extract_text=Mock(return_value=content)) for content in page_contents]
+    return pdf_mock

-processed_data = preprocess_dict(input_data)
-print(processed_data)
+# 测试用例
+def test_extract_pages():
+    """Run the page locator on four mock documents and print each result."""
+    begin_regex = r'第.+章\s*投标人须知'
+    end_regex = r'结束标记'
+    # One list of page texts per scenario, in test-case order. The results
+    # quoted below assume each mock page returns its own text.
+    scenarios = [
+        # Case 1: ordinary layout with a "一、说明" heading -> (1, 3, 5).
+        [
+            "Page 0",
+            "第一章 投标人须知",
+            "其他内容",
+            "一、说明",
+            "更多内容",
+            "结束标记",
+        ],
+        # Case 2: no middle heading. The locator only looks for the end marker
+        # after a middle page has been found, so this yields (1, None, None)
+        # rather than (1, None, 3).
+        [
+            "Page 0",
+            "第一章 投标人须知",
+            "其他内容",
+            "结束标记",
+        ],
+        # Case 3: "总则" instead of "说明" as the middle heading -> (1, 3, 5).
+        [
+            "Page 0",
+            "第一章 投标人须知",
+            "其他内容",
+            "一、总则",
+            "更多内容",
+            "结束标记",
+        ],
+        # Case 4: middle heading written with a dot and inner spaces -> (1, 3, 5).
+        [
+            "Page 0",
+            "第一章 投标人须知",
+            "其他内容",
+            "一.说  明",
+            "更多内容",
+            "结束标记",
+        ],
+    ]
+    for case_number, page_texts in enumerate(scenarios, start=1):
+        mock_pdf = create_mock_pdf(page_texts)
+        located = extract_pages_tobidders_notice(mock_pdf, begin_regex, end_regex, -1, '')
+        print(f"Test Case {case_number}:", located)
+
+
+# 运行测试
+test_extract_pages()
--- a/flask_app/货物标/货物标截取pdf.py
+++ b/flask_app/货物标/货物标截取pdf.py
@ -123,11 +123,13 @@ def merge_and_cleanup(output_pdf_path, suffix_to_merge):

 def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    pdf_path = convert_to_pdf(file_path)
-    output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
-    if output_pdf_path:
-        if output_suffix == "qualification1":
-            merge_and_cleanup(output_pdf_path, "qualification3")
-        return output_pdf_path
+    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
+    if result:
+        if output_suffix == "tobidders_notice":
+            return result  # 返回包含两个路径的元组
+        elif output_suffix == "qualification1":
+            merge_and_cleanup(result, "qualification3")
+        return result
    return None

 def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
@ -142,10 +144,16 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
            if is_pdf_or_doc(file_path):
                result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
                if result:
+                    if isinstance(result, tuple):
+                        generated_files.extend(result)
+                    else:
                        generated_files.append(result)
    elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
        result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        if result:
+            if isinstance(result, tuple):
+                generated_files.extend(result)
+            else:
                generated_files.append(result)
    else:
        print("提供的路径既不是文件夹也不是PDF文件。")
@ -174,10 +182,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        exclusion_pattern = None
+        if output_suffix == "tobidders_notice":
+            start_page, mid_page, end_page = extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern,
+                                                                            begin_page, common_header)
+            if start_page is None or mid_page is None or end_page is None:
+                print(f"未找到所需页面在文件 {pdf_path} 中！")
+                return None
+            path1 = save_extracted_pages(pdf_document, start_page, mid_page - 1, pdf_path, output_folder,
+                                         "tobidders_notice_part1")
+            path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
+                                         "tobidders_notice_part2")
+            return path1, path2
+        else:
+            # 原有的处理逻辑保持不变
            if output_suffix == "qualification1":
                exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
-        start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
-                                                     common_header,exclusion_pattern)
+            start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern)
            if start_page is None or end_page is None:
                print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
                return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
@ -188,6 +208,23 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
        print(f"Error processing {pdf_path}: {e}")
        return None

+def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
+    """Find the start, middle and end page indices of the bidder-notice section.
+
+    Pages are scanned in order. The start page is the first page after
+    ``begin_page`` whose cleaned text matches ``begin_pattern``. The middle
+    page is the first page from the start page onwards containing a line that
+    begins with a "一、说明" / "（一）总则" / "1. 说明" style heading. The end
+    page is the first page after the middle page matching ``end_pattern``.
+
+    Args:
+        pdf_document: Object whose ``pages`` attribute is iterable; each page
+            must provide ``extract_text()``.
+        begin_pattern: Regex (string or compiled) marking the start page.
+        end_pattern: Regex (string or compiled) marking the end page.
+        begin_page: Index that the start page must strictly exceed.
+        common_header: Header text handed to ``clean_page_content``.
+
+    Returns:
+        ``(start_page, mid_page, end_page)`` as 0-based indices; an entry is
+        ``None`` when that page was not found.
+    """
+    heading_regex = r'^\s*[（(]?\s*[一1]\s*[)）]?\s*[、.]*\s*(说\s*明|总\s*则)'
+    start_page = mid_page = end_page = None
+    for page_index, page in enumerate(pdf_document.pages):
+        page_text = clean_page_content(page.extract_text() or "", common_header)
+        if start_page is None:
+            if not (re.search(begin_pattern, page_text) and page_index > begin_page):
+                continue
+            start_page = page_index
+        if mid_page is None:
+            # The heading may share a page with the start marker. The end
+            # marker is never accepted on the page where the heading is found.
+            if re.search(heading_regex, page_text, re.MULTILINE):
+                mid_page = page_index
+            continue
+        # Reaching here means the middle page was set on an earlier page.
+        if re.search(end_pattern, page_text) and page_index > mid_page:
+            end_page = page_index
+            break
+    return start_page, mid_page, end_page
+
+

 def get_patterns_for_procurement():
    begin_pattern = re.compile(
@ -298,6 +335,15 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
        )
        local_output_suffix = "qualification1"
+    elif selection ==4:     #投标人须知前附表和正文
+        begin_page=1
+        begin_pattern = re.compile(
+            r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', re.MULTILINE
+        )
+        end_pattern=re.compile(
+            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
+        )
+        local_output_suffix = "tobidders_notice"
    else:
        print("无效的选择")
        return None
@ -313,7 +359,7 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau

 def truncate_pdf_multiple(input_path, output_folder):
    truncate_files = []
-    for selection in range(1, 4):
+    for selection in range(1, 5):
        files = truncate_pdf_main(input_path, output_folder, selection)
        truncate_files.extend(files)
    return truncate_files
@ -321,8 +367,8 @@ def truncate_pdf_multiple(input_path, output_folder):

 # TODO:交通智能系统和招标(1)(1)文件有问题
 if __name__ == "__main__":
-    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
-    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
+    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件（定稿）（三次）.pdf"
+    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
    # truncate_pdf_multiple(input_path,output_folder)
-    selection = 3  # 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查
+    selection = 4  # 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2  4.投标人须知前附表
    generated_files = truncate_pdf_main(input_path, output_folder, selection)