10.25切分改为多线程

2024-10-25 14:00:31 +08:00 · 2024-10-25 14:00:31 +08:00 · 7a03351f28
commit 7a03351f28
parent d692f06125
7 changed files with 262 additions and 122 deletions
--- a/flask_app/general/format_date.py
+++ b/flask_app/general/format_date.py
@ -0,0 +1,158 @@
+import datetime
+import re
+
+
+def format_chinese_date(date_str):
+    """
+    Normalize a Chinese-style date/time string to 'YYYY-MM-DD HH:MM:SS'.
+
+    Whitespace, full-width parenthesised notes and every character other than
+    digits, 年/月/日/时/分/秒, colons and '-' are discarded before parsing. A
+    missing time defaults to 00:00:00 and missing minutes/seconds to 00. Text
+    that already looks like 'YYYY-MM-DD HH:MM[:SS]' (or uses a 'T' separator)
+    is accepted as well.
+
+    Args:
+        date_str (str): Raw date text, e.g. "20 19 年7 月18日 09： 30整".
+
+    Returns:
+        str | None: The normalized string, e.g. "2019-07-18 09:30:00". None is
+        returned, after printing a diagnostic, when the input is not a
+        non-empty string or does not describe a valid date/time.
+
+    Examples:
+        >>> format_chinese_date("20 19 年7 月18日 09： 30整（北京时间）")
+        '2019-07-18 09:30:00'
+        >>> format_chinese_date("2020年12月5日")
+        '2020-12-05 00:00:00'
+        >>> format_chinese_date("2021年3月15日16:")
+        '2021-03-15 16:00:00'
+        >>> format_chinese_date("2019-07-18 09:30")
+        '2019-07-18 09:30:00'
+        >>> format_chinese_date("2019-07-18T09:30:05")
+        '2019-07-18 09:30:05'
+    """
+    # Non-string or blank input is reported as a format error rather than
+    # failing with AttributeError on the string operations below.
+    if not isinstance(date_str, str) or not date_str.strip():
+        print(f"日期格式错误: 输入不是有效的日期字符串 - {date_str!r}")
+        return None
+
+    # 1. Drop full-width parenthesised remarks such as （北京时间）. DOTALL lets
+    #    a remark span line breaks, which are only removed in step 3.
+    date_str = re.sub(r'（.*?）', '', date_str, flags=re.DOTALL)
+
+    # 2. Step 3 deletes all whitespace, so a date and time separated only by a
+    #    space (or 'T') would fuse into "2019-07-1809:30". When no 日 marks the
+    #    boundary, insert one while the separator is still visible. The
+    #    lookahead limits this to text where a clock time really follows.
+    if '日' not in date_str:
+        date_str = re.sub(
+            r'(\d\s*[年\-]\s*\d{1,2}\s*[月\-]\s*\d{1,2})[\sT]+(?=\d{1,2}\s*[:：时])',
+            r'\1日',
+            date_str,
+            count=1,
+        )
+
+    # 3. Remove every whitespace character.
+    date_str = ''.join(date_str.split())
+
+    # 4. Keep only digits, 年月日时分秒, colons and '-'.
+    date_str = re.sub(r'[^\d年月日时分秒:：\-]', '', date_str)
+
+    # 5. Map the markers onto the target layout: full-width colon -> ':',
+    #    年/月 -> '-', 日 -> ' ' (date/time boundary), 时/分 -> ':', 秒 -> ''.
+    date_str = date_str.replace('：', ':')
+    date_str = date_str.replace('年', '-').replace('月', '-').replace('日', ' ')
+    date_str = date_str.replace('时', ':').replace('分', ':').replace('秒', '')
+
+    # 6. Split at the first boundary; without one the whole text is the date.
+    date_part, _, time_part = date_str.partition(' ')
+
+    # 7. Time: at most hour:minute:second. Absent or empty fields (e.g. the
+    #    trailing one in "16:") become "00"; single digits are zero-padded.
+    time_fields = time_part.split(':') if time_part else []
+    if len(time_fields) > 3:
+        print(f"日期格式错误: 时间部分格式不正确 - {time_part}")
+        return None
+
+    time_fields += ['00'] * (3 - len(time_fields))
+    time_part = ':'.join(f"{field:0>2}" for field in time_fields)
+
+    # 8. Date: exactly year-month-day, with month and day zero-padded.
+    date_fields = date_part.split('-')
+    if len(date_fields) != 3:
+        print(f"日期格式错误: 日期部分格式不正确 - {date_part}")
+        return None
+
+    year, month, day = date_fields
+    date_str = f"{year}-{month.zfill(2)}-{day.zfill(2)} {time_part}"
+
+    # 9. strptime is the final validator: it rejects impossible values such as
+    #    month 13, April 31 or hour 25. The same layout is used for the output.
+    output_format = "%Y-%m-%d %H:%M:%S"
+    try:
+        parsed = datetime.datetime.strptime(date_str, output_format)
+    except ValueError as e:
+        print(f"日期格式错误: {e} - 处理后的字符串: '{date_str}'")
+        return None
+
+    return parsed.strftime(output_format)
+
+
+# Manual demo: run this module directly to print the outcome for every sample.
+if __name__ == "__main__":
+    samples = [
+        # Complete date and time
+        "2019年7月18日09：30",
+        "20 19 年7 月18日 09： 30整（北京时间）",
+        "2020年02月05日12时30分45秒",
+        "2021年3月15日16:45:30",
+        "2022年6月30日23时59分59秒",
+        "2023年01月01日00时00分00秒",
+        "2024年12月31日23:59:59",
+
+        # Date only, no time
+        "2020年12月5日",
+        "2021年5月1日",
+        "2022年02月29日",  # invalid: 2022 is not a leap year
+        "2023年04月31日",  # invalid day
+        "2021年13月1日",  # invalid month
+        "2021年00月10日",  # invalid month
+        "2021年05月00日",  # invalid day
+        "2021年05月32日",  # invalid day
+
+        # Date with a partial time
+        "2020年2月5日12时",
+        "2020年2月5日12时30分",
+        "2021年3月15日16:",
+        "2023年01月01日00：00",
+        "2021年5月1日07时5分5秒",
+        "2021年5月1日07:5:5",
+
+        # Extra Chinese characters and trailing remarks
+        "2020年02月05日12:30整",
+        "2021年3月15日16:45（上午）",
+        "2022年6月30日23:59:59（夜间）",
+        "2023年01月01日00:00（凌晨）",
+        "2024年02月29日00:00:00（闰年）",
+
+        # Different ways of writing the time
+        "2022年6月30日23：59：59",
+        "2023年01月01日00：00",
+        "2021年3月15日16时45分",
+        "2020年02月05日12：30：45",
+
+        # Strings that are not valid dates
+        "无效日期字符串",
+        "2021年5月1日25时00分00秒",  # invalid hour
+        "2021年5月1日23时60分00秒",  # invalid minute
+        "2021年5月1日23时59分60秒",  # invalid second
+    ]
+
+    for raw in samples:
+        result = format_chinese_date(raw)
+        print(type(result))
+        print(f"原始输入: {raw} -> 格式化后: {result}\n")
+        if not result:
+            print("error!")
--- a/flask_app/general/post_processing.py
+++ b/flask_app/general/post_processing.py
@ -1,7 +1,7 @@
 # -*- encoding:utf-8 -*-
 import json
 import re
-
+from flask_app.general.format_date import format_chinese_date

 # 定义一个辅助函数用于获取嵌套字典中的值
 def get_nested(dic, keys, default=None):
@ -96,6 +96,8 @@ def inner_post_processing(base_info):
        "招标单位名称": [["招标人/代理信息", "招标人"]],
        "招标公告地址": [["关键时间/内容", "信息公示媒介"], ["关键时间/内容", "评标结果公示媒介"]],
    }
+    # 定义需要格式化的日期字段
+    date_fields = ["开标时间", "报名截止日期"]

    # 提取并映射字段
    for new_key, paths in mapping.items():
@ -104,6 +106,11 @@ def inner_post_processing(base_info):
            value = get_nested(base_info, path)
            if value:
                break
+        if new_key in date_fields and value:
+            # 调用 format_chinese_date 函数格式化日期
+            formatted_value = format_chinese_date(value)  # 或者使用 format_chinese_date(value) 如果你定义了该函数
+            extracted_info[new_key] = formatted_value if formatted_value else ""
+        else:
            extracted_info[new_key] = value if value else ""

    # 特殊处理 '招标项目地点'
--- a/flask_app/general/读取文件/format_amout.py
+++ b/flask_app/general/读取文件/format_amout.py
@ -0,0 +1,40 @@
+import re
+
+
+def format_amount(original_amount):
+    """
+    Convert an amount such as "RMB100.01万元" to a whole-yuan digit string.
+
+    "<number>万元" is multiplied by 10000 and "<number>元" is taken as is; both
+    are rounded to an integer string. Thousands separators are accepted.
+
+    Args:
+        original_amount (str): Raw amount text; None is treated as "".
+
+    Returns:
+        str: The amount in yuan, or the whitespace-stripped input unchanged
+        when neither unit is found.
+    """
+    text = "" if original_amount is None else str(original_amount)
+    amount_str = ''.join(text.split())
+
+    # Match on a copy without thousands separators: otherwise "1,000,000元"
+    # would match only its last group ("000元") and come out as "0".
+    normalized = re.sub(r'(?<=\d)[,，](?=\d{3}(?!\d))', '', amount_str)
+
+    # A "万元" figure takes precedence over a plain "元" figure.
+    wan_match = re.search(r"(\d+\.?\d*)万元", normalized)
+    if wan_match:
+        return "{:.0f}".format(float(wan_match.group(1)) * 10000)
+
+    yuan_match = re.search(r"(\d+\.?\d*)元", normalized)
+    if yuan_match:
+        return "{:.0f}".format(float(yuan_match.group(1)))
+
+    # No recognizable unit: hand the cleaned text back untouched.
+    return amount_str
+
+if __name__ == "__main__":  # smoke test only; importing this module must not print
+    input_test = ["RMB100.01万元威威"]
+    for sample in input_test:
+        print(format_amount(sample))
--- a/flask_app/main/商务标技术标整合.py
+++ b/flask_app/main/商务标技术标整合.py
@ -155,7 +155,8 @@ def combine_evaluation_standards(truncate2):
    update_json = combine_technical_and_business(clean_json_string(evaluation_res), target_values1)
    return update_json              #商务标技术标整合
 if __name__ == "__main__":
-    truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标01_evaluation_method.pdf"
+    # truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标01_evaluation_method.pdf"
+    truncate2="C:\\Users\\Administrator\\Desktop\\fsdownload\\1801777e-6746-4cd6-8b10-778b3ea57597\\ztbfile_evaluation_method.pdf"
    evaluation_standards_res=combine_evaluation_standards(truncate2)
    # 从结果中提取"商务标"和"技术标"
    technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -2,7 +2,6 @@ from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
 from flask_app.general.merge_pdfs import merge_pdfs
-import concurrent.futures
 def clean_page_content(text, common_header):
    # 首先删除抬头公共部分
    if common_header:  # 确保有公共抬头才进行替换
@ -391,42 +390,15 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
        print(f"合并 PDF 文件时出错: {e}")
        return ""

-
 def truncate_pdf_multiple(input_path, output_folder):
-    """
-    处理 PDF 文件，选择 selection 1-6 的部分，并合并结果。
-
-    Args:
-        input_path (str): 要处理的 PDF 文件路径。
-        output_folder (str): 截取后的文件保存文件夹路径。
-
-    Returns:
-        list: 截取的文件路径列表，包括合并后的文件路径（如果有）。
-    """
    base_file_name = os.path.splitext(os.path.basename(input_path))[0]  # 纯文件名
    truncate_files = []
-    selections = range(1, 7)  # 选择 1 到 6
-
-    # 使用 ThreadPoolExecutor 进行多线程处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
-        # 提交所有任务并获取未来对象
-        future_to_selection = {executor.submit(truncate_pdf_main, input_path, output_folder, selection): selection for
-                               selection in selections}
-
-        # 逐个获取完成的任务
-        for future in concurrent.futures.as_completed(future_to_selection):
-            selection = future_to_selection[future]
-            try:
-                files = future.result()
-                if files and any(f for f in files):
-                    # 过滤空字符串
-                    valid_files = [f for f in files if f]
-                    truncate_files.extend(valid_files)
+    for selection in range(1, 7):
+        files = truncate_pdf_main(input_path, output_folder, selection)
+        if files:
+            truncate_files.extend(files)
        else:
            truncate_files.append("")  # 截取失败时添加空字符串
-            except Exception as e:
-                print(f"Selection {selection} generated an exception: {e}")
-                truncate_files.append("")  # 发生异常时添加空字符串

    if any(f for f in truncate_files if f):  # 检查是否有有效的文件路径
        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
@ -442,16 +414,14 @@ def truncate_pdf_multiple(input_path, output_folder):
        print(f"没有文件需要合并 for {input_path}")

    return truncate_files
-
-
 def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
    """
-    处理 PDF 文件，选择指定的 selections，并合并结果。
+    处理 PDF 文件，选择 selection 为指定部分，并合并结果。

    Args:
        pdf_path (str): 要处理的 PDF 文件路径。
        output_folder (str): 截取后的文件保存文件夹路径。
-        selections (list): 需要截取的部分（例如 [4, 5]）。
+        selections (list): 需要截取的部分

    Returns:
        list: 截取的文件路径列表，包括合并后的文件路径（如果有）。
@ -460,30 +430,19 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        truncate_files = []

-        # 使用 ThreadPoolExecutor 进行多线程处理
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
-            # 提交所有任务并获取未来对象
-            future_to_selection = {executor.submit(truncate_pdf_main, pdf_path, output_folder, selection): selection for
-                                   selection in selections}
-
-            # 逐个获取完成的任务
-            for future in concurrent.futures.as_completed(future_to_selection):
-                selection = future_to_selection[future]
-                try:
-                    files = future.result()
+        for selection in selections:
+            files = truncate_pdf_main(pdf_path, output_folder, selection)
            if files:
-                        # 过滤空字符串
-                        valid_files = [f for f in files if f]
-                        truncate_files.extend(valid_files)
+                if isinstance(files, list):
+                    truncate_files.extend(files)
+                elif isinstance(files, str):
+                    truncate_files.append(files)
            else:
                truncate_files.append("")  # 截取失败时添加空字符串
                print(f"截取 selection {selection} 失败，已添加空字符串。")
-                except Exception as e:
-                    print(f"Selection {selection} generated an exception: {e}")
-                    truncate_files.append("")  # 发生异常时添加空字符串

        if any(f for f in truncate_files if f):  # 检查是否有有效的文件路径
-            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
+            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
            merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
            if merged_result:
                truncate_files.append(merged_result)
@ -510,10 +469,10 @@ if __name__ == "__main__":
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
    # files=truncate_pdf_multiple(input_path,output_folder)
-    selections = [5, 1]  # 仅处理 selection 5、1 和 3
-    files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
-    print(files)
-    # selection = 6 # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
-    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    # print(generated_files)
+    # selections = [5, 1]  # 仅处理 selection 5、1 和 3
+    # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
+    # print(files)
+    selection = 6 # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
+    generated_files = truncate_pdf_main(input_path, output_folder, selection)
+    print(generated_files)
    # print("生成的文件:", generated_files)
--- a/flask_app/testdir/判断截取位置.py
+++ b/flask_app/testdir/判断截取位置.py
@ -26,11 +26,12 @@ target_names = [

 # 定义user_query模板
 def generate_user_query(target, chapters, keywords):
-    template = f"""这是投标文件格式要求的部分，以序号和标题作为投标方，我需要把不同的资格证明材料填充到指定区域，请你根据该文件回答：{target}应该附在该文件哪个地方？以下是可能匹配的章节名称：{', '.join([f"'{chapter}'" for chapter in chapters])}；或者可能匹配的关键字：{', '.join([f"'{kw}'" for kw in keywords])}，你需要根据以上规则确定相关信息所在章节或小节，章节名格式通常是如'三、联合体协议书'这样的序号+标题。现在我需要将{target}贴在该章节的最后面，但是在下一章之前，目前我需要定位到插入的位置，请你返回给我插入位置的上下文，上文是该章节末尾的内容，下文应该是下一章的章节名或开头内容，上下文应该是连续的，字数限制在30字以内，以json格式返回，键名分别是'上文','下文'，上下文格式内容应完全与原文保持一致，不得擅自删减总结，输出格式示例如下：
+    #章节名格式通常是如'三、联合体协议书'这样的序号+标题。现在我需要将{target}贴在该章节的最后面，但是在下一章之前，目前我需要定位到插入的位置，
+    template = f"""这是投标文件格式要求的部分，作为投标方，我需要把不同的资格证明材料填充到指定区域，请你根据该文件回答：{target}应该附在该文件哪个地方？以下是可能匹配的章节名称：{', '.join([f"'{chapter}'" for chapter in chapters])}；或者可能匹配的关键字：{', '.join([f"'{kw}'" for kw in keywords])}，你需要根据以上规则确定{target}需要插入的位置，请你返回给我插入位置的上下文，以便于我定位原文，上文是插入章节或小节末尾的内容，下文应该是下一章节或小节开头的内容，上下文合在一起应该是连续的，字数都限制在20字以内。你的回答以json格式返回，键名分别是'上文','下文'，上下文内容应完全与原文保持一致，不得擅自删减总结，输出格式示例如下：
 {{
-"上文":"上文相关内容 
-测试",
-"下文":"四、下文章节名 
+"上文":"上文测试投标人： （盖单位章） 
+ 年 月 日",
+"下文":"下文测试章节名 
 （招标人名称）：测试"
 }}
 """
@ -42,7 +43,7 @@ def generate_user_query(target, chapters, keywords):
 （招标人名称）：测试"
 }}
    """
-    return template2
+    return template

 # 生成user_query_list
 user_query_list = []
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -3,7 +3,7 @@ import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
 from flask_app.general.format_change import docx2pdf
 from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs
-import concurrent.futures
+
 def clean_page_content(text, common_header):
    # 首先删除抬头公共部分
    if common_header:  # 确保有公共抬头才进行替换
@ -431,7 +431,8 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
            else:
                print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！")
                return ""
-        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
+        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix,
+                                    )
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""
@ -623,25 +624,10 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
 def truncate_pdf_multiple(pdf_path, output_folder):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []
-
-    # 定义要处理的选择范围
-    selections = range(1, 6)
-
-    # 使用 ThreadPoolExecutor 进行多线程处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-        # 提交所有任务并获取未来对象
-        future_to_selection = {executor.submit(truncate_pdf_main, pdf_path, output_folder, selection,output_suffix="default"): selection for
-                               selection in selections}
-
-        # 逐个获取完成的任务
-        for future in concurrent.futures.as_completed(future_to_selection):
-            selection = future_to_selection[future]
-            try:
-                files = future.result()
+    for selection in range(1, 6):
+        files = truncate_pdf_main(pdf_path, output_folder, selection)
        if files:
            truncate_files.extend(files)
-            except Exception as e:
-                print(f"Selection {selection} generated an exception: {e}")

    if truncate_files:
        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
@ -656,12 +642,11 @@ def truncate_pdf_multiple(pdf_path, output_folder):
 #小解析，只需要前三章内容
 def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
    """
-    处理 PDF 文件，选择指定的 selections，并合并结果。
+    处理 PDF 文件，选择 selection 为 4 和 5 的部分，并合并结果。

    Args:
        pdf_path (str): 要处理的 PDF 文件路径。
        output_folder (str): 截取后的文件保存文件夹路径。
-        selections (iterable): 要处理的 selection 列表。

    Returns:
        list: 截取的文件路径列表，包括合并后的文件路径（如果有）。
@ -669,24 +654,13 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []

-    # 使用 ThreadPoolExecutor 进行多线程处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
-        # 提交所有任务并获取未来对象
-        future_to_selection = {executor.submit(truncate_pdf_main, pdf_path, output_folder, selection,output_suffix="default"): selection for
-                               selection in selections}
-
-        # 逐个获取完成的任务
-        for future in concurrent.futures.as_completed(future_to_selection):
-            selection = future_to_selection[future]
-            try:
-                files = future.result()
+    for selection in selections:
+        files = truncate_pdf_main(pdf_path, output_folder, selection)
        if files:
            if isinstance(files, list):
                truncate_files.extend(files)
            elif isinstance(files, str):
                truncate_files.append(files)
-            except Exception as e:
-                print(f"Selection {selection} generated an exception: {e}")

    if truncate_files:
        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
@ -708,9 +682,9 @@ if __name__ == "__main__":
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
-    files = truncate_pdf_multiple(input_path, output_folder)
+    # files = truncate_pdf_multiple(input_path, output_folder)
    # files=truncate_pdf_specific_goods(input_path,output_folder)
-    print(files)
-    # selection = 6# 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-公告
-    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    # print(generated_files)
+    # print(files)
+    selection = 6# 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-公告
+    generated_files = truncate_pdf_main(input_path, output_folder, selection)
+    print(generated_files)