12.6 优化解析

2024-12-06 17:34:31 +08:00 · 2024-12-06 17:34:31 +08:00 · e5e63e400b
commit e5e63e400b
parent c771f801db
5 changed files with 29 additions and 7 deletions
--- a/flask_app/general/clean_pdf.py
+++ b/flask_app/general/clean_pdf.py
@@ -76,7 +76,7 @@ def clean_page_content(text, common_header):
    text = re.sub(r'^第\d+页\s*', '', text)
    # 删除页码 eg:89/129   这个代码分三步走可以把89/129完全删除
    text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)  # 删除开头的页码，仅当紧跟非数字字符时  投标人须知这块， 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
-    text = re.sub(r'^\s*\/\s*(共\s*)?\d{1,3}\s*(页)?\s*', '', text)   #删除/123  /共123 /共123页 /123页
+    text = re.sub(r'^\s*\/\s*(共\s*)?\d+\s*(页)?\s*', '', text)   #删除/123  /共123 /共123页 /123页
    text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text)  # 删除形如 '—2—' 或 '-2-' 的页码
    return text
--- a/flask_app/general/投标人须知正文条款提取成json文件.py
+++ b/flask_app/general/投标人须知正文条款提取成json文件.py
@@ -169,10 +169,30 @@ def parse_text_by_heading(text):
                append_newline = True
            else:
                append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, in_special_section)
        elif dot_text_match:
-            # 处理以点号开头但不带数字的情况，存储到临时变量
+            # 处理以点号开头但不带数字的情况，自动生成下一个一级序号
-            temp_title = dot_text_match.group(1).strip()
+            temp_content = dot_text_match.group(1).strip()
            if last_main_number:
                # 生成下一个一级序号
                try:
                    # 尝试将 last_main_number 转为整数并加1
                    next_main_number = str(int(last_main_number) + 1) + '.'
                except ValueError:
                    # 如果 last_main_number 不是纯数字，处理为中文序号或其他逻辑
                    # 这里假设 last_main_number 是阿拉伯数字
                    next_main_number = '未识别的序号.'
            else:
                # 如果没有上一个主编号，默认从 '1.' 开始
                next_main_number = '1.'
            # 更新 current_key 和 last_main_number
            if current_key is not None:
                content_string = ''.join(current_content).strip()
                data[current_key] = data.get(current_key, '') + content_string
            current_key = next_main_number
            current_content = [temp_content]  # 不直接添加 '\n'
            last_main_number = next_main_number.rstrip('.')
            append_newline = True  # 设置标志，下一次内容添加时需要 '\n'
            continue  # 跳过进一步处理该行
        elif pure_number_match:
@@ -245,6 +265,7 @@ def parse_text_by_heading(text):
                        if current_key is not None:
                            content_string = ''.join(current_content).strip()
                            data[current_key] = data.get(current_key, '') + content_string
                            current_content=[]
                        if current_key_chinese is not None:
                            data[current_key_chinese] = current_value_chinese
                            current_key_chinese = None
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@@ -119,7 +119,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
 if __name__ == '__main__':
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
-    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
+    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -219,7 +219,6 @@ def rename_keys(data):
    # 对整个数据结构进行递归重命名
    return rename_keys_recursive(data)
 def combine_and_update_results(original_data, updates):
    """
    先规范化original和updates中的字典，防止空格的情况导致匹配不上无法更新
--- a/flask_app/货物标/提取json货物标版.py
+++ b/flask_app/货物标/提取json货物标版.py
@@ -175,9 +175,11 @@ def process_folder(input_folder, output_folder):
            print(f"Error processing {file_name}: {e}")
 #TODO:招标文件111_tobidders_notice_part2.pdf   陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
 #TODO：19、竞争性磋商响应文件的加密 暂时没处理'19'缺失的情况
 #TODO: .不予受理的情形 ，‘.后面必须跟中文或者空格’
 if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path=r'C:\Users\Administrator\Desktop\货物标\output4\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
+    file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
    try: