1.17 minor bug fixes
parent 20e297c3ad
commit 28ddae379a
@@ -344,8 +344,10 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
                     continue  # skip blank lines and move on to the next iteration
                 if not found_next_number:
                     # updated regex: also supports the '数字 、' (number + 、) format
-                    next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\s*\d+\s*[))])|(\d+\s*、)',
-                                                   next_text)
+                    number_pattern = (r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)'        # A1  1.1  abc.123
+                                      r'|([((]\s*[一二三四五六七八九十\d]+\s*[))])'   # (1)  (2)
+                                      r'|([一二三四五六七八九十\d]+\s*、)')           # 1、  2、
+                    next_section_number = re.match(number_pattern, next_text)
                     if next_section_number:
                         found_next_number = True
                         if next_section_number.group(1):
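A quick way to sanity-check the widened pattern; this is a minimal standalone sketch, and the sample headings are invented, not taken from the project:

import re

# the widened pattern from this commit: alphanumeric headings, bracketed numbers
# (Arabic or Chinese numerals), and the '1、' / '一、' style
number_pattern = (r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)'
                  r'|([((]\s*[一二三四五六七八九十\d]+\s*[))])'
                  r'|([一二三四五六七八九十\d]+\s*、)')

samples = ['3.2 投标文件的组成', '(1)资格证明文件', '(三) 评审程序', '四、废标条款', '前言']
for text in samples:
    m = re.match(number_pattern, text)
    # report which alternative matched; None means the line is not a numbered heading
    label = None if m is None else ('group1' if m.group(1) else 'group2' if m.group(2) else 'group3')
    print(f'{text!r:30} -> {label}')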
@@ -353,9 +355,9 @@ def extract_text_with_keywords(processed_paragraphs, keywords, follow_up_keyword
                             dynamic_pattern = r'^' + r'[..]'.join(
                                 [r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                         elif next_section_number.group(2):
-                            dynamic_pattern = r'^[\(\(]\s*\d+\s*[\)\)]'
+                            dynamic_pattern = r'^[\(\(]\s*[一二三四五六七八九十\d]+\s*[\)\)]'
                         elif next_section_number.group(3):
-                            dynamic_pattern = r'^\d+\s*、'
+                            dynamic_pattern = r'^[一二三四五六七八九十\d]+\s*、'
                         current_section_pattern = re.compile(dynamic_pattern)
                         if current_section_pattern and re.match(current_section_pattern, next_text):
                             extracted_paragraphs[active_key].append(next_text)
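The per-style follow-up patterns are widened the same way; a minimal check of the two changed branches (sample headings invented):

import re

# group(2) style: bracketed numbering, now accepting Chinese numerals as well
new_bracket = r'^[\(\(]\s*[一二三四五六七八九十\d]+\s*[\)\)]'
# group(3) style: '、' numbering, now accepting Chinese numerals as well
new_dun = r'^[一二三四五六七八九十\d]+\s*、'

print(bool(re.match(new_bracket, '(二)合同条款')))   # True; the old r'^[\(\(]\s*\d+\s*[\)\)]' misses this
print(bool(re.match(new_dun, '五、评分标准')))         # True; the old r'^\d+\s*、' misses this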
@@ -488,17 +490,17 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
     processed_paragraphs = preprocess_paragraphs(doc_contents)
     extracted_contents = extract_text_with_keywords(processed_paragraphs, [keywords], follow_up_keywords)
     all_texts1,all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
+    # print(all_texts2)

     # 1. build the ordered all_text1_items
     all_text1_items = sorted(all_texts1.items(), key=lambda x: x[0])
     # 2. keep only the contents, in that order
     all_texts1_list = [content for (_, content) in all_text1_items]
-    # print(all_texts)
     # Proceed only if there is content to write
     selected_contents = {}
     final_list=[f"未解析到'{result_key}'!"]
     seen_contents = set()  # track already-added content in a set for de-duplication
-    if all_texts1_list:
+    if all_texts1_list or all_texts2:
         with open(output_file, 'w', encoding='utf-8') as file:
             counter = 1
             for content in all_texts1_list:
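The widened guard means the output file is now produced when either collection has content; a tiny illustration of the condition, with invented values:

all_texts1_list = []                       # primary keyword hits: none
all_texts2 = ['否决投标条款见附件']          # follow-up keyword hits: one (invented)

print(bool(all_texts1_list))                  # False: the old `if all_texts1_list:` skipped this case
print(bool(all_texts1_list or all_texts2))    # True:  the new guard still writes the output file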
@@ -511,6 +513,9 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
                     counter += 1

     # generate the user query
+    if not all_texts1_list:
+        num_list = []
+    else:
         user_query = generate_full_user_query(output_file, user_query)
         model_ans = qianwen_plus(user_query)  # model response
         # file_id = upload_file(output_file)
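A minimal sketch of the new control flow: skip the model round-trip when nothing was extracted. generate_full_user_query and qianwen_plus are the project's own helpers; they are stubbed here only so the sketch runs standalone:

def generate_full_user_query(output_file, user_query):    # stub for the project helper
    return user_query

def qianwen_plus(query):                                   # stub for the project helper
    return f'model answer for: {query}'

def answer_or_skip(all_texts1_list, output_file, user_query):
    if not all_texts1_list:
        return []                                          # mirrors `num_list = []`: no LLM call at all
    return qianwen_plus(generate_full_user_query(output_file, user_query))

print(answer_or_skip([], 'out.txt', 'query'))              # [] -> model is never called
print(answer_or_skip(['第3.1条'], 'out.txt', 'query'))      # 'model answer for: query'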
@@ -656,7 +661,7 @@ if __name__ == '__main__':
     output_dir = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\废标"
     # invalid_added = insert_mark(pdf_path)
     # # invalid_added_docx = pdf2docx(invalid_added)
-    invalid_added_docx=r'C:\Users\Administrator\Desktop\新建文件夹 (3)\废标\invalid_added.docx'
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
     results = combine_find_invalid(invalid_added_docx, output_dir)
     end_time = time.time()
     print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
@@ -90,7 +90,7 @@ def read_docx_by_paragraphs(file_path):
         return []

 if __name__ == "__main__":
-    file_path = r'D:\flask_project\flask_app\static\output\output1\2179c978-b638-4332-8bd2-2007ad6f7c9b\ztbfile.docx'
+    file_path = r'C:\Users\Administrator\Desktop\测试信号测试信号.docx'
     read_docx(file_path)  # read line by line

     # paragraphs = read_docx_by_paragraphs(file_path)  # read paragraph by paragraph
|
@@ -315,11 +315,11 @@ if __name__ == "__main__":
     logger = get_global_logger("123")
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
     # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
-    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标\(发布稿)恒丰理财有限责任公司智能投研终端项目竞争性磋商.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\251WA0201146未来大厦LED屏幕采购及安装-自行组织货物类招标文件V1.2.pdf"
     # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
     # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\开评定标"
+    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp"
     # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 4  # e.g.: 1 - announcement, 2 - evaluation method, 3 - qualification review (suffix qualification1 or qualification2, consistent with the evaluation method), 4 - instructions to bidders (front table part1, main text part2), 5 - procurement requirements, 6 - invalid_path
+    selection = 5  # e.g.: 1 - announcement, 2 - evaluation method, 3 - qualification review (suffix qualification1 or qualification2, consistent with the evaluation method), 4 - instructions to bidders (front table part1, main text part2), 5 - procurement requirements, 6 - invalid_path
     generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
     print(generated_files)
|