diff --git a/flask_app/general/insert_del_pagemark.py b/flask_app/general/insert_del_pagemark.py index 8c073bb..3157871 100644 --- a/flask_app/general/insert_del_pagemark.py +++ b/flask_app/general/insert_del_pagemark.py @@ -1,5 +1,5 @@ import os -import re +from docx.opc.exceptions import PackageNotFoundError from io import BytesIO from PyPDF2 import PdfReader, PdfWriter from docx import Document @@ -65,11 +65,16 @@ def insert_mark(input_pdf_path): def delete_mark(docx_path): - """ - 删除docx文档中的所有标记 - :param docx_path: docx文件路径 - """ - docx = Document(docx_path) + try: + docx = Document(docx_path) + except KeyError as e: + print(f"Error opening document: {e}") + return "" + except PackageNotFoundError as e: + print(f"Invalid package: {e}") + return "" + + # 继续处理文档 find_flag = False for para in docx.paragraphs: # 匹配标记: [$$index_mark_X$$] @@ -79,9 +84,6 @@ def delete_mark(docx_path): if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符 para._element.getparent().remove(para._element) find_flag = False - - # 获取文件路径信息 - import os dir_path = os.path.dirname(docx_path) new_file_path = os.path.join(dir_path, 'invalid_del.docx') @@ -90,8 +92,12 @@ def delete_mark(docx_path): return new_file_path if __name__ == '__main__': - input=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf' + # input=r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx' # input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf' - output=insert_mark(input) - # doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx' - # delete_mark(doc_path) \ No newline at end of file + # output=insert_mark(input) + doc_path = r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx' + res=delete_mark(doc_path) + if res: + print(res) + else: + print("No") \ No newline at end of file diff --git a/flask_app/general/json_utils.py b/flask_app/general/json_utils.py index 3e43abc..759caa8 100644 --- a/flask_app/general/json_utils.py +++ b/flask_app/general/json_utils.py @@ -227,6 +227,40 @@ def parse_json_with_duplicates(raw_string): print("未找到有效的 JSON 内容。") return {} # 返回空字典 +def extract_first_json(s): + """ + 从字符串中提取第一个完整的 JSON 对象。如果 JSON 对象不完整,尝试补全缺失的括号。 + + Args: + s (str): 输入字符串。 + + Returns: + str or None: 提取到的 JSON 字符串,或 None 如果未找到。 + """ + start = s.find('{') + if start == -1: + return None + + stack = [] + for i in range(start, len(s)): + char = s[i] + if char == '{': + stack.append('{') + elif char == '}': + if stack: + stack.pop() + if not stack: + # 找到一个完整的 JSON 对象 + return s[start:i+1] + else: + # 多余的右括号,忽略 + pass + # 如果遍历完后栈不为空,补全缺失的右括号 + if stack: + missing = '}' * len(stack) + return s[start:] + missing + return None + def extract_content_from_json(input_string,flag=False): """ 输入字符串,尝试解析 JSON 数据: @@ -292,8 +326,19 @@ def extract_content_from_json(input_string,flag=False): return parsed_data # 返回解析后的字典 except json.JSONDecodeError: print("方法3(非法转义修复)解析失败。") + # 所有方法均失败后,尝试使用 extract_first_json 作为最后手段 + print("尝试使用 extract_first_json 作为最后手段。") + fixed_json_final = extract_first_json(input_string) + if fixed_json_final: + try: + parsed_data = parse_json(fixed_json_final) + print("使用 extract_first_json 后解析成功。") + return parsed_data + except json.JSONDecodeError: + print("使用 extract_first_json 后解析失败。") + else: + print("extract_first_json 未能提取到有效的 JSON。") - # 如果所有方法都失败,检查字符串长度 print("所有修复方法均失败。传入的字符串:") print(input_string) print("-------------------") diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py index 4a1163b..2723d48 100644 --- a/flask_app/routes/工程标解析main.py +++ b/flask_app/routes/工程标解析main.py @@ -65,9 +65,10 @@ def preprocess_files(output_folder, file_path, file_type,logger): print("yes") except Exception as e: # 捕获异常并打印错误信息 - invalid_added_docx=pdf2docx(pdf_path) + invalid_added_docx=pdf2docx(invalid_path) invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path - + if not invalid_deleted_docx: + invalid_deleted_docx=pdf2docx(invalid_path) merged_baseinfo_path=truncate_files[-1] more_path=[merged_baseinfo_path,tobidders_notice] merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf") diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index 5b8f54b..28fd457 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -53,8 +53,10 @@ def preprocess_files(output_folder, file_path, file_type,logger): print("yes") except Exception as e: # 捕获异常并打印错误信息 - invalid_added_docx=pdf2docx(pdf_path) + invalid_added_docx=pdf2docx(invalid_path) invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path + if not invalid_deleted_docx: + invalid_deleted_docx=pdf2docx(invalid_path) # invalid_docpath = invalid_added_docx # docx截取无效标部分 procurement_path = truncate_files[5] # 采购需求