1.3 修改bug

This commit is contained in:
zy123 2025-01-03 10:45:32 +08:00
parent c33febf8a4
commit 14543f9663
4 changed files with 71 additions and 17 deletions

View File

@ -1,5 +1,5 @@
import os import os
import re from docx.opc.exceptions import PackageNotFoundError
from io import BytesIO from io import BytesIO
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
from docx import Document from docx import Document
@ -65,11 +65,16 @@ def insert_mark(input_pdf_path):
def delete_mark(docx_path): def delete_mark(docx_path):
""" try:
删除docx文档中的所有标记
:param docx_path: docx文件路径
"""
docx = Document(docx_path) docx = Document(docx_path)
except KeyError as e:
print(f"Error opening document: {e}")
return ""
except PackageNotFoundError as e:
print(f"Invalid package: {e}")
return ""
# 继续处理文档
find_flag = False find_flag = False
for para in docx.paragraphs: for para in docx.paragraphs:
# 匹配标记: [$$index_mark_X$$] # 匹配标记: [$$index_mark_X$$]
@ -79,9 +84,6 @@ def delete_mark(docx_path):
if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符 if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符
para._element.getparent().remove(para._element) para._element.getparent().remove(para._element)
find_flag = False find_flag = False
# 获取文件路径信息
import os
dir_path = os.path.dirname(docx_path) dir_path = os.path.dirname(docx_path)
new_file_path = os.path.join(dir_path, 'invalid_del.docx') new_file_path = os.path.join(dir_path, 'invalid_del.docx')
@ -90,8 +92,12 @@ def delete_mark(docx_path):
return new_file_path return new_file_path
if __name__ == '__main__': if __name__ == '__main__':
input=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf' # input=r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
# input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf' # input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
output=insert_mark(input) # output=insert_mark(input)
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx' doc_path = r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
# delete_mark(doc_path) res=delete_mark(doc_path)
if res:
print(res)
else:
print("No")

View File

@ -227,6 +227,40 @@ def parse_json_with_duplicates(raw_string):
print("未找到有效的 JSON 内容。") print("未找到有效的 JSON 内容。")
return {} # 返回空字典 return {} # 返回空字典
def extract_first_json(s):
    """
    Extract the first JSON object from a string, repairing it if truncated.

    Scanning starts at the first '{' and tracks brace nesting depth. Braces
    that appear inside double-quoted strings are ignored (backslash escapes
    such as \\" are honoured), so a value like "}" does not end the object
    early. If the input ends before the object is closed, the missing
    closers are appended: an unterminated string is closed with '"' first,
    then one '}' is added per still-open brace.

    Only double-quoted strings are recognised, as in standard JSON.

    Args:
        s (str): Input string that may contain a JSON object.

    Returns:
        str or None: The extracted (possibly completed) JSON text, or None
        when the input contains no '{' at all.
    """
    start = s.find('{')
    if start == -1:
        return None

    fragment = s[start:]
    depth = 0          # number of currently open '{'
    in_string = False  # inside a double-quoted string literal
    escaped = False    # previous character was a backslash inside a string

    for offset, char in enumerate(fragment):
        if in_string:
            # Inside a string only escapes and the closing quote matter;
            # braces here are plain data, not structure.
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # Found one complete, balanced JSON object.
                return fragment[:offset + 1]

    # The scan began on '{' and returns as soon as depth reaches 0, so
    # reaching this point means the object is still open (depth >= 1).
    if in_string:
        if escaped:
            # Drop a dangling backslash so the added quote is not escaped.
            fragment = fragment[:-1]
        fragment += '"'
    return fragment + '}' * depth
def extract_content_from_json(input_string,flag=False): def extract_content_from_json(input_string,flag=False):
""" """
输入字符串尝试解析 JSON 数据 输入字符串尝试解析 JSON 数据
@ -292,8 +326,19 @@ def extract_content_from_json(input_string,flag=False):
return parsed_data # 返回解析后的字典 return parsed_data # 返回解析后的字典
except json.JSONDecodeError: except json.JSONDecodeError:
print("方法3非法转义修复解析失败。") print("方法3非法转义修复解析失败。")
# 所有方法均失败后,尝试使用 extract_first_json 作为最后手段
print("尝试使用 extract_first_json 作为最后手段。")
fixed_json_final = extract_first_json(input_string)
if fixed_json_final:
try:
parsed_data = parse_json(fixed_json_final)
print("使用 extract_first_json 后解析成功。")
return parsed_data
except json.JSONDecodeError:
print("使用 extract_first_json 后解析失败。")
else:
print("extract_first_json 未能提取到有效的 JSON。")
# 如果所有方法都失败,检查字符串长度
print("所有修复方法均失败。传入的字符串:") print("所有修复方法均失败。传入的字符串:")
print(input_string) print(input_string)
print("-------------------") print("-------------------")

View File

@ -65,9 +65,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
print("yes") print("yes")
except Exception as e: except Exception as e:
# 捕获异常并打印错误信息 # 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(pdf_path) invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
merged_baseinfo_path=truncate_files[-1] merged_baseinfo_path=truncate_files[-1]
more_path=[merged_baseinfo_path,tobidders_notice] more_path=[merged_baseinfo_path,tobidders_notice]
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf") merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")

View File

@ -53,8 +53,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
print("yes") print("yes")
except Exception as e: except Exception as e:
# 捕获异常并打印错误信息 # 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(pdf_path) invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
# invalid_docpath = invalid_added_docx # docx截取无效标部分 # invalid_docpath = invalid_added_docx # docx截取无效标部分
procurement_path = truncate_files[5] # 采购需求 procurement_path = truncate_files[5] # 采购需求