1.3 修改bug

This commit is contained in:
zy123 2025-01-03 10:45:32 +08:00
parent c33febf8a4
commit 14543f9663
4 changed files with 71 additions and 17 deletions

View File

@ -1,5 +1,5 @@
import os
import re
from docx.opc.exceptions import PackageNotFoundError
from io import BytesIO
from PyPDF2 import PdfReader, PdfWriter
from docx import Document
@ -65,11 +65,16 @@ def insert_mark(input_pdf_path):
def delete_mark(docx_path):
"""
删除docx文档中的所有标记
:param docx_path: docx文件路径
"""
docx = Document(docx_path)
try:
docx = Document(docx_path)
except KeyError as e:
print(f"Error opening document: {e}")
return ""
except PackageNotFoundError as e:
print(f"Invalid package: {e}")
return ""
# 继续处理文档
find_flag = False
for para in docx.paragraphs:
# 匹配标记: [$$index_mark_X$$]
@ -79,9 +84,6 @@ def delete_mark(docx_path):
if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符
para._element.getparent().remove(para._element)
find_flag = False
# 获取文件路径信息
import os
dir_path = os.path.dirname(docx_path)
new_file_path = os.path.join(dir_path, 'invalid_del.docx')
@ -90,8 +92,12 @@ def delete_mark(docx_path):
return new_file_path
if __name__ == '__main__':
input=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf'
# input=r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
# input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
output=insert_mark(input)
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx'
# delete_mark(doc_path)
# output=insert_mark(input)
doc_path = r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
res=delete_mark(doc_path)
if res:
print(res)
else:
print("No")

View File

@ -227,6 +227,40 @@ def parse_json_with_duplicates(raw_string):
print("未找到有效的 JSON 内容。")
return {} # 返回空字典
def extract_first_json(s):
    """
    Extract the first JSON object from a string, repairing truncation.

    Scanning starts at the first '{' and tracks brace nesting. Braces that
    appear inside JSON string literals (including after escaped quotes such
    as \\") are ignored, so a value like "}" does not end the object early.

    If the input ends before the object is closed, the result is completed
    on a best-effort basis: an unterminated string literal is closed first,
    then the missing '}' characters are appended. Unclosed arrays and
    partially written escape sequences other than a lone trailing backslash
    are not repaired.

    Args:
        s (str): Input string that may contain a JSON object.

    Returns:
        str or None: The extracted (possibly completed) JSON text, or None
        if the string contains no '{'.
    """
    start = s.find('{')
    if start == -1:
        return None

    depth = 0          # number of currently open '{'
    in_string = False  # True while inside a "..." literal
    escaped = False    # True when the previous char was a backslash in a string

    for i in range(start, len(s)):
        char = s[i]

        if in_string:
            # Inside a string only the closing quote matters; braces are data.
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # Found a complete JSON object.
                return s[start:i + 1]

    # The scan began on '{' and never balanced, so depth >= 1 here:
    # the object is truncated and must be completed.
    tail = s[start:]
    if in_string:
        if escaped:
            # A dangling backslash would escape the quote we are about to add.
            tail = tail[:-1]
        tail += '"'
    return tail + '}' * depth
def extract_content_from_json(input_string,flag=False):
"""
输入字符串尝试解析 JSON 数据
@ -292,8 +326,19 @@ def extract_content_from_json(input_string,flag=False):
return parsed_data # 返回解析后的字典
except json.JSONDecodeError:
print("方法3非法转义修复解析失败。")
# 所有方法均失败后,尝试使用 extract_first_json 作为最后手段
print("尝试使用 extract_first_json 作为最后手段。")
fixed_json_final = extract_first_json(input_string)
if fixed_json_final:
try:
parsed_data = parse_json(fixed_json_final)
print("使用 extract_first_json 后解析成功。")
return parsed_data
except json.JSONDecodeError:
print("使用 extract_first_json 后解析失败。")
else:
print("extract_first_json 未能提取到有效的 JSON。")
# 如果所有方法都失败,检查字符串长度
print("所有修复方法均失败。传入的字符串:")
print(input_string)
print("-------------------")

View File

@ -65,9 +65,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(pdf_path)
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
merged_baseinfo_path=truncate_files[-1]
more_path=[merged_baseinfo_path,tobidders_notice]
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")

View File

@ -53,8 +53,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
print("yes")
except Exception as e:
# 捕获异常并打印错误信息
invalid_added_docx=pdf2docx(pdf_path)
invalid_added_docx=pdf2docx(invalid_path)
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
if not invalid_deleted_docx:
invalid_deleted_docx=pdf2docx(invalid_path)
# invalid_docpath = invalid_added_docx # docx截取无效标部分
procurement_path = truncate_files[5] # 采购需求