1.3 修改bug
This commit is contained in:
parent
c33febf8a4
commit
14543f9663
@ -1,5 +1,5 @@
|
||||
import os
|
||||
import re
|
||||
from docx.opc.exceptions import PackageNotFoundError
|
||||
from io import BytesIO
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
from docx import Document
|
||||
@ -65,11 +65,16 @@ def insert_mark(input_pdf_path):
|
||||
|
||||
|
||||
def delete_mark(docx_path):
|
||||
"""
|
||||
删除docx文档中的所有标记
|
||||
:param docx_path: docx文件路径
|
||||
"""
|
||||
docx = Document(docx_path)
|
||||
try:
|
||||
docx = Document(docx_path)
|
||||
except KeyError as e:
|
||||
print(f"Error opening document: {e}")
|
||||
return ""
|
||||
except PackageNotFoundError as e:
|
||||
print(f"Invalid package: {e}")
|
||||
return ""
|
||||
|
||||
# 继续处理文档
|
||||
find_flag = False
|
||||
for para in docx.paragraphs:
|
||||
# 匹配标记: [$$index_mark_X$$]
|
||||
@ -79,9 +84,6 @@ def delete_mark(docx_path):
|
||||
if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符
|
||||
para._element.getparent().remove(para._element)
|
||||
find_flag = False
|
||||
|
||||
# 获取文件路径信息
|
||||
import os
|
||||
dir_path = os.path.dirname(docx_path)
|
||||
new_file_path = os.path.join(dir_path, 'invalid_del.docx')
|
||||
|
||||
@ -90,8 +92,12 @@ def delete_mark(docx_path):
|
||||
return new_file_path
|
||||
|
||||
if __name__ == '__main__':
|
||||
input=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf'
|
||||
# input=r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
|
||||
# input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
|
||||
output=insert_mark(input)
|
||||
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx'
|
||||
# delete_mark(doc_path)
|
||||
# output=insert_mark(input)
|
||||
doc_path = r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
|
||||
res=delete_mark(doc_path)
|
||||
if res:
|
||||
print(res)
|
||||
else:
|
||||
print("No")
|
@ -227,6 +227,40 @@ def parse_json_with_duplicates(raw_string):
|
||||
print("未找到有效的 JSON 内容。")
|
||||
return {} # 返回空字典
|
||||
|
||||
def extract_first_json(s):
    """
    Extract the first complete JSON object from a string.

    Scanning starts at the first '{' and tracks brace depth. Braces that
    appear inside double-quoted JSON string literals (including after
    backslash escapes) are ignored, so a value such as "}" does not end the
    object early. If the input ends before the object is closed, the missing
    closing quote (if the cut happened inside a string) and the missing
    closing braces are appended.

    Args:
        s (str): Input string.

    Returns:
        str or None: The extracted (possibly completed) JSON text, or None
        when the input contains no '{'.
    """
    start = s.find('{')
    if start == -1:
        return None

    depth = 0          # number of currently open '{'
    in_string = False  # True while inside a double-quoted string literal
    escaped = False    # True when the previous character was a backslash
    for i in range(start, len(s)):
        char = s[i]
        if in_string:
            # Inside a string only the closing quote and escapes matter;
            # braces here are plain data and must not affect the depth.
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # Found one complete JSON object.
                return s[start:i + 1]

    # The scan began on '{' and never returned, so at least one brace is
    # still open: the object is truncated and must be completed.
    tail = s[start:]
    if in_string:
        if escaped:
            # A dangling backslash would escape the quote we are about to add.
            tail = tail[:-1]
        tail += '"'
    return tail + '}' * depth
|
||||
|
||||
def extract_content_from_json(input_string,flag=False):
|
||||
"""
|
||||
输入字符串,尝试解析 JSON 数据:
|
||||
@ -292,8 +326,19 @@ def extract_content_from_json(input_string,flag=False):
|
||||
return parsed_data # 返回解析后的字典
|
||||
except json.JSONDecodeError:
|
||||
print("方法3(非法转义修复)解析失败。")
|
||||
# 所有方法均失败后,尝试使用 extract_first_json 作为最后手段
|
||||
print("尝试使用 extract_first_json 作为最后手段。")
|
||||
fixed_json_final = extract_first_json(input_string)
|
||||
if fixed_json_final:
|
||||
try:
|
||||
parsed_data = parse_json(fixed_json_final)
|
||||
print("使用 extract_first_json 后解析成功。")
|
||||
return parsed_data
|
||||
except json.JSONDecodeError:
|
||||
print("使用 extract_first_json 后解析失败。")
|
||||
else:
|
||||
print("extract_first_json 未能提取到有效的 JSON。")
|
||||
|
||||
# 如果所有方法都失败,检查字符串长度
|
||||
print("所有修复方法均失败。传入的字符串:")
|
||||
print(input_string)
|
||||
print("-------------------")
|
||||
|
@ -65,9 +65,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
print("yes")
|
||||
except Exception as e:
|
||||
# 捕获异常并打印错误信息
|
||||
invalid_added_docx=pdf2docx(pdf_path)
|
||||
invalid_added_docx=pdf2docx(invalid_path)
|
||||
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
|
||||
|
||||
if not invalid_deleted_docx:
|
||||
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||
merged_baseinfo_path=truncate_files[-1]
|
||||
more_path=[merged_baseinfo_path,tobidders_notice]
|
||||
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
|
||||
|
@ -53,8 +53,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
print("yes")
|
||||
except Exception as e:
|
||||
# 捕获异常并打印错误信息
|
||||
invalid_added_docx=pdf2docx(pdf_path)
|
||||
invalid_added_docx=pdf2docx(invalid_path)
|
||||
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
|
||||
if not invalid_deleted_docx:
|
||||
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||
|
||||
# invalid_docpath = invalid_added_docx # docx截取无效标部分
|
||||
procurement_path = truncate_files[5] # 采购需求
|
||||
|
Loading…
x
Reference in New Issue
Block a user