1.3 修改bug
This commit is contained in:
parent
c33febf8a4
commit
14543f9663
@ -1,5 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
from docx.opc.exceptions import PackageNotFoundError
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from PyPDF2 import PdfReader, PdfWriter
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
from docx import Document
|
from docx import Document
|
||||||
@ -65,11 +65,16 @@ def insert_mark(input_pdf_path):
|
|||||||
|
|
||||||
|
|
||||||
def delete_mark(docx_path):
|
def delete_mark(docx_path):
|
||||||
"""
|
try:
|
||||||
删除docx文档中的所有标记
|
docx = Document(docx_path)
|
||||||
:param docx_path: docx文件路径
|
except KeyError as e:
|
||||||
"""
|
print(f"Error opening document: {e}")
|
||||||
docx = Document(docx_path)
|
return ""
|
||||||
|
except PackageNotFoundError as e:
|
||||||
|
print(f"Invalid package: {e}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# 继续处理文档
|
||||||
find_flag = False
|
find_flag = False
|
||||||
for para in docx.paragraphs:
|
for para in docx.paragraphs:
|
||||||
# 匹配标记: [$$index_mark_X$$]
|
# 匹配标记: [$$index_mark_X$$]
|
||||||
@ -79,9 +84,6 @@ def delete_mark(docx_path):
|
|||||||
if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符
|
if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符
|
||||||
para._element.getparent().remove(para._element)
|
para._element.getparent().remove(para._element)
|
||||||
find_flag = False
|
find_flag = False
|
||||||
|
|
||||||
# 获取文件路径信息
|
|
||||||
import os
|
|
||||||
dir_path = os.path.dirname(docx_path)
|
dir_path = os.path.dirname(docx_path)
|
||||||
new_file_path = os.path.join(dir_path, 'invalid_del.docx')
|
new_file_path = os.path.join(dir_path, 'invalid_del.docx')
|
||||||
|
|
||||||
@ -90,8 +92,12 @@ def delete_mark(docx_path):
|
|||||||
return new_file_path
|
return new_file_path
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
input=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf'
|
# input=r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
|
||||||
# input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
|
# input=r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
|
||||||
output=insert_mark(input)
|
# output=insert_mark(input)
|
||||||
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx'
|
doc_path = r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
|
||||||
# delete_mark(doc_path)
|
res=delete_mark(doc_path)
|
||||||
|
if res:
|
||||||
|
print(res)
|
||||||
|
else:
|
||||||
|
print("No")
|
@ -227,6 +227,40 @@ def parse_json_with_duplicates(raw_string):
|
|||||||
print("未找到有效的 JSON 内容。")
|
print("未找到有效的 JSON 内容。")
|
||||||
return {} # 返回空字典
|
return {} # 返回空字典
|
||||||
|
|
||||||
|
def extract_first_json(s):
    """
    Extract the first complete JSON object from a string.

    Scanning starts at the first '{' and tracks brace depth until the
    matching '}' is found. Braces that occur inside double-quoted JSON
    string literals (including after backslash escapes such as \\") are
    treated as plain text and do not affect the depth.

    If the input ends before the object is closed, the text is repaired on a
    best-effort basis: an unterminated string literal is closed with '"' and
    one '}' is appended for every object that is still open. The repaired
    text is not guaranteed to be valid JSON; callers should still parse it.

    Args:
        s (str): Input string that may contain a JSON object.

    Returns:
        str or None: The extracted (possibly repaired) JSON text, or None
        when the input contains no '{' at all.
    """
    start = s.find('{')
    if start == -1:
        return None

    depth = 0          # number of currently open '{'
    in_string = False  # True while inside a "..." literal
    escaped = False    # True right after a backslash inside a literal

    for i, char in enumerate(s[start:], start):
        if in_string:
            # Inside a literal only the closing quote and escapes matter;
            # braces here are data, not structure.
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # Found the brace that closes the outermost object.
                return s[start:i + 1]

    # The input ended while at least one object was still open (depth >= 1
    # here, because the scan began on a '{'): repair the tail.
    tail = s[start:]
    if in_string:
        if escaped:
            # Drop a dangling backslash so it cannot escape the quote we add.
            tail = tail[:-1]
        tail += '"'
    return tail + '}' * depth
|
||||||
|
|
||||||
def extract_content_from_json(input_string,flag=False):
|
def extract_content_from_json(input_string,flag=False):
|
||||||
"""
|
"""
|
||||||
输入字符串,尝试解析 JSON 数据:
|
输入字符串,尝试解析 JSON 数据:
|
||||||
@ -292,8 +326,19 @@ def extract_content_from_json(input_string,flag=False):
|
|||||||
return parsed_data # 返回解析后的字典
|
return parsed_data # 返回解析后的字典
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
print("方法3(非法转义修复)解析失败。")
|
print("方法3(非法转义修复)解析失败。")
|
||||||
|
# 所有方法均失败后,尝试使用 extract_first_json 作为最后手段
|
||||||
|
print("尝试使用 extract_first_json 作为最后手段。")
|
||||||
|
fixed_json_final = extract_first_json(input_string)
|
||||||
|
if fixed_json_final:
|
||||||
|
try:
|
||||||
|
parsed_data = parse_json(fixed_json_final)
|
||||||
|
print("使用 extract_first_json 后解析成功。")
|
||||||
|
return parsed_data
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
print("使用 extract_first_json 后解析失败。")
|
||||||
|
else:
|
||||||
|
print("extract_first_json 未能提取到有效的 JSON。")
|
||||||
|
|
||||||
# 如果所有方法都失败,检查字符串长度
|
|
||||||
print("所有修复方法均失败。传入的字符串:")
|
print("所有修复方法均失败。传入的字符串:")
|
||||||
print(input_string)
|
print(input_string)
|
||||||
print("-------------------")
|
print("-------------------")
|
||||||
|
@ -65,9 +65,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
|||||||
print("yes")
|
print("yes")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# 捕获异常并打印错误信息
|
# 捕获异常并打印错误信息
|
||||||
invalid_added_docx=pdf2docx(pdf_path)
|
invalid_added_docx=pdf2docx(invalid_path)
|
||||||
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
|
invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path
|
||||||
|
if not invalid_deleted_docx:
|
||||||
|
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||||
merged_baseinfo_path=truncate_files[-1]
|
merged_baseinfo_path=truncate_files[-1]
|
||||||
more_path=[merged_baseinfo_path,tobidders_notice]
|
more_path=[merged_baseinfo_path,tobidders_notice]
|
||||||
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
|
merged_baseinfo_path_more=os.path.join(output_folder,"merged_baseinfo_path_more.pdf")
|
||||||
|
@ -53,8 +53,10 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
|||||||
print("yes")
|
print("yes")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# 捕获异常并打印错误信息
|
# 捕获异常并打印错误信息
|
||||||
invalid_added_docx=pdf2docx(pdf_path)
|
invalid_added_docx=pdf2docx(invalid_path)
|
||||||
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
|
invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path
|
||||||
|
if not invalid_deleted_docx:
|
||||||
|
invalid_deleted_docx=pdf2docx(invalid_path)
|
||||||
|
|
||||||
# invalid_docpath = invalid_added_docx # docx截取无效标部分
|
# invalid_docpath = invalid_added_docx # docx截取无效标部分
|
||||||
procurement_path = truncate_files[5] # 采购需求
|
procurement_path = truncate_files[5] # 采购需求
|
||||||
|
Loading…
x
Reference in New Issue
Block a user