1.21解决bug

This commit is contained in:
zy123 2025-01-21 19:20:26 +08:00
parent 366a5b1fdb
commit e7051ea84e
5 changed files with 32 additions and 22 deletions

View File

@@ -40,7 +40,6 @@ prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准
def read_questions_from_file(file_path): def read_questions_from_file(file_path):
questions = [] questions = []
current_question = "" current_question = ""
current_number = 0
with open(file_path, 'r', encoding='utf-8') as file: with open(file_path, 'r', encoding='utf-8') as file:
for line in file: for line in file:
@@ -59,8 +58,6 @@ def read_questions_from_file(file_path):
if current_question: if current_question:
questions.append(current_question.strip()) questions.append(current_question.strip())
# 开始新的问题
current_number = int(match.group(1))
# 提取问题内容,去掉编号和点 # 提取问题内容,去掉编号和点
current_question = line.split('.', 1)[1].strip() + "\n" current_question = line.split('.', 1)[1].strip() + "\n"
else: else:

View File

@@ -80,7 +80,7 @@ def get_start_and_common_header(input_path, end_page):
continue # 如果存在目录,跳过当前页面 continue # 如果存在目录,跳过当前页面
if begin_pattern.search(cleaned_text): if begin_pattern.search(cleaned_text):
last_begin_index = i # 更新第一个匹配的索引页码从0开始 last_begin_index = i # 更新第一个匹配的索引页码从0开始
print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}") # print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}")
return common_header, last_begin_index return common_header, last_begin_index
return common_header, last_begin_index return common_header, last_begin_index
except Exception as e: except Exception as e:
@@ -273,7 +273,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
# 定义结束模式 # 定义结束模式
end_patterns = [ end_patterns = [
regex.compile( regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|' r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[:]清标报告', r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[:]清标报告',
regex.MULTILINE regex.MULTILINE
), ),

View File

@@ -1,13 +1,15 @@
# -*- encoding:utf-8 -*- # -*- encoding:utf-8 -*-
import json import json
import re import re
import fitz
from PyPDF2 import PdfReader from PyPDF2 import PdfReader
import textwrap import textwrap
from flask_app.general.doubao import read_txt_to_string from flask_app.general.doubao import read_txt_to_string
from flask_app.general.json_utils import clean_json_string from flask_app.general.json_utils import clean_json_string
from flask_app.general.model_continue_query import continue_answer, process_continue_answers from flask_app.general.model_continue_query import continue_answer, process_continue_answers
from flask_app.general.截取pdf通用函数 import create_get_text_function
from flask_app.general.通义千问long import upload_file, qianwen_long_stream, qianwen_plus from flask_app.general.通义千问long import upload_file, qianwen_long_stream, qianwen_plus
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content from flask_app.general.clean_pdf import extract_common_header, clean_page_content
from flask_app.general.format_change import docx2pdf, pdf2docx from flask_app.general.format_change import docx2pdf, pdf2docx
import concurrent.futures import concurrent.futures
from flask_app.general.doubao import doubao_model from flask_app.general.doubao import doubao_model
@@ -18,8 +20,19 @@ def find_exists(truncate_file, required_keys):
# if not truncate_file: # if not truncate_file:
# return ["技术要求", "商务要求", "服务要求", "其他要求"] # return ["技术要求", "商务要求", "服务要求", "其他要求"]
common_header = extract_common_header(truncate_file) # 假设该函数已定义 common_header = extract_common_header(truncate_file) # 假设该函数已定义
pdf_document = PdfReader(truncate_file) try:
pdf_document = PdfReader(truncate_file)
pdf_lib = 'pypdf2'
except Exception as e:
print(f"使用 PyPDF2 读取失败,切换到 fitz。错误信息: {e}")
pdf_document = fitz.open(truncate_file)
pdf_lib = 'fitz'
get_text = create_get_text_function(pdf_lib, pdf_document)
# 获取总页数
if pdf_lib == 'pypdf2':
total_pages = len(pdf_document.pages)
else: # fitz
total_pages = pdf_document.page_count
# 定义正则模式 # 定义正则模式
begin_pattern = re.compile( begin_pattern = re.compile(
r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
@@ -36,13 +49,13 @@ def find_exists(truncate_file, required_keys):
end_pattern = re.compile( end_pattern = re.compile(
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
# 处理第一页和最后一页 # 处理第一页和最后一页
first_page = pdf_document.pages[0].extract_text() or "" first_page_text = get_text(0)
last_page = pdf_document.pages[-1].extract_text() or "" last_page_text = get_text(total_pages - 1)
# 清理页面内容 # 清理页面内容
first_page_clean = clean_page_content(first_page, common_header) first_page_clean = clean_page_content(first_page_text, common_header)
last_page_clean = clean_page_content(last_page, common_header) last_page_clean = clean_page_content(last_page_text, common_header)
# 在第一页寻找起始位置 # 在第一页寻找起始位置
start_match = re.search(begin_pattern, first_page_clean) start_match = re.search(begin_pattern, first_page_clean)
@@ -63,9 +76,9 @@ def find_exists(truncate_file, required_keys):
# 获取中间页面的内容 # 获取中间页面的内容
middle_content = "" middle_content = ""
if len(pdf_document.pages) > 2: if total_pages > 2:
for page_num in range(1, len(pdf_document.pages) - 1): for page_num in range(1, total_pages - 1):
page_text = pdf_document.pages[page_num].extract_text() or "" page_text = get_text(page_num)
cleaned_text = clean_page_content(page_text, common_header) cleaned_text = clean_page_content(page_text, common_header)
middle_content += cleaned_text + "\n" middle_content += cleaned_text + "\n"

View File

@@ -67,12 +67,12 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invali
if __name__ == "__main__": if __name__ == "__main__":
start_time=time.time() start_time=time.time()
# baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf" # baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\ztbfile_merged_baseinfo.pdf" merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx"
# procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf" # procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
clause_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\clause1.json' clause_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\clause1.json'
invalid_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\invalid_del.docx' invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx"
# res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path) # res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
res=combine_basic_info(merged_baseinfo_path,"",clause_path,invalid_path) res=combine_basic_info(merged_baseinfo_path,"","",invalid_path)
print("------------------------------------") print("------------------------------------")
print(json.dumps(res, ensure_ascii=False, indent=4)) print(json.dumps(res, ensure_ascii=False, indent=4))
end_time=time.time() end_time=time.time()

View File

@@ -319,11 +319,11 @@ if __name__ == "__main__":
logger = get_global_logger("123") logger = get_global_logger("123")
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目.pdf" pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp" output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\tmp"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
selection = 3 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path selection = 1 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
print(generated_files) print(generated_files)