1.21解决bug
This commit is contained in:
parent
366a5b1fdb
commit
e7051ea84e
@ -40,7 +40,6 @@ prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准
|
|||||||
def read_questions_from_file(file_path):
|
def read_questions_from_file(file_path):
|
||||||
questions = []
|
questions = []
|
||||||
current_question = ""
|
current_question = ""
|
||||||
current_number = 0
|
|
||||||
|
|
||||||
with open(file_path, 'r', encoding='utf-8') as file:
|
with open(file_path, 'r', encoding='utf-8') as file:
|
||||||
for line in file:
|
for line in file:
|
||||||
@ -59,8 +58,6 @@ def read_questions_from_file(file_path):
|
|||||||
if current_question:
|
if current_question:
|
||||||
questions.append(current_question.strip())
|
questions.append(current_question.strip())
|
||||||
|
|
||||||
# 开始新的问题
|
|
||||||
current_number = int(match.group(1))
|
|
||||||
# 提取问题内容,去掉编号和点
|
# 提取问题内容,去掉编号和点
|
||||||
current_question = line.split('.', 1)[1].strip() + "\n"
|
current_question = line.split('.', 1)[1].strip() + "\n"
|
||||||
else:
|
else:
|
||||||
|
@ -80,7 +80,7 @@ def get_start_and_common_header(input_path, end_page):
|
|||||||
continue # 如果存在目录,跳过当前页面
|
continue # 如果存在目录,跳过当前页面
|
||||||
if begin_pattern.search(cleaned_text):
|
if begin_pattern.search(cleaned_text):
|
||||||
last_begin_index = i # 更新第一个匹配的索引,页码从0开始
|
last_begin_index = i # 更新第一个匹配的索引,页码从0开始
|
||||||
print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}")
|
# print(f"匹配到起始模式,设置 last_begin_index 为: {last_begin_index}")
|
||||||
return common_header, last_begin_index
|
return common_header, last_begin_index
|
||||||
return common_header, last_begin_index
|
return common_header, last_begin_index
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -273,7 +273,7 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
# 定义结束模式
|
# 定义结束模式
|
||||||
end_patterns = [
|
end_patterns = [
|
||||||
regex.compile(
|
regex.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:格式|文件(?:的)?组成|文件(?:的)?构成|文件(?:的)?编制).*|'
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
|
||||||
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
|
r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
),
|
),
|
||||||
|
@ -1,13 +1,15 @@
|
|||||||
# -*- encoding:utf-8 -*-
|
# -*- encoding:utf-8 -*-
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
import fitz
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
import textwrap
|
import textwrap
|
||||||
from flask_app.general.doubao import read_txt_to_string
|
from flask_app.general.doubao import read_txt_to_string
|
||||||
from flask_app.general.json_utils import clean_json_string
|
from flask_app.general.json_utils import clean_json_string
|
||||||
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
|
from flask_app.general.model_continue_query import continue_answer, process_continue_answers
|
||||||
|
from flask_app.general.截取pdf通用函数 import create_get_text_function
|
||||||
from flask_app.general.通义千问long import upload_file, qianwen_long_stream, qianwen_plus
|
from flask_app.general.通义千问long import upload_file, qianwen_long_stream, qianwen_plus
|
||||||
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
|
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||||||
from flask_app.general.format_change import docx2pdf, pdf2docx
|
from flask_app.general.format_change import docx2pdf, pdf2docx
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
from flask_app.general.doubao import doubao_model
|
from flask_app.general.doubao import doubao_model
|
||||||
@ -18,8 +20,19 @@ def find_exists(truncate_file, required_keys):
|
|||||||
# if not truncate_file:
|
# if not truncate_file:
|
||||||
# return ["技术要求", "商务要求", "服务要求", "其他要求"]
|
# return ["技术要求", "商务要求", "服务要求", "其他要求"]
|
||||||
common_header = extract_common_header(truncate_file) # 假设该函数已定义
|
common_header = extract_common_header(truncate_file) # 假设该函数已定义
|
||||||
pdf_document = PdfReader(truncate_file)
|
try:
|
||||||
|
pdf_document = PdfReader(truncate_file)
|
||||||
|
pdf_lib = 'pypdf2'
|
||||||
|
except Exception as e:
|
||||||
|
print(f"使用 PyPDF2 读取失败,切换到 fitz。错误信息: {e}")
|
||||||
|
pdf_document = fitz.open(truncate_file)
|
||||||
|
pdf_lib = 'fitz'
|
||||||
|
get_text = create_get_text_function(pdf_lib, pdf_document)
|
||||||
|
# 获取总页数
|
||||||
|
if pdf_lib == 'pypdf2':
|
||||||
|
total_pages = len(pdf_document.pages)
|
||||||
|
else: # fitz
|
||||||
|
total_pages = pdf_document.page_count
|
||||||
# 定义正则模式
|
# 定义正则模式
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
||||||
@ -36,13 +49,13 @@ def find_exists(truncate_file, required_keys):
|
|||||||
end_pattern = re.compile(
|
end_pattern = re.compile(
|
||||||
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', re.MULTILINE)
|
||||||
|
|
||||||
# 只处理第一页和最后一页
|
# 处理第一页和最后一页
|
||||||
first_page = pdf_document.pages[0].extract_text() or ""
|
first_page_text = get_text(0)
|
||||||
last_page = pdf_document.pages[-1].extract_text() or ""
|
last_page_text = get_text(total_pages - 1)
|
||||||
|
|
||||||
# 清理页面内容
|
# 清理页面内容
|
||||||
first_page_clean = clean_page_content(first_page, common_header)
|
first_page_clean = clean_page_content(first_page_text, common_header)
|
||||||
last_page_clean = clean_page_content(last_page, common_header)
|
last_page_clean = clean_page_content(last_page_text, common_header)
|
||||||
|
|
||||||
# 在第一页寻找起始位置
|
# 在第一页寻找起始位置
|
||||||
start_match = re.search(begin_pattern, first_page_clean)
|
start_match = re.search(begin_pattern, first_page_clean)
|
||||||
@ -63,9 +76,9 @@ def find_exists(truncate_file, required_keys):
|
|||||||
|
|
||||||
# 获取中间页面的内容
|
# 获取中间页面的内容
|
||||||
middle_content = ""
|
middle_content = ""
|
||||||
if len(pdf_document.pages) > 2:
|
if total_pages > 2:
|
||||||
for page_num in range(1, len(pdf_document.pages) - 1):
|
for page_num in range(1, total_pages - 1):
|
||||||
page_text = pdf_document.pages[page_num].extract_text() or ""
|
page_text = get_text(page_num)
|
||||||
cleaned_text = clean_page_content(page_text, common_header)
|
cleaned_text = clean_page_content(page_text, common_header)
|
||||||
middle_content += cleaned_text + "\n"
|
middle_content += cleaned_text + "\n"
|
||||||
|
|
||||||
|
@ -67,12 +67,12 @@ def combine_basic_info(merged_baseinfo_path, procurement_path,clause_path,invali
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
start_time=time.time()
|
start_time=time.time()
|
||||||
# baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
|
# baseinfo_file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\truncate_all\\ztbfile_merged_baseinfo\\ztbfile_merged_baseinfo_3-31.pdf"
|
||||||
merged_baseinfo_path=r"D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\ztbfile_merged_baseinfo.pdf"
|
merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx"
|
||||||
# procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
|
# procurement_file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b4601ea1-f087-4fa2-88ae-336ad4d8e1e9\\tmp\\ztbfile_procurement.pdf"
|
||||||
clause_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\clause1.json'
|
clause_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\clause1.json'
|
||||||
invalid_path=r'D:\flask_project\flask_app\static\output\output1\3783ce68-1839-4449-97e6-cd07749d8664\invalid_del.docx'
|
invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.docx"
|
||||||
# res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
|
# res = combine_basic_info(merged_baseinfo_path, procurement_file_path,clause_path)
|
||||||
res=combine_basic_info(merged_baseinfo_path,"",clause_path,invalid_path)
|
res=combine_basic_info(merged_baseinfo_path,"","",invalid_path)
|
||||||
print("------------------------------------")
|
print("------------------------------------")
|
||||||
print(json.dumps(res, ensure_ascii=False, indent=4))
|
print(json.dumps(res, ensure_ascii=False, indent=4))
|
||||||
end_time=time.time()
|
end_time=time.time()
|
||||||
|
@ -319,11 +319,11 @@ if __name__ == "__main__":
|
|||||||
logger = get_global_logger("123")
|
logger = get_global_logger("123")
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
|
||||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
|
||||||
pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp\鄂州市急救中心展厅布展项目.pdf"
|
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.pdf"
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||||
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\temp"
|
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\tmp"
|
||||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||||
selection = 3 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
selection = 1 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||||
print(generated_files)
|
print(generated_files)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user