This commit is contained in:
zy123 2024-12-16 17:39:25 +08:00
parent 119c7715b5
commit df1de8b0bc
6 changed files with 107 additions and 50 deletions

View File

@ -75,6 +75,6 @@ def convert_pdf_to_markdown(file_path):
if __name__ == "__main__":
# file_path=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf"
file_path=r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-云南-云南省森林消防总队昆明支队室内体能训练馆及训练场建设项目训练设施及训练器材采购项目.pdf"
file_path=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0516-001-招标文件_procurement.pdf"
res=convert_pdf_to_markdown(file_path)
print(res)

View File

@ -0,0 +1,53 @@
import PyPDF2
from reportlab.lib.units import cm
from reportlab.pdfgen import canvas
from io import BytesIO
def add_blank_pages_v2(input_pdf_path, output_pdf_path):
    """Insert an index-marker page after every page of a PDF.

    Each page of the input is copied to the output, immediately followed by
    a generated page of the same media-box size that is blank except for the
    text ``[$$index_mark_N$$]``, where N is the 1-based number of the page it
    follows.

    Args:
        input_pdf_path: Path of the PDF file to read.
        output_pdf_path: Path the combined PDF is written to; the file is
            opened in ``'wb'`` mode, so existing content is replaced.
    """
    with open(input_pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        pdf_writer = PyPDF2.PdfWriter()

        for page_num, page in enumerate(pdf_reader.pages):
            # Copy the original page first; its marker page follows it.
            pdf_writer.add_page(page)

            # The marker page takes the dimensions of the page it follows.
            page_width = float(page.mediabox.width)
            page_height = float(page.mediabox.height)

            # Render the marker page into an in-memory PDF with reportlab.
            packet = BytesIO()
            c = canvas.Canvas(packet, pagesize=(page_width, page_height))

            # Text position: 2.3 cm from the left edge; the vertical offset
            # is measured from the top, hence page height minus the margin.
            x_position = 2.3 * cm
            y_position = page_height - 0.5 * cm

            # The marker uses (page_num + 1) so numbering starts at 1.
            c.setFont("Helvetica", 12)
            c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]")
            c.save()

            # Re-read the in-memory PDF with PyPDF2 and append its only page.
            packet.seek(0)
            marker_page = PyPDF2.PdfReader(packet).pages[0]
            pdf_writer.add_page(marker_page)

        # Write while the input file is still open, so any page data the
        # writer still needs to pull from the source stream is available.
        with open(output_pdf_path, 'wb') as output_file:
            pdf_writer.write(output_file)
if __name__ == '__main__':
    # Manual smoke run against a local sample file.
    # The variables are named input_pdf/output_pdf so that the builtin
    # input() is not shadowed at module level.
    input_pdf = r'C:\Users\Administrator\Downloads\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.pdf'
    output_pdf = r'C:\Users\Administrator\Downloads\output.pdf'
    add_blank_pages_v2(input_pdf, output_pdf)

View File

@ -81,7 +81,7 @@ def preprocess_paragraphs(paragraphs):
while index < len(paragraphs):
current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白
# 检查当前段落是否匹配任一排除模式
if pattern_numbered.match(current_text) or pattern_parentheses.match(current_text):
if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text)<8:
# 如果匹配则跳过当前段落不添加到processed列表中
index += 1
continue
@ -224,7 +224,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
next_text = processed_paragraphs[current_index].strip()
if not found_next_number:
# 修改后的正则,支持 '数字 、' 格式
next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)',
next_section_number = re.match(r'^([A-Za-z0-9]+(?:[.][A-Za-z0-9]+)*)|([(]\d+[)])|(\d+\s*、)',
next_text)
if next_section_number:
found_next_number = True
@ -463,46 +463,49 @@ def extract_values_if_contains(data, includes):
def handle_query(file_path, user_query, output_file, result_key, keywords):
try:
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:", "我方"]
excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证:", "我方"]
follow_up_keywords = [r'\s*形\s*之\s*一', r'\s*况\s*之\s*一', r'\s*列', r'\s*下','\s*他\s*情\s*形\s*','\s*他\s*情\s*形\s*:']
extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 字典结果
all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表
# table_data_list=read_docx_last_column(file_path) #从投标人须知前附表中提取信息生成列表data每个元素为'一行信息'
table_data_list = read_tables_from_docx(file_path)
all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords)
qianwen_txt = all_texts1 + all_tables1
# Proceed only if there is content to write
selected_contents = set() # 使用 set 去重
if qianwen_txt:
with open(output_file, 'w', encoding='utf-8') as file:
counter = 1
for content in qianwen_txt:
file.write(f"{counter}. {content}\n")
file.write("..............." + '\n')
counter += 1
user_query = generate_full_user_query(output_file, user_query)
model_ans=doubao_model(user_query) #豆包
# file_id = upload_file(output_file)
# model_ans = qianwen_long(file_id, user_query)
# model_ans = qianwen_long_text(file_id, user_query)
num_list = process_string_list(model_ans)
print(result_key + "选中的序号:" + str(num_list))
for index in num_list:
if index - 1 < len(qianwen_txt):
content = qianwen_txt[index - 1]
selected_contents.add(content)
# 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容
selected_contents.update(all_texts2)
selected_contents.update(all_tables2)
# 如果 selected_contents 不为空,则返回结果,否则返回空字符串
if selected_contents:
res = {result_key: list(selected_contents)}
else:
res = {result_key: ""}
return res
print(all_texts1)
print(all_texts2)
#
# # table_data_list=read_docx_last_column(file_path) #从投标人须知前附表中提取信息生成列表data每个元素为'一行信息'
# table_data_list = read_tables_from_docx(file_path)
# all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords)
# qianwen_txt = all_texts1 + all_tables1
# # Proceed only if there is content to write
# selected_contents = set() # 使用 set 去重
# if qianwen_txt:
# with open(output_file, 'w', encoding='utf-8') as file:
# counter = 1
# for content in qianwen_txt:
# file.write(f"{counter}. {content}\n")
# file.write("..............." + '\n')
# counter += 1
# user_query = generate_full_user_query(output_file, user_query)
# model_ans=doubao_model(user_query) #豆包
# # file_id = upload_file(output_file)
# # model_ans = qianwen_long(file_id, user_query)
# # model_ans = qianwen_long_text(file_id, user_query)
# num_list = process_string_list(model_ans)
# print(result_key + "选中的序号:" + str(num_list))
#
# for index in num_list:
# if index - 1 < len(qianwen_txt):
# content = qianwen_txt[index - 1]
# selected_contents.add(content)
#
# # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容
# selected_contents.update(all_texts2)
# selected_contents.update(all_tables2)
#
# # 如果 selected_contents 不为空,则返回结果,否则返回空字符串
# if selected_contents:
# res = {result_key: list(selected_contents)}
# else:
# res = {result_key: ""}
# return res
except Exception as e:
print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
return {result_key: ""}
@ -581,8 +584,8 @@ if __name__ == '__main__':
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件实高电子显示屏_tobidders_notice_part1.docx"
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\140aae9a-0dac-4bba-be57-ea706af069d6\ztbfile_invalid.docx'
output_dir = r"C:\Users\Administrator\Desktop\fsdownload\140aae9a-0dac-4bba-be57-ea706af069d6\tmp"
doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp"
results = combine_find_invalid(doc_path, output_dir)
end_time = time.time()
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))

View File

@ -812,9 +812,9 @@ if __name__ == "__main__":
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
# files = truncate_pdf_multiple(input_path, output_folder,logger)
selections = [1,4]
files=truncate_pdf_specific_goods(input_path,output_folder,selections)
print(files)
# selection = 2 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files)
selection = 4 # 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files)

View File

@ -64,7 +64,7 @@ if __name__ == "__main__":
start_time = time.time()
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
# file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf"
procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf"
procurement_path = r"C:\Users\Administrator\Downloads\ztbfile_procurement.pdf"
procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"
res = fetch_procurement_reqs(procurement_path, invalid_path)

View File

@ -16,4 +16,5 @@ openai==1.56.2
pathlib==1.0.1
alibabacloud_bailian20231229==1.7.0
ratelimit==2.2.1
waitress~=3.0.0
waitress~=3.0.0
reportlab==4.2.2