12.16
This commit is contained in:
parent
119c7715b5
commit
df1de8b0bc
@ -75,6 +75,6 @@ def convert_pdf_to_markdown(file_path):
|
||||
|
||||
if __name__ == "__main__":
    # Manual run against a local sample PDF: convert it and print whatever
    # convert_pdf_to_markdown returns.
    # file_path=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf"
    # Only one sample path is kept live; an earlier assignment that was
    # immediately overwritten by this one has been dropped as a dead store.
    file_path = r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0516-001-招标文件_procurement.pdf"
    res = convert_pdf_to_markdown(file_path)
    print(res)
|
53
flask_app/general/insert_pagenum.py
Normal file
53
flask_app/general/insert_pagenum.py
Normal file
@ -0,0 +1,53 @@
|
||||
import PyPDF2
|
||||
from reportlab.lib.units import cm
|
||||
from reportlab.pdfgen import canvas
|
||||
from io import BytesIO
|
||||
|
||||
def add_blank_pages_v2(input_pdf_path, output_pdf_path):
    """Copy a PDF, inserting a marker page after every original page.

    For each source page, a new page with the same mediabox width and height
    is generated in memory. It carries the text ``[$$index_mark_N$$]`` (N is
    the 1-based number of the source page) near the top-left corner and is
    appended directly after that source page.

    Args:
        input_pdf_path: Path of the PDF file to read.
        output_pdf_path: Path the interleaved PDF is written to.
    """
    with open(input_pdf_path, 'rb') as source:
        reader = PyPDF2.PdfReader(source)
        writer = PyPDF2.PdfWriter()

        for number, original in enumerate(reader.pages, start=1):
            # Keep the original page first, then follow it with its marker page.
            writer.add_page(original)

            # The marker page mirrors the size of the page it follows.
            width = float(original.mediabox.width)
            height = float(original.mediabox.height)

            # Render the marker page into an in-memory buffer with reportlab.
            buffer = BytesIO()
            sheet = canvas.Canvas(buffer, pagesize=(width, height))
            sheet.setFont("Helvetica", 12)
            # PDF coordinates start at the bottom-left corner, so the top
            # margin (0.5 cm) is subtracted from the page height; the left
            # margin is 2.3 cm.
            sheet.drawString(
                2.3 * cm,
                height - 0.5 * cm,
                f"[$$index_mark_{number}$$]",
            )
            sheet.save()

            # Re-read the rendered buffer with PyPDF2 and append its only page.
            buffer.seek(0)
            marker_page = PyPDF2.PdfReader(buffer).pages[0]
            writer.add_page(marker_page)

        # Write the result while the source file is still open, so the writer
        # can still reach the page data it took from the reader.
        with open(output_pdf_path, 'wb') as target:
            writer.write(target)
|
||||
|
||||
if __name__ == '__main__':
    # Manual run on a local sample file.
    # The variables are named *_path (not `input` / `output`) so the builtin
    # input() is not shadowed at module level.
    input_path = r'C:\Users\Administrator\Downloads\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.pdf'
    output_path = r'C:\Users\Administrator\Downloads\output.pdf'
    add_blank_pages_v2(input_path, output_path)
|
@ -81,7 +81,7 @@ def preprocess_paragraphs(paragraphs):
|
||||
while index < len(paragraphs):
|
||||
current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白
|
||||
# 检查当前段落是否匹配任一排除模式
|
||||
if pattern_numbered.match(current_text) or pattern_parentheses.match(current_text):
|
||||
if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text)<8:
|
||||
# 如果匹配,则跳过当前段落,不添加到processed列表中
|
||||
index += 1
|
||||
continue
|
||||
@ -224,7 +224,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
|
||||
next_text = processed_paragraphs[current_index].strip()
|
||||
if not found_next_number:
|
||||
# 修改后的正则,支持 '数字 、' 格式
|
||||
next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)',
|
||||
next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\d+[))])|(\d+\s*、)',
|
||||
next_text)
|
||||
if next_section_number:
|
||||
found_next_number = True
|
||||
@ -463,46 +463,49 @@ def extract_values_if_contains(data, includes):
|
||||
|
||||
def handle_query(file_path, user_query, output_file, result_key, keywords):
    """Collect keyword-related passages from a document and let the model pick.

    Candidate passages are gathered from the document's paragraphs and tables.
    The first group of candidates (``all_texts1`` + ``all_tables1``) is written
    to ``output_file`` as a numbered list, a full query is built from that file
    and sent to ``doubao_model``, and the 1-based numbers it answers with
    select which candidates are kept. The second group (``all_texts2`` and
    ``all_tables2``) is always kept, whether or not the first group is empty.

    Args:
        file_path: Path of the document passed to the text and table extractors.
        user_query: Query passed to ``generate_full_user_query`` together with
            ``output_file``.
        output_file: Path of the text file the numbered candidates are written to.
        result_key: Key of the returned dict; also used in log output.
        keywords: Keyword pattern forwarded to the extraction helpers.

    Returns:
        ``{result_key: [passages...]}`` when anything was selected (order is
        not defined because a set is used for de-duplication), otherwise
        ``{result_key: ""}``. Any exception is printed and also yields
        ``{result_key: ""}``.
    """
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证:", "我方"]
        # Raw strings keep the regex escapes (\s) intact without relying on
        # Python passing unknown escape sequences through unchanged.
        # NOTE(review): the last two patterns are identical as written here;
        # presumably one was meant to use the full-width colon - confirm.
        follow_up_keywords = [
            r'情\s*形\s*之\s*一',
            r'情\s*况\s*之\s*一',
            r'下\s*列',
            r'以\s*下',
            r'其\s*他\s*情\s*形\s*:',
            r'其\s*他\s*情\s*形\s*:',
        ]

        # Paragraph-based candidates (dict result, then cleaned into two lists).
        extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)

        # Table-based candidates.
        table_data_list = read_tables_from_docx(file_path)
        all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords)

        qianwen_txt = all_texts1 + all_tables1
        selected_contents = set()  # a set de-duplicates the selected passages

        # Only consult the model when there is something to choose from.
        if qianwen_txt:
            with open(output_file, 'w', encoding='utf-8') as file:
                for counter, content in enumerate(qianwen_txt, start=1):
                    file.write(f"{counter}. {content}\n")
                    file.write("..............." + '\n')

            user_query = generate_full_user_query(output_file, user_query)
            model_ans = doubao_model(user_query)
            num_list = process_string_list(model_ans)
            print(result_key + "选中的序号:" + str(num_list))

            for index in num_list:
                # Numbers are 1-based. Rejecting values below 1 prevents a
                # negative list index from wrapping around and silently
                # selecting an item from the end of the list.
                if 1 <= index <= len(qianwen_txt):
                    selected_contents.add(qianwen_txt[index - 1])

        # These are added regardless of whether qianwen_txt was empty.
        selected_contents.update(all_texts2)
        selected_contents.update(all_tables2)

        if selected_contents:
            res = {result_key: list(selected_contents)}
        else:
            res = {result_key: ""}
        return res
    except Exception as e:
        # Best-effort boundary: report the failure and return an empty result.
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: ""}
|
||||
@ -581,8 +584,8 @@ if __name__ == '__main__':
|
||||
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件(实高电子显示屏)_tobidders_notice_part1.docx"
|
||||
# clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json"
|
||||
# doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx"
|
||||
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\140aae9a-0dac-4bba-be57-ea706af069d6\ztbfile_invalid.docx'
|
||||
output_dir = r"C:\Users\Administrator\Desktop\fsdownload\140aae9a-0dac-4bba-be57-ea706af069d6\tmp"
|
||||
doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
|
||||
output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp"
|
||||
results = combine_find_invalid(doc_path, output_dir)
|
||||
end_time = time.time()
|
||||
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
||||
|
@ -812,9 +812,9 @@ if __name__ == "__main__":
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp"
|
||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||
# files = truncate_pdf_multiple(input_path, output_folder,logger)
|
||||
selections = [1,4]
|
||||
files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||
print(files)
|
||||
# selection = 2 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
# print(generated_files)
|
||||
# selections = [1,4]
|
||||
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||
# print(files)
|
||||
selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
print(generated_files)
|
||||
|
@ -64,7 +64,7 @@ if __name__ == "__main__":
|
||||
start_time = time.time()
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
|
||||
# file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_procurement.pdf"
|
||||
procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf"
|
||||
procurement_path = r"C:\Users\Administrator\Downloads\ztbfile_procurement.pdf"
|
||||
procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
|
||||
invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"
|
||||
res = fetch_procurement_reqs(procurement_path, invalid_path)
|
||||
|
@ -16,4 +16,5 @@ openai==1.56.2
|
||||
pathlib==1.0.1
|
||||
alibabacloud_bailian20231229==1.7.0
|
||||
ratelimit==2.2.1
|
||||
waitress~=3.0.0
|
||||
waitress~=3.0.0
|
||||
reportlab==4.2.2
|
Loading…
x
Reference in New Issue
Block a user