From df1de8b0bc780724e1b21ee66b993b3f8acd0c2e Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Mon, 16 Dec 2024 17:39:25 +0800 Subject: [PATCH] 12.16 --- flask_app/general/file2markdown.py | 2 +- flask_app/general/insert_pagenum.py | 53 ++++++++++++++ flask_app/general/无效标和废标公共代码.py | 85 ++++++++++++----------- flask_app/货物标/截取pdf货物标版.py | 12 ++-- flask_app/货物标/提取采购需求main.py | 2 +- requirements.txt | 3 +- 6 files changed, 107 insertions(+), 50 deletions(-) create mode 100644 flask_app/general/insert_pagenum.py diff --git a/flask_app/general/file2markdown.py b/flask_app/general/file2markdown.py index 5732e5a..70e1351 100644 --- a/flask_app/general/file2markdown.py +++ b/flask_app/general/file2markdown.py @@ -75,6 +75,6 @@ def convert_pdf_to_markdown(file_path): if __name__ == "__main__": # file_path=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf" - file_path=r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-云南-云南省森林消防总队昆明支队室内体能训练馆及训练场建设项目训练设施及训练器材采购项目.pdf" + file_path=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0516-001-招标文件_procurement.pdf" res=convert_pdf_to_markdown(file_path) print(res) \ No newline at end of file diff --git a/flask_app/general/insert_pagenum.py b/flask_app/general/insert_pagenum.py new file mode 100644 index 0000000..3284b95 --- /dev/null +++ b/flask_app/general/insert_pagenum.py @@ -0,0 +1,53 @@ +import PyPDF2 +from reportlab.lib.units import cm +from reportlab.pdfgen import canvas +from io import BytesIO + +def add_blank_pages_v2(input_pdf_path, output_pdf_path): + # 打开输入的PDF文件 + with open(input_pdf_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + pdf_writer = PyPDF2.PdfWriter() + + total_pages = len(pdf_reader.pages) + + # 遍历每一页 + for page_num in range(total_pages): + page = pdf_reader.pages[page_num] + pdf_writer.add_page(page) + + # 创建一个内存中的PDF,用于存放带有文本的空白页 + packet = BytesIO() + # 获取当前页面的宽度和高度 + page_width = float(page.mediabox.width) + page_height = float(page.mediabox.height) + # 使用reportlab创建一个新的PDF页面 + c = canvas.Canvas(packet, pagesize=(page_width, page_height)) + + # 计算文本的位置(单位:点,1厘米 ≈ 28.35点) + x_position = 2.3 * cm + y_position = page_height - 0.5 * cm # 从顶部开始计算,因此用页面高度减去上边距 + + # 绘制文本,使用 (page_num + 1) 作为索引 + c.setFont("Helvetica", 12) # 设置字体和大小 + c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]") + + # 完成绘制 + c.save() + + # 将内存中的PDF读入PyPDF2 + packet.seek(0) + new_pdf = PyPDF2.PdfReader(packet) + blank_page = new_pdf.pages[0] + + # 将带有文本的空白页添加到写入器 + pdf_writer.add_page(blank_page) + + # 将所有页面写入输出的PDF文件 + with open(output_pdf_path, 'wb') as output_file: + pdf_writer.write(output_file) + +if __name__ == '__main__': + input=r'C:\Users\Administrator\Downloads\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.pdf' + output=r'C:\Users\Administrator\Downloads\output.pdf' + add_blank_pages_v2(input,output) \ No newline at end of file diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index b2c60de..5e51ace 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -81,7 +81,7 @@ def preprocess_paragraphs(paragraphs): while index < len(paragraphs): current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 # 检查当前段落是否匹配任一排除模式 - if pattern_numbered.match(current_text) or pattern_parentheses.match(current_text): + if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text)<8: # 如果匹配,则跳过当前段落,不添加到processed列表中 index += 1 continue @@ -224,7 +224,7 @@ def extract_text_with_keywords(doc_path, keywords, follow_up_keywords): next_text = processed_paragraphs[current_index].strip() if not found_next_number: # 修改后的正则,支持 '数字 、' 格式 - next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|(\(\d+\))|(\d+\s*、)', + next_section_number = re.match(r'^([A-Za-z0-9]+(?:[..][A-Za-z0-9]+)*)|([((]\d+[))])|(\d+\s*、)', next_text) if next_section_number: found_next_number = True @@ -463,46 +463,49 @@ def extract_values_if_contains(data, includes): def handle_query(file_path, user_query, output_file, result_key, keywords): try: - excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:", "我方"] + excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证:", "我方"] follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下','其\s*他\s*情\s*形\s*:','其\s*他\s*情\s*形\s*:'] extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 字典结果 all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 - # table_data_list=read_docx_last_column(file_path) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息' - table_data_list = read_tables_from_docx(file_path) - all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords) - qianwen_txt = all_texts1 + all_tables1 - # Proceed only if there is content to write - selected_contents = set() # 使用 set 去重 - if qianwen_txt: - with open(output_file, 'w', encoding='utf-8') as file: - counter = 1 - for content in qianwen_txt: - file.write(f"{counter}. {content}\n") - file.write("..............." + '\n') - counter += 1 - user_query = generate_full_user_query(output_file, user_query) - model_ans=doubao_model(user_query) #豆包 - # file_id = upload_file(output_file) - # model_ans = qianwen_long(file_id, user_query) - # model_ans = qianwen_long_text(file_id, user_query) - num_list = process_string_list(model_ans) - print(result_key + "选中的序号:" + str(num_list)) - - for index in num_list: - if index - 1 < len(qianwen_txt): - content = qianwen_txt[index - 1] - selected_contents.add(content) - - # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 - selected_contents.update(all_texts2) - selected_contents.update(all_tables2) - - # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 - if selected_contents: - res = {result_key: list(selected_contents)} - else: - res = {result_key: ""} - return res + print(all_texts1) + print(all_texts2) + # + # # table_data_list=read_docx_last_column(file_path) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息' + # table_data_list = read_tables_from_docx(file_path) + # all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords) + # qianwen_txt = all_texts1 + all_tables1 + # # Proceed only if there is content to write + # selected_contents = set() # 使用 set 去重 + # if qianwen_txt: + # with open(output_file, 'w', encoding='utf-8') as file: + # counter = 1 + # for content in qianwen_txt: + # file.write(f"{counter}. {content}\n") + # file.write("..............." + '\n') + # counter += 1 + # user_query = generate_full_user_query(output_file, user_query) + # model_ans=doubao_model(user_query) #豆包 + # # file_id = upload_file(output_file) + # # model_ans = qianwen_long(file_id, user_query) + # # model_ans = qianwen_long_text(file_id, user_query) + # num_list = process_string_list(model_ans) + # print(result_key + "选中的序号:" + str(num_list)) + # + # for index in num_list: + # if index - 1 < len(qianwen_txt): + # content = qianwen_txt[index - 1] + # selected_contents.add(content) + # + # # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 + # selected_contents.update(all_texts2) + # selected_contents.update(all_tables2) + # + # # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 + # if selected_contents: + # res = {result_key: list(selected_contents)} + # else: + # res = {result_key: ""} + # return res except Exception as e: print(f"handle_query 在处理 {result_key} 时发生异常: {e}") return {result_key: ""} @@ -581,8 +584,8 @@ if __name__ == '__main__': # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件(实高电子显示屏)_tobidders_notice_part1.docx" # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json" # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx" - doc_path = r'C:\Users\Administrator\Desktop\fsdownload\140aae9a-0dac-4bba-be57-ea706af069d6\ztbfile_invalid.docx' - output_dir = r"C:\Users\Administrator\Desktop\fsdownload\140aae9a-0dac-4bba-be57-ea706af069d6\tmp" + doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx' + output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp" results = combine_find_invalid(doc_path, output_dir) end_time = time.time() print("Results:", json.dumps(results, ensure_ascii=False, indent=4)) diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 2467a42..d584149 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -812,9 +812,9 @@ if __name__ == "__main__": output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" # files = truncate_pdf_multiple(input_path, output_folder,logger) - selections = [1,4] - files=truncate_pdf_specific_goods(input_path,output_folder,selections) - print(files) - # selection = 2 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 - # generated_files = truncate_pdf_main(input_path, output_folder, selection) - # print(generated_files) + # selections = [1,4] + # files=truncate_pdf_specific_goods(input_path,output_folder,selections) + # print(files) + selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 + generated_files = truncate_pdf_main(input_path, output_folder, selection) + print(generated_files) diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index f1d25c8..1de4708 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -64,7 +64,7 @@ if __name__ == "__main__": start_time = time.time() output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output" # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_procurement.pdf" - procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf" + procurement_path = r"C:\Users\Administrator\Downloads\ztbfile_procurement.pdf" procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a" invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf" res = fetch_procurement_reqs(procurement_path, invalid_path) diff --git a/requirements.txt b/requirements.txt index e7c5e2c..86b4fad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,5 @@ openai==1.56.2 pathlib==1.0.1 alibabacloud_bailian20231229==1.7.0 ratelimit==2.2.1 -waitress~=3.0.0 \ No newline at end of file +waitress~=3.0.0 +reportlab==4.2.2 \ No newline at end of file