From 702b8a39ef45294204ca679f3125cb526b8f10b2 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 22 Jan 2025 16:57:40 +0800 Subject: [PATCH] =?UTF-8?q?1.22=20=E9=99=90=E5=88=B6=E4=BA=86=E6=8B=9B?= =?UTF-8?q?=E6=A0=87=E5=85=AC=E7=A4=BA=E7=BD=91=E5=9D=80=E7=9A=84=E9=95=BF?= =?UTF-8?q?=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/format_change.py | 16 ++++++++-------- .../general/投标人须知正文条款提取成json文件.py | 8 ++++---- flask_app/货物标/截取pdf货物标版.py | 6 +++--- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index f04342a..54a7376 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -303,17 +303,17 @@ if __name__ == '__main__': # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf" # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf" # local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf" - local_path_in = r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\3496bb36-c476-42f0-947e-3e39c295f8bc\ztbfile.docx" + local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\有问题的.doc" # downloaded_file=pdf2docx(local_path_in) # # downloaded_file=pdf2docx(local_path_in) - # # downloaded_file=docx2pdf(local_path_in) - # print(downloaded_file) + downloaded_file=docx2pdf(local_path_in) + print(downloaded_file) - test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3" - local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf' - downloaded = download_file(test_url, local_file_name) - if not downloaded: - print("下载文件失败或不支持的文件类型") + # test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3" + # local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf' + # downloaded = download_file(test_url, local_file_name) + # if not downloaded: + # print("下载文件失败或不支持的文件类型") # downloaded_filepath, file_type = downloaded # print(downloaded_filepath) # print(file_type) diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py index 9f9649c..918f6bd 100644 --- a/flask_app/general/投标人须知正文条款提取成json文件.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -75,7 +75,7 @@ def parse_text_by_heading(text): initial_heading_pattern = None special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:', '雷同认定', - '包括以下内容'] # 定义特殊章节关键词 + '包括以下','包括下列','情形之一','情况之一'] # 定义特殊章节关键词 in_special_section = False # 标志是否在特殊章节中 lines = text.split('\n') @@ -150,7 +150,7 @@ def parse_text_by_heading(text): dot_text_match = re.match(r'^[..、]\s*(\D.+)$', line_stripped) # 匹配不带点号的纯数字开头的情况,例如 '27xxxxx' - pure_number_match = re.match(r'^(\d+)([^.\d)()号条款].*)', line_stripped) # 不允许出现右括号 + pure_number_match = re.match(r'^(\d+)([^.\d)()号条款节章项例页段部步点年月日时分秒个元千万台份家].*)', line_stripped) # [^...]:表示匹配不在 [...] 中的字符集合(即排除这些字符) if match: new_key, line_content = match.groups() @@ -481,11 +481,11 @@ def convert_clause_to_json(file_path, output_folder, type=1): if __name__ == "__main__": - file_path = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp\ztbfile_tobidders_notice_part2.pdf' + file_path = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\test\泉州白濑水利枢纽工程高低压配电装置设计、制造及采购 (1)_tobidders_notice_part2.pdf' # file_path=r'C:\Users\Administrator\Desktop\招标文件-采购类\all\2024-陕西-陕西省某单位2024年执勤化妆服采购项目_tobidders_notice_part2.pdf' # file_path=r'C:\Users\Administrator\Desktop\货物标\output4\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\6.2定版视频会议磋商文件_tobidders_notice_part2.pdf' - output_folder = r'D:\flask_project\flask_app\static\output\output1\ce679bd7-a17a-4a95-bfc9-3801bd4a86e4\tmp' + output_folder = r'C:\Users\Administrator\Desktop\新建文件夹 (3)\test' try: output_path = convert_clause_to_json(file_path, output_folder) print(f"Final JSON result saved to: {output_path}") diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 096c81b..6184c3c 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -319,11 +319,11 @@ if __name__ == "__main__": logger = get_global_logger("123") # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" - pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\ztbfile.pdf" + pdf_path=r"C:\Users\Administrator\Desktop\新建文件夹 (3)\test\泉州白濑水利枢纽工程高低压配电装置设计、制造及采购 (1).pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf" - output_folder = r"C:\Users\Administrator\Desktop\fsdownload\0c80edcc-cc86-4d53-8bd4-78a531446760\tmp" + output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\test" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" - selection = 1 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path + selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger) print(generated_files)