diff --git a/flask_app/general/clean_pdf.py b/flask_app/general/clean_pdf.py index a374cb7..d1d3cdd 100644 --- a/flask_app/general/clean_pdf.py +++ b/flask_app/general/clean_pdf.py @@ -72,12 +72,12 @@ def clean_page_content(text, common_header): if header_line.strip(): # 只处理非空行 # 替换首次出现的完整行 text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1) - + # 删除文本开头的“第x页”格式的页码 + text = re.sub(r'^第\d+页\s*', '', text) # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除 - text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 - text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 - text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 - text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 + text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 投标人须知这块, 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号 + text = re.sub(r'^\s*\/\s*(共\s*)?\d{1,3}\s*(页)?\s*', '', text) #删除/123 /共123 /共123页 /123页 + text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 return text diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index 3051eda..ced2134 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -259,22 +259,23 @@ if __name__ == '__main__': # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf" # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf" # local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf" - local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf" - downloaded_file=pdf2docx(local_path_in) + # local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf" + # downloaded_file=pdf2docx(local_path_in) # # downloaded_file=pdf2docx(local_path_in) # # downloaded_file=docx2pdf(local_path_in) - print(downloaded_file) - # test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf?Expires=1733410874&OSSAccessKeyId=TMP.3KdesMxbSGgK41BgYKRtckUxdQ2LUh2YMYiH1FnuupFhvRuxMbiuBAsXK9oGvHxaLvsDLQjp3db28cUK5YJSEYTGPnUevo&Signature=EjLj3KeLtj337lS1DEJO56471Tg%3D" - # local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile' - # downloaded = download_file(test_url, local_file_name) - # if not downloaded: - # print("下载文件失败或不支持的文件类型") - # downloaded_filepath, file_type = downloaded - # print(downloaded_filepath) - # print(file_type) - # # 检查文件类型 - # if file_type == 4: - # print("error") + # print(downloaded_file) + + test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%20-%20%E5%89%AF%E6%9C%AC.PDF?Expires=1733478585&OSSAccessKeyId=TMP.3KhfwZc3kpT9TUmsb46yBDdnRq8bbENcEWBbZP8nLMgmSjVkjg9edpTPUQUsH8VXtvvg839Xbm8N5paYxPKvxCGqx3Vx4m&Signature=RYOo7tMEyahaMA3cSsf2kkf8co8%3D" + local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile' + downloaded = download_file(test_url, local_file_name) + if not downloaded: + print("下载文件失败或不支持的文件类型") + downloaded_filepath, file_type = downloaded + print(downloaded_filepath) + print(file_type) + # 检查文件类型 + if file_type == 4: + print("error") diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 5f53b42..c917e10 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -13,7 +13,9 @@ def extract_text_by_page(file_path): text = page.extract_text() if text: cleaned_text = clean_page_content(text,common_header) + # cleaned_text=text print(cleaned_text) + print("-----------------"+str(page_num)) result += cleaned_text # print(f"Page {page_num + 1} Content:\n{cleaned_text}") else: @@ -116,13 +118,14 @@ def save_extracted_text_to_txt(pdf_path, txt_path): if __name__ == '__main__': # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' - file_path=r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf" - # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' + # file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf" + file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" - # ress = extract_common_header(file_path) - # print(ress) + ress = extract_common_header(file_path) + print(ress) + print("-----------------") res=extract_text_by_page(file_path) # print(res)磋商文件_tobidders_notice_part2.pdf # save_extracted_text_to_txt(file_path,"output.txt") \ No newline at end of file diff --git a/flask_app/general/通用功能函数.py b/flask_app/general/通用功能函数.py index a4f48ec..4601962 100644 --- a/flask_app/general/通用功能函数.py +++ b/flask_app/general/通用功能函数.py @@ -7,7 +7,7 @@ from collections import OrderedDict from flask_app.general.json_utils import clean_json_string from flask_app.general.多线程提问 import multi_threading from flask_app.general.通义千问long import upload_file -from flask_app.main.判断是否分包等 import read_questions_from_judge +from flask_app.工程标.判断是否分包等 import read_questions_from_judge def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1): judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) diff --git a/flask_app/old_version/基础信息整合_old.py b/flask_app/old_version/基础信息整合_old.py index e8be244..cd1cf36 100644 --- a/flask_app/old_version/基础信息整合_old.py +++ b/flask_app/old_version/基础信息整合_old.py @@ -1,8 +1,8 @@ import json from flask_app.general.json_utils import clean_json_string, rename_outer_key -from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice -from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge +from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice +from flask_app.工程标.判断是否分包等 import judge_whether_main, read_questions_from_judge from flask_app.general.多线程提问 import read_questions_from_file, multi_threading from flask_app.general.通义千问long import upload_file from flask_app.general.通用功能函数 import judge_consortium_bidding diff --git a/flask_app/old_version/形式响应评审old.py b/flask_app/old_version/形式响应评审old.py index 193dc78..94ef52c 100644 --- a/flask_app/old_version/形式响应评审old.py +++ b/flask_app/old_version/形式响应评审old.py @@ -4,10 +4,10 @@ import json import time from flask_app.general.多线程提问 import multi_threading -from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 +from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.general.json_utils import extract_content_from_json -from flask_app.main.截取pdf import truncate_pdf_main -from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.工程标.截取pdf import truncate_pdf_main +from flask_app.工程标.提取json工程标版 import convert_clause_to_json prompt = """ # 角色 你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。 diff --git a/flask_app/old_version/招标文件解析_old.py b/flask_app/old_version/招标文件解析_old.py index 7bcf816..f0bae9d 100644 --- a/flask_app/old_version/招标文件解析_old.py +++ b/flask_app/old_version/招标文件解析_old.py @@ -3,17 +3,17 @@ import json import logging import time from concurrent.futures import ThreadPoolExecutor -from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.工程标.截取pdf import truncate_pdf_multiple from flask_app.general.table_content_extraction import extract_tables_main from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge -from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.工程标.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values -from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid -from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +from flask_app.工程标.无效标和废标和禁止投标整合 import combine_find_invalid +from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice import concurrent.futures from flask_app.old_version.基础信息整合_old import combine_basic_info from flask_app.old_version.资格审查模块old_old import combine_review_standards -from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards +from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards from flask_app.general.format_change import pdf2docx, docx2pdf from flask_app.general.docx截取docx import copy_docx diff --git a/flask_app/old_version/解析old_old.py b/flask_app/old_version/解析old_old.py index c977194..6aea0b4 100644 --- a/flask_app/old_version/解析old_old.py +++ b/flask_app/old_version/解析old_old.py @@ -3,16 +3,16 @@ import json import logging import time from concurrent.futures import ThreadPoolExecutor -from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.工程标.截取pdf import truncate_pdf_multiple from flask_app.general.table_content_extraction import extract_tables_main -from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.工程标.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values -from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid -from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +from flask_app.工程标.无效标和废标和禁止投标整合 import combine_find_invalid +from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice import concurrent.futures -from flask_app.main.基础信息整合快速版 import combine_basic_info -from flask_app.main.资格审查模块 import combine_review_standards -from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards +from flask_app.工程标.基础信息整合快速版 import combine_basic_info +from flask_app.工程标.资格审查模块 import combine_review_standards +from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx from flask_app.general.docx截取docx import copy_docx diff --git a/flask_app/old_version/资格审查模块old_old.py b/flask_app/old_version/资格审查模块old_old.py index 6f93927..e7936a0 100644 --- a/flask_app/old_version/资格审查模块old_old.py +++ b/flask_app/old_version/资格审查模块old_old.py @@ -1,6 +1,6 @@ import os -from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.工程标.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import extract_content_from_json from flask_app.old_version.形式响应评审old import process_reviews from flask_app.old_version.资格评审old_old import process_qualification diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py index 5aa829b..4ae1822 100644 --- a/flask_app/routes/工程标解析main.py +++ b/flask_app/routes/工程标解析main.py @@ -7,16 +7,16 @@ from concurrent.futures import ThreadPoolExecutor from docx import Document -from flask_app.main.截取pdf import truncate_pdf_multiple +from flask_app.工程标.截取pdf import truncate_pdf_multiple from flask_app.general.merge_pdfs import merge_pdfs -from flask_app.main.提取json工程标版 import convert_clause_to_json +from flask_app.工程标.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values from flask_app.general.无效标和废标公共代码 import combine_find_invalid -from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice +from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice import concurrent.futures -from flask_app.main.基础信息整合快速版 import combine_basic_info -from flask_app.main.资格审查模块 import combine_review_standards -from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards +from flask_app.工程标.基础信息整合快速版 import combine_basic_info +from flask_app.工程标.资格审查模块 import combine_review_standards +from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx diff --git a/flask_app/routes/接口_小解析.py b/flask_app/routes/接口_小解析.py index 6ac15e1..5e1164c 100644 --- a/flask_app/routes/接口_小解析.py +++ b/flask_app/routes/接口_小解析.py @@ -10,7 +10,7 @@ from flask_app.general.多线程提问 import read_questions_from_file, multi_th from flask_app.general.通义千问long import upload_file from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods -from flask_app.main.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main +from flask_app.工程标.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main from flask_app.general.post_processing import inner_post_processing from flask_app.old_version.基础信息整合_old import aggregate_basic_info_engineering diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index 887e579..87fb1d2 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -274,8 +274,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): #TODO:考虑把解析失败的调用豆包,全文上传。 #TODO:重置一下投标文件格式提取那部分的代码 - -#TODO:小解析考虑提速:1:直接pdf转文本,再切分 2.多线程读取每页是否有图片 +#TODO:小解析考虑提速:1:直接pdf转文本,再切分。后期考虑。 #商务标这里改为列表最里层 #good_list 金额 截取上下文 if __name__ == "__main__": diff --git a/flask_app/test_case/test_正则表达式.py b/flask_app/test_case/test_正则表达式.py index d75ea1f..f535a89 100644 --- a/flask_app/test_case/test_正则表达式.py +++ b/flask_app/test_case/test_正则表达式.py @@ -1,20 +1,60 @@ import re # 合并后的正则表达式 -begin_pattern_combined = re.compile('^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?|资格性检查|资格审查|符合性审查)', re.MULTILINE) +begin_pattern = re.compile( + r'(?