12.6 优化解析

This commit is contained in:
zy123 2024-12-06 14:40:22 +08:00
parent e0ea02544d
commit d2090e7fd6
30 changed files with 136 additions and 140 deletions

View File

@ -72,12 +72,12 @@ def clean_page_content(text, common_header):
if header_line.strip(): # 只处理非空行 if header_line.strip(): # 只处理非空行
# 替换首次出现的完整行 # 替换首次出现的完整行
text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1) text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
# 删除文本开头的“第x页”格式的页码
text = re.sub(r'^第\d+页\s*', '', text)
# 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除 # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 投标人须知这块, 页码和顶部序号混在一起的时候也会把序号给去除了。'2018.' 20为页码 18.为序号
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 text = re.sub(r'^\s*\/\s*(共\s*)?\d{1,3}\s*(页)?\s*', '', text) #删除/123 /共123 /共123页 /123页
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 text = re.sub(r'^\s*[—-]\s*\d{1,3}\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
return text return text

View File

@ -259,22 +259,23 @@ if __name__ == '__main__':
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf" # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf" # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf" # local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf" # local_path_in = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf"
downloaded_file=pdf2docx(local_path_in) # downloaded_file=pdf2docx(local_path_in)
# # downloaded_file=pdf2docx(local_path_in) # # downloaded_file=pdf2docx(local_path_in)
# # downloaded_file=docx2pdf(local_path_in) # # downloaded_file=docx2pdf(local_path_in)
print(downloaded_file) # print(downloaded_file)
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf?Expires=1733410874&OSSAccessKeyId=TMP.3KdesMxbSGgK41BgYKRtckUxdQ2LUh2YMYiH1FnuupFhvRuxMbiuBAsXK9oGvHxaLvsDLQjp3db28cUK5YJSEYTGPnUevo&Signature=EjLj3KeLtj337lS1DEJO56471Tg%3D"
# local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile' test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/094%E5%AE%9A%E7%A8%BF-%E6%B9%96%E5%8C%97%E5%B7%A5%E4%B8%9A%E5%A4%A7%E5%AD%A6%E8%BD%BB%E6%AD%A6%E5%99%A8%E6%A8%A1%E6%8B%9F%E5%B0%84%E5%87%BB%E8%AE%BE%E5%A4%87%E9%87%87%E8%B4%AD%E9%A1%B9%E7%9B%AE%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6%20-%20%E5%89%AF%E6%9C%AC.PDF?Expires=1733478585&OSSAccessKeyId=TMP.3KhfwZc3kpT9TUmsb46yBDdnRq8bbENcEWBbZP8nLMgmSjVkjg9edpTPUQUsH8VXtvvg839Xbm8N5paYxPKvxCGqx3Vx4m&Signature=RYOo7tMEyahaMA3cSsf2kkf8co8%3D"
# downloaded = download_file(test_url, local_file_name) local_file_name = r'C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile'
# if not downloaded: downloaded = download_file(test_url, local_file_name)
# print("下载文件失败或不支持的文件类型") if not downloaded:
# downloaded_filepath, file_type = downloaded print("下载文件失败或不支持的文件类型")
# print(downloaded_filepath) downloaded_filepath, file_type = downloaded
# print(file_type) print(downloaded_filepath)
# # 检查文件类型 print(file_type)
# if file_type == 4: # 检查文件类型
# print("error") if file_type == 4:
print("error")

View File

@ -13,7 +13,9 @@ def extract_text_by_page(file_path):
text = page.extract_text() text = page.extract_text()
if text: if text:
cleaned_text = clean_page_content(text,common_header) cleaned_text = clean_page_content(text,common_header)
# cleaned_text=text
print(cleaned_text) print(cleaned_text)
print("-----------------"+str(page_num))
result += cleaned_text result += cleaned_text
# print(f"Page {page_num + 1} Content:\n{cleaned_text}") # print(f"Page {page_num + 1} Content:\n{cleaned_text}")
else: else:
@ -116,13 +118,14 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
if __name__ == '__main__': if __name__ == '__main__':
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
file_path=r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\ztbfile.pdf" # file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\东莞支队查验招标文件.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
# file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf" # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
# ress = extract_common_header(file_path) ress = extract_common_header(file_path)
# print(ress) print(ress)
print("-----------------")
res=extract_text_by_page(file_path) res=extract_text_by_page(file_path)
# print(res)磋商文件_tobidders_notice_part2.pdf # print(res)磋商文件_tobidders_notice_part2.pdf
# save_extracted_text_to_txt(file_path,"output.txt") # save_extracted_text_to_txt(file_path,"output.txt")

View File

@ -7,7 +7,7 @@ from collections import OrderedDict
from flask_app.general.json_utils import clean_json_string from flask_app.general.json_utils import clean_json_string
from flask_app.general.多线程提问 import multi_threading from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import upload_file from flask_app.general.通义千问long import upload_file
from flask_app.main.判断是否分包等 import read_questions_from_judge from flask_app.工程标.判断是否分包等 import read_questions_from_judge
def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1): def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1):
judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)

View File

@ -1,8 +1,8 @@
import json import json
from flask_app.general.json_utils import clean_json_string, rename_outer_key from flask_app.general.json_utils import clean_json_string, rename_outer_key
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge from flask_app.工程标.判断是否分包等 import judge_whether_main, read_questions_from_judge
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file from flask_app.general.通义千问long import upload_file
from flask_app.general.通用功能函数 import judge_consortium_bidding from flask_app.general.通用功能函数 import judge_consortium_bidding

View File

@ -4,10 +4,10 @@ import json
import time import time
from flask_app.general.多线程提问 import multi_threading from flask_app.general.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.general.json_utils import extract_content_from_json from flask_app.general.json_utils import extract_content_from_json
from flask_app.main.截取pdf import truncate_pdf_main from flask_app.工程标.截取pdf import truncate_pdf_main
from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.工程标.提取json工程标版 import convert_clause_to_json
prompt = """ prompt = """
# 角色 # 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息 你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息

View File

@ -3,17 +3,17 @@ import json
import logging import logging
import time import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.工程标.截取pdf import truncate_pdf_multiple
from flask_app.general.table_content_extraction import extract_tables_main from flask_app.general.table_content_extraction import extract_tables_main
from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge
from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values from flask_app.general.json_utils import transform_json_values
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.工程标.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
import concurrent.futures import concurrent.futures
from flask_app.old_version.基础信息整合_old import combine_basic_info from flask_app.old_version.基础信息整合_old import combine_basic_info
from flask_app.old_version.资格审查模块old_old import combine_review_standards from flask_app.old_version.资格审查模块old_old import combine_review_standards
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards
from flask_app.general.format_change import pdf2docx, docx2pdf from flask_app.general.format_change import pdf2docx, docx2pdf
from flask_app.general.docx截取docx import copy_docx from flask_app.general.docx截取docx import copy_docx

View File

@ -3,16 +3,16 @@ import json
import logging import logging
import time import time
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.工程标.截取pdf import truncate_pdf_multiple
from flask_app.general.table_content_extraction import extract_tables_main from flask_app.general.table_content_extraction import extract_tables_main
from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values from flask_app.general.json_utils import transform_json_values
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid from flask_app.工程标.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
import concurrent.futures import concurrent.futures
from flask_app.main.基础信息整合快速版 import combine_basic_info from flask_app.工程标.基础信息整合快速版 import combine_basic_info
from flask_app.main.资格审查模块 import combine_review_standards from flask_app.工程标.资格审查模块 import combine_review_standards
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
from flask_app.general.docx截取docx import copy_docx from flask_app.general.docx截取docx import copy_docx

View File

@ -1,6 +1,6 @@
import os import os
from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import extract_content_from_json from flask_app.general.json_utils import extract_content_from_json
from flask_app.old_version.形式响应评审old import process_reviews from flask_app.old_version.形式响应评审old import process_reviews
from flask_app.old_version.资格评审old_old import process_qualification from flask_app.old_version.资格评审old_old import process_qualification

View File

@ -7,16 +7,16 @@ from concurrent.futures import ThreadPoolExecutor
from docx import Document from docx import Document
from flask_app.main.截取pdf import truncate_pdf_multiple from flask_app.工程标.截取pdf import truncate_pdf_multiple
from flask_app.general.merge_pdfs import merge_pdfs from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values from flask_app.general.json_utils import transform_json_values
from flask_app.general.无效标和废标公共代码 import combine_find_invalid from flask_app.general.无效标和废标公共代码 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
import concurrent.futures import concurrent.futures
from flask_app.main.基础信息整合快速版 import combine_basic_info from flask_app.工程标.基础信息整合快速版 import combine_basic_info
from flask_app.main.资格审查模块 import combine_review_standards from flask_app.工程标.资格审查模块 import combine_review_standards
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards from flask_app.工程标.商务评分技术评分整合 import combine_evaluation_standards
from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx

View File

@ -10,7 +10,7 @@ from flask_app.general.多线程提问 import read_questions_from_file, multi_th
from flask_app.general.通义千问long import upload_file from flask_app.general.通义千问long import upload_file
from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
from flask_app.main.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main from flask_app.工程标.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
from flask_app.general.post_processing import inner_post_processing from flask_app.general.post_processing import inner_post_processing
from flask_app.old_version.基础信息整合_old import aggregate_basic_info_engineering from flask_app.old_version.基础信息整合_old import aggregate_basic_info_engineering

View File

@ -274,8 +274,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
#TODO:考虑把解析失败的调用豆包,全文上传。 #TODO:考虑把解析失败的调用豆包,全文上传。
#TODO:重置一下投标文件格式提取那部分的代码 #TODO:重置一下投标文件格式提取那部分的代码
#TODO:小解析考虑提速1直接pdf转文本再切分。后期考虑。
#TODO:小解析考虑提速1直接pdf转文本再切分 2.多线程读取每页是否有图片
#商务标这里改为列表最里层 #商务标这里改为列表最里层
#good_list 金额 截取上下文 #good_list 金额 截取上下文
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -1,20 +1,60 @@
import re import re
# 合并后的正则表达式 # 合并后的正则表达式
begin_pattern_combined = re.compile('^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?|资格性检查|资格审查|符合性审查)', re.MULTILINE) begin_pattern = re.compile(
r'(?<!见)' # 确保前面不是“见”
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符(中文、顿号、括号)
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
r'[\u4e00-\u9fff、()]*\s*$' # 继续匹配允许的字符直到行尾
r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
re.MULTILINE
)
# 测试字符串 # 测试字符串
test_strings = [ test_strings = [
"附件 4", """
"第三部分 商务要求哈哈", .4评标委员会成员因缺席回避擅评标办法前附表康等原因不能继续履评标办法前附表
"第六章 哈哈采购", 责的采购人或者采购代理机构有权向相关监督管理部门通报
"第八部分 需求说明", 17. 投标人资格审查和投标文件符合性审查
"第九章 技术要求", 17.1投标人资格审查指依据法律法规和招标文件的规定对投标文件中的资格资信证
明等进行审查以确定投标人是否具备投标资格投标文件符合性审查指依据招标文件的
规定从投标文件的有效性完整性和对招标文件的响应程度进行审查以确定是否对招
标文件的实质性要求作出响应
17.2投标人未通过资格审查的不得进入投标文件符合性审查 投标人未通过符合性审查的
不得进入投标文件的综合比较与评价
17.3品牌及型号必须为清单中有效期内产品并提供证明文件 否则其投标将作为无效投标
被拒绝
17.3.1如本项目使用最低评标价法 提供相同品牌产品的不同投标人以其中通过资格审查
符合性审查且报价最低的参加评标报价相同的由采购人或者采购人委托评标委员会按
照招标文件中评标办法规定的方式确定 一个参加评标的投标人其他投标无效
17.3.2如本项目使用综合评分法提供相同品牌产品且通过资格审查符合性审查的不同
投标人按一家投标人计算评审后得分最高的同品牌投标人获得中标人推荐资格评审
得分相同的由采购人或者采购人委托评标 委员会按照招标文件中评标办法规定的方式确
定一个投标人获得中标人推荐资格
17.4如一个分包内包含多种产品的 采购人或采购代理机构将在投标人须知前附表中载明
核心产品多家投标人提供的所有核心产品品牌均相同的 按第 18.3.2 条及相关法律法
规处理
17.5投标人所投产品如被列入财政部与国家主管部门颁发的节能产品目录或环境标志产
品目录应提供相关证明在评标时予以优先采购具体优先采购办见第五章评标方法
和标准如采购人所采购产品为政府强制采购的节能产品投标人所投产品的品牌及型号
必须为清单中有效期内产品并提供证明文件否则其投标将作为无效投标被拒绝
17.6投标人不良信用记录以采购人或采购代理机构查询结果为准
17.7资格审查和符合性审查标准详见第五章评标方法和标准
18. 投标文件的澄清和修正
18.1对于投标文件中含义不明确 同类问题表述不一致或者有明显文字和计算错误的内容
评标委员会应当以书面形式要求投标人作出必要的澄 说明或者补正
18.2投标人的澄清说明或者补正应当采用书面形式并加盖公章或者由法定代表人或
其授权的代表签字投标人的澄清说明或者补正不得超出投标文件的范围或者改变投标
文件的实质性内容澄清文件将作为投标文件内容的一部分
"""
] ]
for test_string in test_strings: for test_string in test_strings:
match = begin_pattern_combined.search(test_string) match = re.search(begin_pattern, test_string)
if match: if match:
print(f"匹配成功:{test_string}") print("Matched Content:", match.group()) # 输出匹配的内容
else: else:
print(f"匹配失败:{test_string}") print("No match found.")

View File

@ -3,7 +3,7 @@ import json
import os.path import os.path
import re import re
from flask_app.general.json_utils import extract_content_from_json, clean_json_string # 可以选择性地导入特定的函数 from flask_app.general.json_utils import extract_content_from_json, clean_json_string # 可以选择性地导入特定的函数
from flask_app.main.提取打勾符号 import read_pdf_and_judge_main from flask_app.工程标.提取打勾符号 import read_pdf_and_judge_main
from flask_app.general.多线程提问 import multi_threading from flask_app.general.多线程提问 import multi_threading
from flask_app.general.通义千问long import qianwen_long,upload_file from flask_app.general.通义千问long import qianwen_long,upload_file
#调用qianwen-ask之后组织提示词问百炼。 #调用qianwen-ask之后组织提示词问百炼。

View File

@ -4,8 +4,8 @@ import time
import concurrent.futures import concurrent.futures
from flask_app.general.json_utils import clean_json_string, rename_outer_key from flask_app.general.json_utils import clean_json_string, rename_outer_key
from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice from flask_app.工程标.投标人须知正文提取指定内容工程标 import extract_from_notice
from flask_app.main.判断是否分包等 import read_questions_from_judge, merge_json_to_list from flask_app.工程标.判断是否分包等 import read_questions_from_judge, merge_json_to_list
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file,qianwen_long from flask_app.general.通义千问long import upload_file,qianwen_long

View File

@ -5,9 +5,9 @@ import json
import time import time
from flask_app.general.多线程提问 import multi_threading from flask_app.general.多线程提问 import multi_threading
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.general.json_utils import clean_json_string from flask_app.general.json_utils import clean_json_string
from flask_app.main.提取json工程标版 import convert_clause_to_json from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.通义千问long import upload_file from flask_app.general.通义千问long import upload_file
from flask_app.general.merge_pdfs import merge_pdfs from flask_app.general.merge_pdfs import merge_pdfs
prompt = """ prompt = """

View File

@ -75,7 +75,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header, def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
is_secondary_match): is_secondary_match):
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
def run_extraction(): def run_extraction():
start_page = None start_page = None
@ -195,7 +195,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
start_page = None start_page = None
end_page = None end_page = None
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
# 遍历文档的每一页,查找开始和结束短语的位置 # 遍历文档的每一页,查找开始和结束短语的位置
for i in range(len(pdf_document.pages)): for i in range(len(pdf_document.pages)):
page = pdf_document.pages[i] page = pdf_document.pages[i]
@ -367,7 +367,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
), ),
( (
re.compile( re.compile(
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()]*\s*$|\s*评标(办法|方法)前附表\s*$', r'(?<!见)' # 确保前面不是“见”
r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
r'[\u4e00-\u9fff、()]*?' # 匹配允许的字符(中文、顿号、括号)
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
r'[\u4e00-\u9fff、()]*\s*$' # 继续匹配允许的字符直到行尾
r'|\s*评标(办法|方法)前附表\s*$', # 或匹配“评标办法前附表”或“评标方法前附表”
re.MULTILINE re.MULTILINE
), ),
re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)

View File

@ -5,8 +5,8 @@ import time
from flask_app.general.format_change import pdf2docx from flask_app.general.format_change import pdf2docx
from flask_app.general.json_utils import extract_content_from_json, clean_json_string from flask_app.general.json_utils import extract_content_from_json, clean_json_string
from flask_app.general.table_content_extraction import extract_tables_main from flask_app.general.table_content_extraction import extract_tables_main
from flask_app.main.形式响应评审 import process_reviews from flask_app.工程标.形式响应评审 import process_reviews
from flask_app.main.资格评审 import process_qualification from flask_app.工程标.资格评审 import process_qualification
from flask_app.general.通义千问long import upload_file, qianwen_long from flask_app.general.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from flask_app.货物标.资格审查main import combine_qualification_review from flask_app.货物标.资格审查main import combine_qualification_review

View File

@ -1,6 +1,6 @@
import json import json
from flask_app.main.资格评审 import process_qualification from flask_app.工程标.资格评审 import process_qualification
def test_process_qualification_type1(): def test_process_qualification_type1():

View File

@ -7,7 +7,7 @@ from flask_app.general.json_utils import clean_json_string, rename_outer_key
from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions from flask_app.general.通用功能函数 import judge_consortium_bidding, process_judge_questions
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file from flask_app.general.通义千问long import upload_file
from flask_app.main.判断是否分包等 import merge_json_to_list, read_questions_from_judge from flask_app.工程标.判断是否分包等 import merge_json_to_list, read_questions_from_judge
from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
from flask_app.货物标.提取采购需求main import fetch_procurement_reqs from flask_app.货物标.提取采购需求main import fetch_procurement_reqs

View File

@ -105,7 +105,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
total_pages = len(pdf_document.pages) - 1 # 获取总页数 total_pages = len(pdf_document.pages) - 1 # 获取总页数
if output_suffix == "tobidders_notice": if output_suffix == "tobidders_notice":
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
start_page, mid_page, end_page = extract_pages_tobidders_notice( start_page, mid_page, end_page = extract_pages_tobidders_notice(
pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern
) )
@ -122,8 +122,8 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
else: else:
# 原有的处理逻辑保持不变 # 原有的处理逻辑保持不变
if output_suffix == "qualification1" or output_suffix=="procurement": if output_suffix == "qualification1" or output_suffix=="procurement" or output_suffix=="evaluation_method":
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix) start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix)
# 针对 selection = 6 的特殊处理 # 针对 selection = 6 的特殊处理
if output_suffix == "format": if output_suffix == "format":
@ -167,10 +167,11 @@ def get_patterns_for_evaluation_method():
# re.MULTILINE # re.MULTILINE
# ) # )
begin_pattern = re.compile( begin_pattern = re.compile(
r'(?<!见)'
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
r'(?:[\u4e00-\u9fff、()]*?)' # 匹配允许的字符(中文、顿号、括号) r'(?:[\u4e00-\u9fff、()]*?)' # 匹配允许的字符(中文、顿号、括号)
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” 注意这里的'.*'是允许这些关键词出现在任意位置,但主体匹配部分仍然受到字符集的限制。 r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” 注意这里的'.*'是允许这些关键词出现在任意位置,但主体匹配部分仍然受到字符集的限制。
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法” r'(?=.*(?:办法|方法|内容))' # 确保包含“办法”或“方法”
r'[\u4e00-\u9fff、()]*\s*$', # 继续匹配允许的字符直到行尾 r'[\u4e00-\u9fff、()]*\s*$', # 继续匹配允许的字符直到行尾
re.MULTILINE re.MULTILINE
) )
@ -374,60 +375,6 @@ def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_h
return start_page, mid_page, end_page return start_page, mid_page, end_page
#投标人须知分为两个章节
# def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page):
# begin_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+'
# )
# end_pattern = re.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分
# )
# exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
#
# pdf_document = PdfReader(pdf_path)
# exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
#
# # 提取第一部分
# start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
# if start_page1 is None or end_page1 is None:
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
# return "", ""
#
# # 保存第一部分的路径
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
# "tobidders_notice_part1")
#
# # 提取第二部分
# start_page2 = end_page1
#
# # 检查end_page1页面的内容
# text = pdf_document.pages[end_page1].extract_text() or ""
# cleaned_text = clean_page_content(text, common_header)
# match = end_pattern.search(cleaned_text)
#
# if match:
# # 获取匹配到的中文部分
# chapter_title = match.group(1)
# # 检查是否包含排除关键词
# if any(word in chapter_title for word in exclusion_words):
# # 如果包含排除关键词,直接返回相同的路径
# return path1, path1
#
# # 如果不包含排除关键词,继续提取第二部分
# _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
# exclusion_pattern)
#
# if end_page2 is None:
# print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!")
# return path1, path1
#
# # 保存第二部分的路径
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
# "tobidders_notice_part2")
#
# return path1, path2
def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page): def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page):
output_suffix="tobidders_notice" output_suffix="tobidders_notice"
begin_pattern = re.compile( begin_pattern = re.compile(
@ -438,7 +385,7 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
) )
exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词 exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件构成|文件组成')
# 提取第一部分 # 提取第一部分
start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header) start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
@ -548,7 +495,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page): def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page):
try: try:
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
patterns = None patterns = None
start_page = None start_page = None
@ -688,8 +635,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
local_output_suffix = "notice" local_output_suffix = "notice"
elif selection == 2: elif selection == 2:
begin_pattern = re.compile( begin_pattern = re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法|内容))')
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(磋商|谈判|评标|评定|评审|办法|方法).*(磋商|谈判|评标|评定|评审|办法|方法)')
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
local_output_suffix = "evaluation_method" local_output_suffix = "evaluation_method"
elif selection == 3: elif selection == 3:
@ -827,17 +773,18 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
#ztbfile.pdf少资格评审 包头少符合性评审 #ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__": if __name__ == "__main__":
logger=get_global_logger("123")
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2020-安徽-安徽省生态环境厅电梯采购.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp" # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
output_folder=r"C:\Users\Administrator\Desktop\new招标文件\output5" output_folder=r"C:\Users\Administrator\Desktop\招标文件-采购类\tmp2"
files = truncate_pdf_multiple(input_path, output_folder) # files = truncate_pdf_multiple(input_path, output_folder,logger)
# selections = [3,5] # selections = [3,5]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections) # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files) # print(files)
selection = 5# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 selection = 2# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection) generated_files = truncate_pdf_main(input_path, output_folder, selection)
print(generated_files) print(generated_files)

View File

@ -177,9 +177,9 @@ def process_folder(input_folder, output_folder):
#TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf #TODO:招标文件111_tobidders_notice_part2.pdf 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf
if __name__ == "__main__": if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path=r'C:\Users\Administrator\Desktop\货物标\output4\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次_tobidders_notice_part2.pdf' file_path=r'C:\Users\Administrator\Desktop\货物标\output4\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
output_folder = r'C:\Users\Administrator\Desktop\招标文件\output3\tmp' output_folder = r'C:\Users\Administrator\Desktop\招标文件\output4\tmp'
try: try:
output_path = convert_clause_to_json(file_path,output_folder,1) output_path = convert_clause_to_json(file_path,output_folder,1)
print(f"Final JSON result saved to: {output_path}") print(f"Final JSON result saved to: {output_path}")