This commit is contained in:
zy123 2024-09-27 15:00:30 +08:00
parent e6cedef2c2
commit d8247c95f4
5 changed files with 153 additions and 70 deletions

View File

@ -74,9 +74,9 @@ def combine_technical_and_business(data, target_values):
# 默认其他所有内容都归为商务标
else:
if not is_business:
if '商务' not in extracted_data:
extracted_data['商务'] = {}
extracted_data['商务'][key] = value
if '商务评分' not in extracted_data:
extracted_data['商务评分'] = {}
extracted_data['商务评分'][key] = value
business_found = True
continue
@ -90,18 +90,18 @@ def combine_technical_and_business(data, target_values):
extract_nested(data)
if not technical_found:
extracted_data['技术'] = ''
extracted_data['技术评分'] = ''
if not business_found:
extracted_data['商务'] = ''
extracted_data['商务评分'] = ''
return extracted_data
def combine_evaluation_standards(truncate2):
# 商务标、技术标评分项:千问
file_id = upload_file(truncate2)
user_query_2 = (
"根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术'中新增子键名'备注'存放该信息。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
"根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容不是这3个则返回文档中给定的评分内容以及它的评分要求都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
evaluation_res = qianwen_long(file_id, user_query_2)
target_values1 = ['技术标','技术部分','设计', '实施']
target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"]
# target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
# update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2)
update_json = combine_technical_and_business(clean_json_string(evaluation_res), target_values1)
@ -112,8 +112,8 @@ if __name__ == "__main__":
truncate2="C:\\Users\\Administrator\\Desktop\\fsdownload\\0883895c-e61f-4a99-9308-697fca1d4b77\\ztbfile_evaluation_method.pdf"
evaluation_standards_res=combine_evaluation_standards(truncate2)
# 从结果中提取"商务标"和"技术标"
technical_standards = {"技术": evaluation_standards_res.get("技术", {})}
commercial_standards = {"商务": evaluation_standards_res.get("商务", {})}
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
# 返回技术标和商务标
print(json.dumps(technical_standards,ensure_ascii=False,indent=4))
print(json.dumps(commercial_standards, ensure_ascii=False, indent=4))

View File

@ -119,8 +119,8 @@ def fetch_evaluation_standards(truncate1): # 评标办法前附表
evaluation_standards_res = combine_evaluation_standards(truncate1)
# 获取技术标和商务标
technical_standards = {"技术": evaluation_standards_res.get("技术", {})}
commercial_standards = {"商务": evaluation_standards_res.get("商务", {})}
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
logger.info("商务标和技术标 done")

View File

@ -72,6 +72,6 @@ def extract_text_by_page(file_path):
print(f"Page {page_num + 1} is empty or text could not be extracted.")
return result
if __name__ == '__main__':
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\中国电信股份有限公司随州分公司广水市雪亮工程信息化项目-招标文件(定稿).pdf"
res=extract_text_by_page(file_path)
# print(res)

View File

@ -1,42 +1,79 @@
import re
from unittest.mock import Mock
def contains_number_or_index(key, value):
    """Report whether a key/value pair looks like a numbering or index entry.

    A pair qualifies when any of the following holds:
      * ``value`` is an ``int`` or ``float``;
      * ``value`` is a string made only of digits, or containing a digit;
      * ``key`` is a string containing '序号'.

    Args:
        key: Dictionary key being inspected.
        value: Value stored under ``key``.

    Returns:
        bool: ``True`` when the pair matches one of the rules above. The
        result is always a real ``bool`` (never a ``re.Match`` or ``None``),
        and a non-string ``key`` is treated as "no match" instead of raising
        ``TypeError`` on the substring test.
    """
    if isinstance(value, (int, float)):
        return True
    if isinstance(value, str) and (value.isdigit() or re.search(r'\d+', value)):
        return True
    # Guard the substring test: `'序号' in key` raises for non-iterable keys.
    return isinstance(key, str) and '序号' in key
def clean_page_content(text, common_header):
    """Return ``text`` with every occurrence of ``common_header`` removed.

    Simplified stand-in for the page-cleaning routine.
    """
    without_header = text.replace(common_header, '')
    return without_header
def preprocess_dict(data):
    """Recursively simplify a nested dict/list structure.

    Rules, applied at every level:
      * list  -> each element is processed in turn;
      * dict with at most one entry -> same keys, processed values;
      * dict with several entries whose values are all "" or "/" ->
        the list of its keys;
      * any other dict -> entries for which ``contains_number_or_index``
        is truthy are dropped, the remaining values are processed
        (keys are kept even when the processed value is an empty string);
      * anything else is returned unchanged.
    """
    if isinstance(data, list):
        return [preprocess_dict(entry) for entry in data]
    if not isinstance(data, dict):
        return data

    if len(data) <= 1:
        return {name: preprocess_dict(child) for name, child in data.items()}

    # Every value is a blank placeholder: only the keys carry information.
    if not any(child != "" and child != "/" for child in data.values()):
        return list(data.keys())

    # NOTE(review): the assignment into the result is assumed to sit inside
    # the "not numbered" branch, i.e. numbered/index entries are discarded.
    return {
        name: preprocess_dict(child)
        for name, child in data.items()
        if not contains_number_or_index(name, child)
    }
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Find the start, middle and end page indices of a bidder-notice section.

    Pages are scanned in order; indices are the 0-based positions in
    ``pdf_document.pages``.

    Args:
        pdf_document: Object exposing ``pages``; each page must provide
            ``extract_text()``.
        begin_pattern: Regex (string or compiled) marking the first page.
        end_pattern: Regex (string or compiled) marking the last page.
        begin_page: Only pages with an index strictly greater than this
            value may be chosen as the start page.
        common_header: Header text handed to ``clean_page_content`` before
            matching.

    Returns:
        tuple: ``(start_page, mid_page, end_page)``. An element is ``None``
        when the corresponding page was not found. ``end_page`` is only
        searched for after ``mid_page`` is known and must come after it.
    """
    # Heading that opens the main text ("一、说明" / "一、总则" and variants).
    # re.MULTILINE lets the heading match at the start of any line on the
    # page, not only at the very beginning of the page text.
    mid_pattern = re.compile(
        r'^\s*[(]?\s*[一1]\s*[)]?\s*[、.]*\s*(说\s*明|总\s*则)', re.MULTILINE)

    start_page = None
    mid_page = None
    end_page = None
    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
            start_page = i
        if start_page is not None and mid_page is None and mid_pattern.search(cleaned_text):
            mid_page = i
        if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
            end_page = i
            break
    return start_page, mid_page, end_page
# Sample nested dictionary passed to preprocess_dict below.
input_data = {
    "具备法律、行政法规规定的其他条件的证明材料": {
        "国家对生产和销售相关产品或提供相关服务有专门法律、行政法规规定的": "则 必须提供取得国家有关主管部门行政许可的证明材料",
        "未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、 重大税收违法案件当事人名单、政府采购严重违法失信行为记录名单和“中国 政府采购”网站www.ccgp.gov.cn政府采购严重违法失信行为记录名单": "并 提供网页截图以证明",
        "招标文件第一章“投标人资格要求”中有特殊要求的": "投标人应提供其符合特 殊要求的证明材料或者情况说明",
        "不符合联合体投标相关规定和要求的": "",
        "投标人认为需提供的其它相关资格证明材料": "",
        "资格证明文件正本应为清晰彩色影印件且加盖单位公章": ""
    }
}
# Build a mock PDF document
def create_mock_pdf(page_contents):
    """Return a ``Mock`` PDF whose pages yield the given texts.

    Args:
        page_contents: Iterable of strings, one per page, in page order.

    Returns:
        Mock: Object with a ``pages`` list; calling ``extract_text()`` on the
        n-th page returns the n-th entry of ``page_contents``.
    """
    pdf_mock = Mock()
    # Bind each page's text through return_value. A bare `lambda: content`
    # inside the comprehension is late-binding, so every page would return
    # the last entry of page_contents.
    pdf_mock.pages = [
        Mock(extract_text=Mock(return_value=content))
        for content in page_contents
    ]
    return pdf_mock
# Run the sample dictionary through preprocess_dict and print the result.
processed_data = preprocess_dict(input_data)
print(processed_data)
# Test cases
def test_extract_pages():
    """Print the page triples extract_pages_tobidders_notice finds in four mock PDFs.

    Nothing is asserted; the intended results are given in the trailing
    comments and have to be compared by eye.
    """
    # Case 1: normal layout
    pdf1 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一、说明",
        "更多内容",
        "结束标记"
    ])
    result1 = extract_pages_tobidders_notice(pdf1, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 1:", result1) # expected output: (1, 3, 5)
    # Case 2: no middle page
    pdf2 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "结束标记"
    ])
    result2 = extract_pages_tobidders_notice(pdf2, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    # NOTE(review): this comment previously said (1, None, 3). The end page is
    # only looked for once a mid page has been found, so without a mid page
    # the end page stays None as well.
    print("Test Case 2:", result2) # expected output: (1, None, None)
    # Case 3: heading uses "总则" instead of "说明"
    pdf3 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一、总则",
        "更多内容",
        "结束标记"
    ])
    result3 = extract_pages_tobidders_notice(pdf3, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 3:", result3) # expected output: (1, 3, 5)
    # Case 4: middle-page heading in a different format
    pdf4 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一.说 明",
        "更多内容",
        "结束标记"
    ])
    result4 = extract_pages_tobidders_notice(pdf4, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 4:", result4) # expected output: (1, 3, 5)
# Run the tests
test_extract_pages()

View File

@ -123,11 +123,13 @@ def merge_and_cleanup(output_pdf_path, suffix_to_merge):
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Convert one input file to PDF and extract the requested section.

    The file is passed through ``convert_to_pdf`` and then ``extract_pages``.
    For the ``"qualification1"`` suffix the extracted file is additionally
    handed to ``merge_and_cleanup`` together with ``"qualification3"``.

    Args:
        file_path: Path of the source document.
        output_folder: Folder that receives the extracted PDF(s).
        begin_pattern: Regex marking the first page of the section.
        begin_page: Page index after which the start may be searched for.
        end_pattern: Regex marking the last page of the section.
        output_suffix: Section identifier; selects the extraction variant.

    Returns:
        The truthy value returned by ``extract_pages`` (for
        ``"tobidders_notice"`` this is a tuple of two paths), or ``None``
        when nothing was extracted.
    """
    pdf_path = convert_to_pdf(file_path)
    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
    if not result:
        return None
    # Only the qualification section needs post-processing; every other
    # suffix (including the two-path "tobidders_notice" tuple) is returned
    # unchanged.
    if output_suffix == "qualification1":
        merge_and_cleanup(result, "qualification3")
    return result
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
@ -142,10 +144,16 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
if is_pdf_or_doc(file_path):
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if result:
if isinstance(result, tuple):
generated_files.extend(result)
else:
generated_files.append(result)
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if result:
if isinstance(result, tuple):
generated_files.extend(result)
else:
generated_files.append(result)
else:
print("提供的路径既不是文件夹也不是PDF文件。")
@ -174,10 +182,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
common_header = extract_common_header(pdf_path)
pdf_document = PdfReader(pdf_path)
exclusion_pattern = None
if output_suffix == "tobidders_notice":
start_page, mid_page, end_page = extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern,
begin_page, common_header)
if start_page is None or mid_page is None or end_page is None:
print(f"未找到所需页面在文件 {pdf_path} 中!")
return None
path1 = save_extracted_pages(pdf_document, start_page, mid_page - 1, pdf_path, output_folder,
"tobidders_notice_part1")
path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
"tobidders_notice_part2")
return path1, path2
else:
# 原有的处理逻辑保持不变
if output_suffix == "qualification1":
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
common_header,exclusion_pattern)
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern)
if start_page is None or end_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
@ -188,6 +208,23 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
print(f"Error processing {pdf_path}: {e}")
return None
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Locate the start, middle and end pages of the bidder-notice section.

    Pages of ``pdf_document`` are scanned in order and their text is cleaned
    with ``clean_page_content`` before matching:

      * start - first page matching ``begin_pattern`` whose index is greater
        than ``begin_page``;
      * mid   - first page at or after the start containing a line that
        opens with a "一/1" numbered "说明" or "总则" heading;
      * end   - first page after the mid page matching ``end_pattern``.

    Returns:
        tuple: ``(start_page, mid_page, end_page)`` as 0-based indices; any
        element that was not found is ``None``.
    """
    first_idx = section_idx = last_idx = None

    for page_no, pdf_page in enumerate(pdf_document.pages):
        raw_text = pdf_page.extract_text() or ""
        body = clean_page_content(raw_text, common_header)

        if first_idx is None:
            if re.search(begin_pattern, body) and page_no > begin_page:
                first_idx = page_no
        if first_idx is None:
            # Nothing else can match before the section has started.
            continue

        if section_idx is None:
            if re.search(r'^\s*[(]?\s*[一1]\s*[)]?\s*[、.]*\s*(说\s*明|总\s*则)', body, re.MULTILINE):
                section_idx = page_no
        if section_idx is None:
            continue

        if re.search(end_pattern, body) and page_no > section_idx:
            last_idx = page_no
            break

    return first_idx, section_idx, last_idx
def get_patterns_for_procurement():
begin_pattern = re.compile(
@ -298,6 +335,15 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
)
local_output_suffix = "qualification1"
elif selection ==4: #投标人须知前附表和正文
begin_page=1
begin_pattern = re.compile(
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', re.MULTILINE
)
end_pattern=re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
)
local_output_suffix = "tobidders_notice"
else:
print("无效的选择")
return None
@ -313,7 +359,7 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
def truncate_pdf_multiple(input_path, output_folder):
    """Run ``truncate_pdf_main`` for selections 1 through 4 and pool the results.

    Args:
        input_path: File or folder handed to ``truncate_pdf_main``.
        output_folder: Folder that receives the generated PDFs.

    Returns:
        list: All entries returned by the individual selections, in
        selection order.
    """
    truncate_files = []
    for selection in range(1, 5):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # truncate_pdf_main can return None; extending with None would raise
        # TypeError and lose the results already collected.
        if files:
            truncate_files.extend(files)
    return truncate_files
@ -321,8 +367,8 @@ def truncate_pdf_multiple(input_path, output_folder):
# TODO:交通智能系统和招标(1)(1)文件有问题
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次).pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
# truncate_pdf_multiple(input_path,output_folder)
selection = 3 # 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查
selection = 4 # 例如1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2 4.投标人须知前附表
generated_files = truncate_pdf_main(input_path, output_folder, selection)