9.27
This commit is contained in:
parent
e6cedef2c2
commit
d8247c95f4
@ -74,9 +74,9 @@ def combine_technical_and_business(data, target_values):
|
||||
# 默认其他所有内容都归为商务标
|
||||
else:
|
||||
if not is_business:
|
||||
if '商务标' not in extracted_data:
|
||||
extracted_data['商务标'] = {}
|
||||
extracted_data['商务标'][key] = value
|
||||
if '商务评分' not in extracted_data:
|
||||
extracted_data['商务评分'] = {}
|
||||
extracted_data['商务评分'][key] = value
|
||||
business_found = True
|
||||
continue
|
||||
|
||||
@ -90,18 +90,18 @@ def combine_technical_and_business(data, target_values):
|
||||
extract_nested(data)
|
||||
|
||||
if not technical_found:
|
||||
extracted_data['技术标'] = ''
|
||||
extracted_data['技术评分'] = ''
|
||||
if not business_found:
|
||||
extracted_data['商务标'] = ''
|
||||
extracted_data['商务评分'] = ''
|
||||
|
||||
return extracted_data
|
||||
def combine_evaluation_standards(truncate2):
|
||||
# 商务标、技术标评分项:千问
|
||||
file_id = upload_file(truncate2)
|
||||
user_query_2 = (
|
||||
"根据该文档中的评标办法前附表,请你列出该文件的技术标,商务标,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术标'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
|
||||
"根据该文档中的评标办法前附表,请你列出该文件的技术评分,商务评分,投标报价评审标准以及它们对应的具体评分要求,若对应内容中存在其他信息,在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容不是这3个,则返回文档中给定的评分内容以及它的评分要求,都以json的格式返回结果。请不要回答有关形式、资格、响应性评审标准的内容")
|
||||
evaluation_res = qianwen_long(file_id, user_query_2)
|
||||
target_values1 = ['技术标','技术部分','设计', '实施']
|
||||
target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"]
|
||||
# target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
|
||||
# update_json=combine_technical_and_business(clean_json_string(evaluation_res),target_values1,target_values2)
|
||||
update_json = combine_technical_and_business(clean_json_string(evaluation_res), target_values1)
|
||||
@ -112,8 +112,8 @@ if __name__ == "__main__":
|
||||
truncate2="C:\\Users\\Administrator\\Desktop\\fsdownload\\0883895c-e61f-4a99-9308-697fca1d4b77\\ztbfile_evaluation_method.pdf"
|
||||
evaluation_standards_res=combine_evaluation_standards(truncate2)
|
||||
# 从结果中提取"商务标"和"技术标"
|
||||
technical_standards = {"技术标": evaluation_standards_res.get("技术标", {})}
|
||||
commercial_standards = {"商务标": evaluation_standards_res.get("商务标", {})}
|
||||
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
|
||||
commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
|
||||
# 返回技术标和商务标
|
||||
print(json.dumps(technical_standards,ensure_ascii=False,indent=4))
|
||||
print(json.dumps(commercial_standards, ensure_ascii=False, indent=4))
|
@ -119,8 +119,8 @@ def fetch_evaluation_standards(truncate1): # 评标办法前附表
|
||||
evaluation_standards_res = combine_evaluation_standards(truncate1)
|
||||
|
||||
# 获取技术标和商务标
|
||||
technical_standards = {"技术标": evaluation_standards_res.get("技术标", {})}
|
||||
commercial_standards = {"商务标": evaluation_standards_res.get("商务标", {})}
|
||||
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
|
||||
commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
|
||||
|
||||
logger.info("商务标和技术标 done")
|
||||
|
||||
|
@ -72,6 +72,6 @@ def extract_text_by_page(file_path):
|
||||
print(f"Page {page_num + 1} is empty or text could not be extracted.")
|
||||
return result
|
||||
if __name__ == '__main__':
|
||||
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
|
||||
file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\中国电信股份有限公司随州分公司广水市雪亮工程信息化项目-招标文件(定稿).pdf"
|
||||
res=extract_text_by_page(file_path)
|
||||
# print(res)
|
@ -1,42 +1,79 @@
|
||||
import re
|
||||
from unittest.mock import Mock
|
||||
|
||||
def contains_number_or_index(key, value):
    """Report whether a key/value pair looks like a numeric or index entry.

    Args:
        key: Dictionary key; it is tested for the substring '序号'.
        value: Dictionary value of any type.

    Returns:
        bool: True when ``value`` is an int or float, when ``value`` is a
        string that contains at least one digit, or when ``key`` contains
        '序号'; False otherwise. The result is always a real bool (the
        earlier form could leak an ``re.Match`` object to the caller).
    """
    if isinstance(value, (int, float)):
        return True
    if isinstance(value, str) and value.isdigit():
        # Kept separately from the regex below: str.isdigit() also accepts
        # digit characters (e.g. superscripts) that \d does not match.
        return True
    if '序号' in key:
        return True
    return isinstance(value, str) and re.search(r'\d+', value) is not None
|
||||
def clean_page_content(text, common_header):
    """Return ``text`` with every occurrence of ``common_header`` removed.

    Per the original comment this is a stand-in that simulates the real
    page-cleaning routine.
    """
    without_header = text.replace(common_header, '')
    return without_header
|
||||
|
||||
def preprocess_dict(data):
    """Recursively simplify a nested dict/list structure.

    Rules, as implemented below:
      * A list is processed element by element.
      * A dict with more than one entry whose values are all "" or "/" is
        replaced by the list of its keys.
      * Any other dict with more than one entry drops the pairs for which
        contains_number_or_index(key, value) is truthy and recurses into the
        remaining values.
      * A dict with zero or one entry keeps its key and recurses into the value.
      * Anything else is returned unchanged.
    """
    if isinstance(data, list):
        return [preprocess_dict(entry) for entry in data]
    if not isinstance(data, dict):
        return data

    if len(data) <= 1:
        return {name: preprocess_dict(child) for name, child in data.items()}

    # Every value is a placeholder ("" or "/"): only the keys carry information.
    if all(child == "" or child == "/" for child in data.values()):
        return list(data.keys())

    kept = {}
    for name, child in data.items():
        if contains_number_or_index(name, child):
            continue
        # The key is kept even when the processed value is an empty string.
        kept[name] = preprocess_dict(child)
    return kept
|
||||
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Find the start, middle and end page indices of the bidder-notice section.

    Args:
        pdf_document: Object exposing ``pages``; each page must provide
            ``extract_text()``.
        begin_pattern: Regex (string or compiled) marking the first page.
        end_pattern: Regex (string or compiled) marking the last page.
        begin_page: Only pages with an index greater than this may be the start.
        common_header: Header text removed from every page before matching.

    Returns:
        tuple: ``(start_page, mid_page, end_page)``; each element is a
        zero-based page index or None when it was not found. The end page
        is only looked for once a middle page has been found, and must lie
        strictly after it.
    """
    # MULTILINE lets the "一、说明 / 一、总则" heading match at the start of any
    # line; without it the heading was only recognised when it was the very
    # first text on the page. The pattern mirrors the production variant of
    # this function (optional brackets, "一" or "1", optional separator).
    mid_pattern = re.compile(
        r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)', re.MULTILINE)

    start_page = None
    mid_page = None
    end_page = None
    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
            start_page = i
        if start_page is not None and mid_page is None and mid_pattern.search(cleaned_text):
            mid_page = i
        if start_page is not None and mid_page is not None and re.search(end_pattern, cleaned_text) and i > mid_page:
            end_page = i
            break
    return start_page, mid_page, end_page
|
||||
|
||||
input_data = {
|
||||
"具备法律、行政法规规定的其他条件的证明材料": {
|
||||
"国家对生产和销售相关产品或提供相关服务有专门法律、行政法规规定的": "则 必须提供取得国家有关主管部门行政许可的证明材料",
|
||||
"未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、 重大税收违法案件当事人名单、政府采购严重违法失信行为记录名单和“中国 政府采购”网站(www.ccgp.gov.cn)政府采购严重违法失信行为记录名单": "并 提供网页截图以证明",
|
||||
"招标文件第一章“投标人资格要求”中有特殊要求的": "投标人应提供其符合特 殊要求的证明材料或者情况说明",
|
||||
"不符合联合体投标相关规定和要求的": "",
|
||||
"投标人认为需提供的其它相关资格证明材料": "",
|
||||
"资格证明文件正本应为清晰彩色影印件且加盖单位公章": ""
|
||||
}
|
||||
}
|
||||
# 创建模拟的PDF文档
|
||||
def create_mock_pdf(page_contents):
    """Build a mock PDF document whose pages return the given texts.

    Args:
        page_contents: Iterable of strings, one per page, in page order.

    Returns:
        Mock: Object with a ``pages`` list; calling ``extract_text()`` on the
        n-th page returns the n-th string of ``page_contents``.
    """
    pdf_mock = Mock()
    # Each page gets its own return_value. A bare ``lambda: content`` inside
    # the comprehension binds late, so every page would return the text of
    # the LAST page.
    pdf_mock.pages = [
        Mock(extract_text=Mock(return_value=page_text))
        for page_text in page_contents
    ]
    return pdf_mock
|
||||
|
||||
processed_data = preprocess_dict(input_data)
|
||||
print(processed_data)
|
||||
# 测试用例
|
||||
def test_extract_pages():
    """Run extract_pages_tobidders_notice over four mocked PDFs and print the results.

    Nothing is asserted; each result is printed next to its label so it can
    be compared by eye with the expectation noted beside the scenario.
    """
    begin_regex = r'第.+章\s*投标人须知'
    end_regex = r'结束标记'

    scenarios = [
        # Case 1: ordinary layout. Expected output: (1, 3, 5)
        ("Test Case 1:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、说明",
            "更多内容",
            "结束标记",
        ]),
        # Case 2: no middle page. The original note expects (1, None, 3).
        # NOTE(review): the finder in this file only searches for the end
        # page after a middle page is found, so (1, None, None) looks like
        # the reachable result - confirm which one is intended.
        ("Test Case 2:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "结束标记",
        ]),
        # Case 3: heading uses "总则" instead of "说明". Expected output: (1, 3, 5)
        ("Test Case 3:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、总则",
            "更多内容",
            "结束标记",
        ]),
        # Case 4: middle heading written in a different format. Expected output: (1, 3, 5)
        ("Test Case 4:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一.说 明",
            "更多内容",
            "结束标记",
        ]),
    ]

    for label, page_texts in scenarios:
        mock_pdf = create_mock_pdf(page_texts)
        outcome = extract_pages_tobidders_notice(mock_pdf, begin_regex, end_regex, -1, '')
        print(label, outcome)
|
||||
# 运行测试
|
||||
test_extract_pages()
|
@ -123,11 +123,13 @@ def merge_and_cleanup(output_pdf_path, suffix_to_merge):
|
||||
|
||||
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Convert one input file to PDF and extract the requested section from it.

    The file is converted with convert_to_pdf and passed to extract_pages
    exactly once (the interleaved pre-commit body ran a second, redundant
    extraction and returned before the newer logic could execute).

    Args:
        file_path: Path of the source document.
        output_folder: Folder the extracted PDF(s) are written to.
        begin_pattern: Regex marking the first page of the section.
        begin_page: Page index after which the section may start.
        end_pattern: Regex marking the last page of the section.
        output_suffix: Section tag; "qualification1" additionally triggers
            merge_and_cleanup(result, "qualification3").

    Returns:
        Whatever extract_pages returned when it is truthy (for
        "tobidders_notice" that is a tuple of two paths), otherwise None.
    """
    pdf_path = convert_to_pdf(file_path)
    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
    if not result:
        return None
    if output_suffix == "qualification1":
        merge_and_cleanup(result, "qualification3")
    # For "tobidders_notice" the result is a tuple holding two paths.
    return result
|
||||
|
||||
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||
@ -142,10 +144,16 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
||||
if is_pdf_or_doc(file_path):
|
||||
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
if result:
|
||||
if isinstance(result, tuple):
|
||||
generated_files.extend(result)
|
||||
else:
|
||||
generated_files.append(result)
|
||||
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
|
||||
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||
if result:
|
||||
if isinstance(result, tuple):
|
||||
generated_files.extend(result)
|
||||
else:
|
||||
generated_files.append(result)
|
||||
else:
|
||||
print("提供的路径既不是文件夹也不是PDF文件。")
|
||||
@ -174,10 +182,22 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
common_header = extract_common_header(pdf_path)
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
exclusion_pattern = None
|
||||
if output_suffix == "tobidders_notice":
|
||||
start_page, mid_page, end_page = extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern,
|
||||
begin_page, common_header)
|
||||
if start_page is None or mid_page is None or end_page is None:
|
||||
print(f"未找到所需页面在文件 {pdf_path} 中!")
|
||||
return None
|
||||
path1 = save_extracted_pages(pdf_document, start_page, mid_page - 1, pdf_path, output_folder,
|
||||
"tobidders_notice_part1")
|
||||
path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder,
|
||||
"tobidders_notice_part2")
|
||||
return path1, path2
|
||||
else:
|
||||
# 原有的处理逻辑保持不变
|
||||
if output_suffix == "qualification1":
|
||||
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
|
||||
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
|
||||
common_header,exclusion_pattern)
|
||||
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern)
|
||||
if start_page is None or end_page is None:
|
||||
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
|
||||
return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
|
||||
@ -188,6 +208,23 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
||||
print(f"Error processing {pdf_path}: {e}")
|
||||
return None
|
||||
|
||||
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Find the start, middle and end page indices of the bidder-notice section.

    Pages are scanned in order. The start page is the first page after
    ``begin_page`` matching ``begin_pattern``; the middle page is the first
    page from the start onward carrying a "一、说明 / 一、总则"-style heading
    at the start of a line; the end page is the first page strictly after
    the middle page matching ``end_pattern``.

    Returns:
        tuple: ``(start_page, mid_page, end_page)`` with zero-based indices;
        any element is None when it was not found.
    """
    heading_regex = re.compile(
        r'^\s*[((]?\s*[一1]\s*[))]?\s*[、.]*\s*(说\s*明|总\s*则)', re.MULTILINE)
    first_idx = middle_idx = last_idx = None

    for idx, pdf_page in enumerate(pdf_document.pages):
        page_body = clean_page_content(pdf_page.extract_text() or "", common_header)

        if first_idx is None:
            if re.search(begin_pattern, page_body) and idx > begin_page:
                first_idx = idx
        if first_idx is None:
            # Nothing else can match before the section has started.
            continue

        if middle_idx is None:
            if heading_regex.search(page_body):
                middle_idx = idx
        if middle_idx is None:
            continue

        # The end marker only counts on a page after the middle page.
        if re.search(end_pattern, page_body) and idx > middle_idx:
            last_idx = idx
            break

    return first_idx, middle_idx, last_idx
|
||||
|
||||
|
||||
def get_patterns_for_procurement():
|
||||
begin_pattern = re.compile(
|
||||
@ -298,6 +335,15 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
|
||||
)
|
||||
local_output_suffix = "qualification1"
|
||||
elif selection ==4: #投标人须知前附表和正文
|
||||
begin_page=1
|
||||
begin_pattern = re.compile(
|
||||
r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人须知|磋商须知|供应商须知)+|(?:一\s*、\s*)?(?:投标人须知|磋商须知|供应商须知)前附表)', re.MULTILINE
|
||||
)
|
||||
end_pattern=re.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
|
||||
)
|
||||
local_output_suffix = "tobidders_notice"
|
||||
else:
|
||||
print("无效的选择")
|
||||
return None
|
||||
@ -313,7 +359,7 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
|
||||
|
||||
def truncate_pdf_multiple(input_path, output_folder):
    """Run truncate_pdf_main for selections 1 through 4 and collect the outputs.

    Args:
        input_path: File or folder handed to truncate_pdf_main.
        output_folder: Folder handed to truncate_pdf_main.

    Returns:
        list: Concatenation of the results of every selection that produced
        something.
    """
    truncate_files = []
    # Selection 4 (bidder-notice table and body) is included; the stale
    # pre-commit loop header that stopped at 3 is dropped.
    for selection in range(1, 5):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # truncate_pdf_main has a `return None` path; extending with None
        # would raise TypeError and discard the results gathered so far.
        if files:
            truncate_files.extend(files)
    return truncate_files
|
||||
@ -321,8 +367,8 @@ def truncate_pdf_multiple(input_path, output_folder):
|
||||
|
||||
# TODO:交通智能系统和招标(1)(1)文件有问题
|
||||
if __name__ == "__main__":
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
|
||||
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次).pdf"
|
||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4"
|
||||
# truncate_pdf_multiple(input_path,output_folder)
|
||||
selection = 3 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查
|
||||
selection = 4 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2 4.投标人须知前附表
|
||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||
|
Loading…
x
Reference in New Issue
Block a user