From ccab078ac36690a4bb9758b74d08b0a778c7a133 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 5 Nov 2024 16:29:32 +0800 Subject: [PATCH] =?UTF-8?q?11.5=E8=B4=A7=E7=89=A9=E6=A0=87=E6=88=AA?= =?UTF-8?q?=E5=8F=96=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/format_change.py | 4 +- flask_app/general/读取文件/按页读取pdf.py | 2 +- flask_app/main/截取pdf.py | 4 +- flask_app/货物标/截取pdf货物标版.py | 228 +++++++++++++++------- flask_app/货物标/资格审查main.py | 64 ++++-- 5 files changed, 208 insertions(+), 94 deletions(-) diff --git a/flask_app/general/format_change.py b/flask_app/general/format_change.py index f350507..e6dc960 100644 --- a/flask_app/general/format_change.py +++ b/flask_app/general/format_change.py @@ -184,9 +184,9 @@ if __name__ == '__main__': # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf" # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf" local_path_in="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\6.2定版视频会议磋商文件.doc" - # downloaded_file=doc2docx(local_path_in) + downloaded_file=doc2docx(local_path_in) # downloaded_file=pdf2docx(local_path_in) - downloaded_file=docx2pdf(local_path_in) + # downloaded_file=docx2pdf(local_path_in) print(downloaded_file) diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 1828641..5e0687d 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -95,7 +95,7 @@ def extract_text_by_page(file_path): if __name__ == '__main__': - file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf' + file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf" # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index 66e388b..c6b09be 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -16,8 +16,6 @@ def get_global_logger(unique_id): logger = None - - def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header): try: # 获取文件基本名称 @@ -238,7 +236,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l begin_pattern = r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])' end_pattern = re.compile( r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*$|' - r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知', + r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]\s*$|\s*评标(办法|方法)前附表\s*$|投标人须知', re.MULTILINE ) start_page = None diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index e051811..a3c109e 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -110,12 +110,14 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter if output_suffix == "tobidders_notice": exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') start_page, mid_page, end_page = extract_pages_tobidders_notice( - pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern + pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern ) - - if start_page is None or end_page is None or mid_page is None: - print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") - return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header) + if not start_page or not mid_page or not end_page: + print(f"三次提取tobidders_notice均失败!!{pdf_path}") + return "","" + # if start_page is None or end_page is None or mid_page is None: + # print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") + # return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page) path1 = save_extracted_pages(pdf_document, start_page, mid_page, pdf_path, output_folder, "tobidders_notice_part1") path2 = save_extracted_pages(pdf_document, mid_page, end_page, pdf_path, output_folder, "tobidders_notice_part2") @@ -138,7 +140,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) if start_page is None or end_page is None: print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") - return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header) + return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page) elif output_suffix == "qualification1": truncate_pdf_main(pdf_path, output_folder, 2, "qualification3") # 合并'资格审查'章节和'评标办法'章节 return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) @@ -175,28 +177,33 @@ def get_patterns_for_evaluation_method(): return begin_pattern, end_pattern +# def get_patterns_for_qualification(): +# # # 原始匹配逻辑 +# # begin_pattern_original = re.compile( +# # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE) +# # end_pattern_original = re.compile( +# # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) +# +# # 新匹配逻辑 +# begin_pattern_new = re.compile( +# r'^资格性检查', re.MULTILINE) +# end_pattern_new = re.compile( +# r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) +# +# return begin_pattern_new, end_pattern_new def get_patterns_for_qualification(): - # # 原始匹配逻辑 - # begin_pattern_original = re.compile( - # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE) - # end_pattern_original = re.compile( - # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) - - # 新匹配逻辑 - begin_pattern_new = re.compile( - r'^资格性检查', re.MULTILINE) - end_pattern_new = re.compile( - r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) - - return begin_pattern_new, end_pattern_new -def get_patterns_for_qualification2(): + # begin_pattern = re.compile( + # r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]|资格性检查|资格审查表).*(?:资格|符合性)?.*$', + # re.MULTILINE + # ) begin_pattern = re.compile( - r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]).*(?:资格|符合性).*$', + r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?|资格性检查|资格审查表|符合性审查表)', #目前可以匹配 附录3,因为允许了'附录'匹配 re.MULTILINE ) end_pattern = re.compile( - r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]).*$|' - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', + r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|' + # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', #更宽松的匹配 re.MULTILINE ) return begin_pattern, end_pattern @@ -239,7 +246,7 @@ def get_patterns_for_notice_twice(): # break # return start_page, mid_page, end_page -def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, common_header, exclusion_pattern): +def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern): """ 从PDF文档中提取起始页、中间页和结束页。 @@ -274,7 +281,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm end_page = None chapter_type = None # 用于存储“章”或“部分” combined_mid_pattern = None # 中间页的组合模式 - + pdf_document = PdfReader(pdf_path) for i, page in enumerate(pdf_document.pages): text = page.extract_text() or "" cleaned_text = clean_page_content(text, common_header) @@ -345,8 +352,7 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm ) else: # 如果提供了固定的 end_pattern,则使用默认的 mid_pattern - base_mid_pattern = r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|' \ - r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则)' + base_mid_pattern = r'.*[((]?\s*[一二12]?[))]?\s*[、..]*\s*(说\s*明|总\s*则)\s*$' #可以匹配"东莞市粤隆招标有限公司编制 46一、说明" combined_mid_pattern = re.compile( rf'{base_mid_pattern}', re.MULTILINE @@ -355,6 +361,8 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm # 识别中间页 if start_page is not None and mid_page is None and combined_mid_pattern: + if (start_page+1==i) and re.search(local_begin_pattern,cleaned_text): + continue if re.search(combined_mid_pattern, cleaned_text): mid_page = i @@ -374,23 +382,83 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm # 如果第一次提取失败,则使用新的 begin_pattern 和 end_pattern 重新提取 if not (start_page and mid_page and end_page): - # 定义新的 begin_pattern 和 end_pattern - new_begin_pattern = re.compile( - r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|' - r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', - re.MULTILINE - ) - new_end_pattern = re.compile( - r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$', - re.MULTILINE - ) - # 第二次提取尝试,使用新的模式 - start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern) + print(f"第二次尝试 tobidders_notice!{pdf_path}") + pdf_document = PdfReader(pdf_path) + start_page,mid_page,end_page=extract_pages_twice_tobidders_notice(pdf_document,common_header,begin_page) + if start_page and end_page and mid_page: + return start_page, mid_page, end_page + else: + # 定义新的 begin_pattern 和 end_pattern + new_begin_pattern = re.compile( + r'.*(?:投标人|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|' + r'(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', + re.MULTILINE + ) + new_end_pattern = re.compile( + r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*$', + re.MULTILINE + ) + print("第三次尝试 tobidders_notice! ") + # 第二次提取尝试,使用新的模式 + start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern) return start_page, mid_page, end_page -def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header): +#投标人须知分为两个章节 +# def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page): +# begin_pattern = re.compile( +# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+' +# ) +# end_pattern = re.compile( +# r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)' # 捕获中文部分 +# ) +# exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词 +# +# pdf_document = PdfReader(pdf_path) +# exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') +# +# # 提取第一部分 +# start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header) +# if start_page1 is None or end_page1 is None: +# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") +# return "", "" +# +# # 保存第一部分的路径 +# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder, +# "tobidders_notice_part1") +# +# # 提取第二部分 +# start_page2 = end_page1 +# +# # 检查end_page1页面的内容 +# text = pdf_document.pages[end_page1].extract_text() or "" +# cleaned_text = clean_page_content(text, common_header) +# match = end_pattern.search(cleaned_text) +# +# if match: +# # 获取匹配到的中文部分 +# chapter_title = match.group(1) +# # 检查是否包含排除关键词 +# if any(word in chapter_title for word in exclusion_words): +# # 如果包含排除关键词,直接返回相同的路径 +# return path1, path1 +# +# # 如果不包含排除关键词,继续提取第二部分 +# _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header, +# exclusion_pattern) +# +# if end_page2 is None: +# print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!") +# return path1, path1 +# +# # 保存第二部分的路径 +# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder, +# "tobidders_notice_part2") +# +# return path1, path2 + +def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page): begin_pattern = re.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知)+' ) @@ -399,18 +467,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, ) exclusion_words = ["合同", "评标", "开标","评审","采购","资格"] # 在这里添加需要排除的关键词 - pdf_document = PdfReader(pdf_path) exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') # 提取第一部分 - start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header) + start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header) if start_page1 is None or end_page1 is None: - print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") - return "", "" + return "", "","" - # 保存第一部分的路径 - path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder, - "tobidders_notice_part1") + # # 保存第一部分的路径 + # path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder, + # "tobidders_notice_part1") # 提取第二部分 start_page2 = end_page1 @@ -426,52 +492,71 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, # 检查是否包含排除关键词 if any(word in chapter_title for word in exclusion_words): # 如果包含排除关键词,直接返回相同的路径 - return path1, path1 + return start_page1, end_page1,end_page1 # 如果不包含排除关键词,继续提取第二部分 _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header, exclusion_pattern) if end_page2 is None: - print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!") - return path1, path1 + return start_page1, end_page1,end_page1 - # 保存第二部分的路径 - path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder, - "tobidders_notice_part2") + # # 保存第二部分的路径 + # path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder, + # "tobidders_notice_part2") - return path1, path2 + return start_page1, end_page1,end_page2 +def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_pattern, common_header): + start_page = None + end_page = None + include_keywords = ["资格审查", "资质审查", "符合性审查"] + exclude_keywords = ["声明函", "承诺函"] + # 从章节开始后的位置进行检查 + for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page): + text = page.extract_text() + if text: + cleaned_text = clean_page_content(text, common_header) + # 确定起始页,需在last_begin_index之后 + if ( + any(keyword in cleaned_text for keyword in include_keywords) and + all(keyword not in cleaned_text for keyword in exclude_keywords) and + start_page is None + ): + if begin_pattern.search(cleaned_text): + print("附件") + start_page = i + # 确定结束页 + if start_page is not None and end_pattern.search(cleaned_text): + if i > start_page: + end_page = i + break # 找到结束页后退出循环 + return start_page,end_page -def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): +def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,begin_page): try: exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') pdf_document = PdfReader(pdf_path) patterns = None - begin_page = 0 start_page = None end_page = None if output_suffix == "procurement": patterns = [get_patterns_for_procurement()] - begin_page = 5 elif output_suffix == "evaluation_method" or output_suffix == "qualification2" or output_suffix == "qualification3": patterns = [get_patterns_for_evaluation_method()] - begin_page = 5 elif output_suffix == "qualification1": - patterns = [get_patterns_for_qualification(),get_patterns_for_qualification2()] - begin_page = 5 + patterns = [get_patterns_for_qualification()] elif output_suffix == "notice": patterns = [get_patterns_for_notice(),get_patterns_for_notice_twice()] - begin_page = 0 if patterns: for pattern_pair in patterns: - # print(pattern_pair[0]) - # print(pattern_pair[1]) - start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page, - common_header, - exclusion_pattern, output_suffix) + if output_suffix=="qualification1": + start_page,end_page=extract_pages_qualification(pdf_document,begin_page,pattern_pair[0],pattern_pair[1],common_header) + else: + start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page, + common_header,exclusion_pattern, output_suffix) if start_page is not None and end_page is not None: break @@ -544,7 +629,7 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo def get_start_and_common_header(input_path): common_header = extract_common_header(input_path) last_begin_index = 0 - begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请)[\))]?\s*$',re.MULTILINE) + begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',re.MULTILINE) pdf_document = PdfReader(input_path) for i, page in enumerate(pdf_document.pages): if i > 10: @@ -604,8 +689,8 @@ def process_input(input_path, output_folder, selection, output_suffix): # 根据选择设置对应的模式和结束模式 if selection == 1: - begin_pattern = re.compile(r'.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE) - end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) + begin_pattern = re.compile(r'.*(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$', re.MULTILINE) + end_pattern = re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE) local_output_suffix = "notice" elif selection == 2: begin_pattern = re.compile( @@ -737,9 +822,10 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1 logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}") return truncate_files -#TODO:截取还有问题 # TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 唐山投标只有正文,没有附表 + +#ztbfile.pdf jiao通 广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_qualification2.pdf 唐山 包头 if __name__ == "__main__": input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf" @@ -752,6 +838,6 @@ if __name__ == "__main__": # selections = [1,4] # files=truncate_pdf_specific_goods(input_path,output_folder,selections) # print(files) - selection = 1# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 + selection = 4# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 generated_files = truncate_pdf_main(input_path, output_folder, selection) # print(generated_files) \ No newline at end of file diff --git a/flask_app/货物标/资格审查main.py b/flask_app/货物标/资格审查main.py index 9499949..34ae4ec 100644 --- a/flask_app/货物标/资格审查main.py +++ b/flask_app/货物标/资格审查main.py @@ -2,12 +2,12 @@ import json import os import re -from flask_app.general.通义千问long import upload_file +from flask_app.general.通义千问long import upload_file, qianwen_long from flask_app.general.多线程提问 import multi_threading from flask_app.general.json_utils import clean_json_string from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json import copy - +import concurrent.futures # 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据 # 对于同级的键,如果数量>1且键名都统一,那么将键名去掉,用列表保持它们的键值 @@ -379,23 +379,53 @@ def combine_qualification_review(invalid_path, output_folder, qualification_path } } - def process_file(file_path): + def process_file(file_path, invalid_path): file_id = upload_file(file_path) - user_queries = [ - "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。", - "该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。" - ] - results = multi_threading(user_queries, "", file_id, 2) - combined_res = {} - for _, response in results: - if response: - # print(response) - cleaned_data = clean_json_string(response) - processed = process_dict(preprocess_dict(cleaned_data)) - combined_res.update(processed) - else: - print(f"Warning: No response for a query.") + first_query = """ + 该文档中是否有关于资格性审查标准的具体内容,是否有关于符合性审查标准的具体内容?请以json格式给出回答,外键分别为'资格性审查'和'符合性审查',键值仅限于'是','否',输出格式示例如下: + { + "资格性审查":"是", + "符合性审查":"是" + } + """ + qianwen_ans = clean_json_string(qianwen_long(file_id, first_query)) + user_queries = [ + { + "key": "资格性审查", + "query": "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。" + }, + { + "key": "符合性审查", + "query": "该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。" + } + ] + combined_res = {} + file_id2 = None # 延迟上传 invalid_path + def process_single_query(query_info): + nonlocal file_id2 + key = query_info["key"] + query = query_info["query"] + # 根据键值决定使用哪个 file_id + if qianwen_ans.get(key) == "否": + if not file_id2: + file_id2 = upload_file(invalid_path) + current_file_id = file_id2 + else: + current_file_id = file_id + + # 调用大模型获取回答 + ans = qianwen_long(current_file_id, query) + cleaned_data = clean_json_string(ans) + processed = process_dict(preprocess_dict(cleaned_data)) + return processed + + # 使用线程池并行处理查询 + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + futures = [executor.submit(process_single_query, q) for q in user_queries] + for future in concurrent.futures.as_completed(futures): + result = future.result() + combined_res.update(result) return combined_res try: