diff --git a/flask_app/main/download.py b/flask_app/main/download.py index 5b4a481..e7032c6 100644 --- a/flask_app/main/download.py +++ b/flask_app/main/download.py @@ -62,9 +62,14 @@ def download_file(url, local_filename): if __name__ == '__main__': # 测试下载的URL - test_url ="https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/01%20%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6-%E5%A4%A7%E8%8D%94%E5%8E%BF%E5%85%AC%E5%AE%89%E5%B1%80%E6%83%85%E6%8C%87%E5%8B%A4%E8%88%86%E4%B8%80%E4%BD%93%E5%8C%96%E5%B9%B3%E5%8F%B0%E5%BB%BA%E8%AE%BE%E9%A1%B9%E7%9B%AE%28%E4%BA%8C%E6%AC%A1%29.pdf?Expires=1728751377&OSSAccessKeyId=TMP.3KeYhAGeJr2LGiNctSPvSmdSwxhrU8pbaDYRJcNCCgv8ijyWN613QahKb3nhXydfAvHaqpw4nTHXMzq7hmTHmNnPA77DgL&Signature=mwPHW8v7dVmHP1udTDL%2ByzllwCE%3D" + test_url ="http://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/tender/444d68db95c3438ca481f029500f4f9b.pdf?Expires=1729325467&OSSAccessKeyId=LTAI5tJmL4Nvg95VjgUNvVL6&Signature=2%2Bax895ik7DoFs%2FHVYeN29%2BEiTo%3D&x-oss-process=doc%2Fedit" local_file_name = 'C:\\Users\\Administrator\\Desktop\\招标文件\\output\\testdownload' - file_path,file_type = download_file(test_url, local_file_name) - if file_path: - print(f"Downloaded file path: {file_path}") - print(file_type) + downloaded = download_file(test_url, local_file_name) + if not downloaded: + print("下载文件失败或不支持的文件类型") + downloaded_filepath, file_type = downloaded + print(downloaded_filepath) + print(file_type) + # 检查文件类型 + if file_type == 4: + print("error") diff --git a/flask_app/main/start_up.py b/flask_app/main/start_up.py index 235fa6b..fb0b2ab 100644 --- a/flask_app/main/start_up.py +++ b/flask_app/main/start_up.py @@ -257,7 +257,7 @@ def process_and_stream(file_url, zb_type): """ logger = g.logger unique_id = g.unique_id - output_folder = f"flask_app/static/output1/{unique_id}" + output_folder = f"flask_app/static/output/{unique_id}" filename = "ztbfile" downloaded_filename = os.path.join(output_folder, filename) diff --git a/flask_app/main/test.py b/flask_app/main/test.py index b618d15..4c789b6 100644 --- a/flask_app/main/test.py +++ b/flask_app/main/test.py @@ -1,36 +1,43 @@ -def second_query(baseinfo_list): - result_list = [] - for item in baseinfo_list: - for key, value in item.items(): - if isinstance(value, dict): - # 检查所有最内层的值是否都是 "未知" - if all(v == "未知" for v in value.values()): - result_list.append(key) - return result_list +import re -# 示例用法 -baseinfo_list = [ - { - "招标人联系方式": { - "名称": "未知", - "地址": "未知", - }, - "项目名称": { - "名称": "项目A", - "地址": "未知", - } - }, - { - "供应商信息": { - "名称": "未知", - "联系方式": "未知", - }, - "合同详情": { - "金额": "100万", - "期限": "未知", - } - } -] -result = second_query(baseinfo_list) -print(result) # 输出: ['招标人联系方式', '供应商信息'] +def read_questions_from_file(file_path): + questions = [] + current_question = "" + current_number = 0 + + with open(file_path, 'r', encoding='utf-8') as file: + for line in file: + line = line.strip() + + if not line: # 跳过空行 + continue + + if line.startswith('#'): # 跳过以#开头的行 + continue + + # 检查是否是新的问题编号,例如 "1." + match = re.match(r'^(\d+)\.', line) + if match: + # 如果有之前的问题,保存它 + if current_question: + questions.append(current_question.strip()) + + # 开始新的问题 + current_number = int(match.group(1)) + # 提取问题内容,去掉编号和点 + current_question = line.split('.', 1)[1].strip() + "\n" + else: + # 继续添加到当前问题 + current_question += line + "\n" + + # 添加最后一个问题(如果存在) + if current_question: + questions.append(current_question.strip()) + + return questions + + +qualification_review_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\资格评审.txt' +ques=read_questions_from_file(qualification_review_file_path) +print(ques) \ No newline at end of file diff --git a/flask_app/main/基础信息整合快速版.py b/flask_app/main/基础信息整合快速版.py index 3d44dde..13858ae 100644 --- a/flask_app/main/基础信息整合快速版.py +++ b/flask_app/main/基础信息整合快速版.py @@ -1,7 +1,8 @@ +import copy import json import threading import time - +import concurrent.futures from flask_app.main.json_utils import clean_json_string, nest_json_under_key,rename_outer_key from flask_app.main.投标人须知正文提取指定内容 import extract_from_notice from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge, merge_json_to_list @@ -113,20 +114,6 @@ def judge_consortium_bidding(baseinfo_list): baseinfo_list[:] = updated_list return accept_bidding -def generate_query(baseinfo_list): - questions_list = [] - for item in baseinfo_list: - # print(json.dumps(item, ensure_ascii=False, indent=4)) - for key, value in item.items(): - if value == "未知" or (isinstance(value, dict) and all(v == "未知" for v in value.values())): - question = ( - f"根据该招标文件中的信息,{key}的内容是怎样的?" - f"请按json格式给我提供信息,键名是'{key}'," - f"若存在未知信息,在对应的键值中填'未知'。" - ) - questions_list.append(question) - return questions_list - def update_baseinfo_lists(baseinfo_list1, baseinfo_list2): # 创建一个字典,用于存储 baseinfo_list1 中的所有键值对 combined_dict = {} @@ -167,7 +154,18 @@ def process_judge_questions(judge_file_path, chosen_numbers, truncate0, baseinfo for question, response in res2: baseinfo_list1.append(clean_json_string(response)) -def process_questions_list(questions_list, truncate2): +def process_baseinfo_list(baseinfo_list, truncate2): + questions_list = [] + for item in baseinfo_list: + # print(json.dumps(item, ensure_ascii=False, indent=4)) + for key, value in item.items(): + if value == "未知" or (isinstance(value, dict) and all(v == "未知" for v in value.values())): + question = ( + f"根据该招标文件中的信息,{key}的内容是怎样的?" + f"请按json格式给我提供信息,键名是'{key}'," + f"若存在未知信息,在对应的键值中填'未知'。" + ) + questions_list.append(question) if questions_list: file_id = upload_file(truncate2) baseinfo_results = multi_threading(questions_list, "", file_id, 2) @@ -200,35 +198,27 @@ def combine_basic_info(merged_baseinfo_path,truncate0,truncate2, clause_path): baseinfo_list1 = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else [] chosen_numbers, merged = merge_json_to_list(baseinfo_list1.pop()) + baseinfo_list1_copy = copy.deepcopy(baseinfo_list1) baseinfo_list1.append(merged) - - questions_list=generate_query(baseinfo_list1) - # judge_file_path = 'flask_app/static/提示词/是否相关问题qianwen-long.txt' judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题qianwen-long.txt' - # 创建两个线程 - thread1 = threading.Thread(target=process_judge_questions, - args=(judge_file_path, chosen_numbers, truncate0, baseinfo_list1)) - thread2 = threading.Thread(target=process_questions_list, args=(questions_list, truncate2)) - # 启动线程 - thread1.start() - thread2.start() + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + # 提交两个任务 + future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, truncate0, baseinfo_list1) + future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, truncate2) - # 等待两个线程完成 - thread1.join() - thread2.join() + # 等待两个任务完成并获取结果 + future1.result() # process_judge_questions 直接修改 baseinfo_list1,不需要返回值 + baseinfo_list2 = future2.result() - # 处理结果 - # baseinfo_list1 已经在 process_judge_questions 中被更新 - baseinfo_list2 = process_questions_list(questions_list, truncate2) + # 如果需要,合并或处理 baseinfo_list1 和 baseinfo_list2 + updated_list = update_baseinfo_lists(baseinfo_list1, baseinfo_list2) - updated_list=update_baseinfo_lists(baseinfo_list1,baseinfo_list2) - - rebidding_situation = extract_from_notice(clause_path, 3) # "重新招标, 不再招标和终止招标"需从投标人须知正文提取 + rebidding_situation = extract_from_notice(clause_path, 3) update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标") updated_list.append(update_json) - aggregated_baseinfo = aggregate_basic_info_engineering(updated_list) # 现在是一个字典 + aggregated_baseinfo = aggregate_basic_info_engineering(updated_list) return {"基础信息": aggregated_baseinfo} #TODO:先不带投标人须知正文,如果是未知,再直接问正文, diff --git a/flask_app/main/多线程提问.py b/flask_app/main/多线程提问.py index 5438deb..3fd104d 100644 --- a/flask_app/main/多线程提问.py +++ b/flask_app/main/多线程提问.py @@ -40,10 +40,14 @@ def read_questions_from_file(file_path): with open(file_path, 'r', encoding='utf-8') as file: for line in file: line = line.strip() + if not line: # 跳过空行 continue - # 检查是否是新的问题编号 + if line.startswith('#'): # 跳过以#开头的行 + continue + + # 检查是否是新的问题编号,例如 "1." match = re.match(r'^(\d+)\.', line) if match: # 如果有之前的问题,保存它 @@ -52,12 +56,13 @@ def read_questions_from_file(file_path): # 开始新的问题 current_number = int(match.group(1)) + # 提取问题内容,去掉编号和点 current_question = line.split('.', 1)[1].strip() + "\n" else: # 继续添加到当前问题 current_question += line + "\n" - # 添加最后一个问题 + # 添加最后一个问题(如果存在) if current_question: questions.append(current_question.strip()) diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index 3ef77bf..0a13bfe 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -58,7 +58,7 @@ def extract_common_header(pdf_path): return '\n'.join(common_headers) -def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page): +def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header): """ 从原始PDF截取指定范围的页面并保存到新的PDF文件中。 如果 output_suffix 是 'notice',则额外保存 start_page 之前的页面。 @@ -89,11 +89,25 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en print(f"无效的页面范围: {start_page} 到 {end_page}") return "" - # 如果 output_suffix 是 'notice',保存 start_page 之前的页面 + # 如果 output_suffix 是 'notice',保存 start_page 之前的页面(若遇到'目录',则截取到'目录') if output_suffix == 'notice' and start_page > 0: before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") before_doc = PdfWriter() - for page_num in range(0, start_page): + toc_page = -1 + + # 查找目录页 + for page_num in range(min(start_page, total_pages)): + page_text = pdf_document.pages[page_num].extract_text() + cleaned_text = clean_page_content(page_text, common_header) + if re.search(r'目\s*录', cleaned_text, re.MULTILINE): + toc_page = page_num + break + + # 确定截取的页数 + pages_to_extract = toc_page + 1 if toc_page != -1 else start_page + + # 提取页面 + for page_num in range(pages_to_extract): before_doc.add_page(pdf_document.pages[page_num]) with open(before_pdf_path, 'wb') as f_before: before_doc.write(f_before) @@ -159,7 +173,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix): print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: - return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page) + return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header) elif output_suffix == "invalid": pdf_document = PdfReader(pdf_path) total_pages = len(pdf_document.pages) @@ -167,7 +181,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix): total = int(total_pages * 2 / 3) start_page = last_begin_index end_page = min(90, total) - return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page) + return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header) def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): @@ -208,7 +222,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter print(f"first: 未找到起始或结束页在文件 {pdf_path} 中!") return "" else: - return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page) + return save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page,common_header) def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): @@ -337,8 +351,7 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na desired_suffixes = [ f'{base_file_name}_before.pdf', f'{base_file_name}_notice.pdf', - f'{base_file_name}_tobidders_notice_table.pdf', - f'{base_file_name}_tobidders_notice.pdf' + f'{base_file_name}_tobidders_notice_table.pdf' ] all_pdfs_to_merge = [] @@ -416,7 +429,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder,selections): if __name__ == "__main__": # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf" # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74" - input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf" + input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest16.pdf" output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output" files=truncate_pdf_multiple(input_path,output_folder) # selections = [5, 1] # 仅处理 selection 5、1 和 3 diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py index 911c118..275f930 100644 --- a/flask_app/main/招标文件解析.py +++ b/flask_app/main/招标文件解析.py @@ -289,7 +289,6 @@ def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_ if __name__ == "__main__": output_folder = "flask_app/static/output/zytest1" - # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf") # truncate1=os.path.join(output_folder,"ztb_evaluation_method.pdf") # clause_path = convert_clause_to_json(truncate1, output_folder) @@ -299,7 +298,6 @@ if __name__ == "__main__": file_type = 1 #1:docx 2:pdf 3:其他 input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf" # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11") - print("zy") preprocess_files(output_folder, input_file, file_type, "zytest1") end_time = time.time() diff --git a/flask_app/main/资格审查模块.py b/flask_app/main/资格审查模块.py index 471bd3f..e8dc177 100644 --- a/flask_app/main/资格审查模块.py +++ b/flask_app/main/资格审查模块.py @@ -3,7 +3,7 @@ import os from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json from flask_app.main.json_utils import nest_json_under_key, extract_content_from_json from flask_app.main.形式响应评审 import process_reviews -from flask_app.main.资格评审 import process_qualification +from flask_app.main.资格评审old import process_qualification from flask_app.main.通义千问long import upload_file, qianwen_long from concurrent.futures import ThreadPoolExecutor diff --git a/flask_app/货物标/货物标截取pdf.py b/flask_app/货物标/货物标截取pdf.py index 1db6b4d..ab366ff 100644 --- a/flask_app/货物标/货物标截取pdf.py +++ b/flask_app/货物标/货物标截取pdf.py @@ -340,12 +340,16 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, return path1, path2 + def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): try: exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') pdf_document = PdfReader(pdf_path) patterns = None begin_page = 0 + start_page = None + end_page = None + if output_suffix == "procurement": patterns = [get_patterns_for_procurement()] begin_page = 5 @@ -353,18 +357,20 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): patterns = [get_patterns_for_evaluation_method()] begin_page = 5 elif output_suffix == "qualification1": - patterns = [get_patterns_for_qualification()] # This now returns a tuple of pattern pairs + patterns = [get_patterns_for_qualification()] begin_page = 5 elif output_suffix == "notice": patterns = [get_patterns_for_notice()] begin_page = 0 - # Try each set of patterns until a valid range is found - for pattern_pair in patterns: - start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page, - common_header, - exclusion_pattern, output_suffix) - if start_page is not None and end_page is not None: - break + + if patterns: + for pattern_pair in patterns: + start_page, end_page = extract_pages_generic(pdf_document, pattern_pair[0], pattern_pair[1], begin_page, + common_header, + exclusion_pattern, output_suffix) + if start_page is not None and end_page is not None: + break + if start_page is None or end_page is None: if output_suffix == "qualification1": print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") @@ -373,14 +379,16 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header): if len(temp) > 0: return temp[0] else: - return "" # 返回空字符串 + return "" else: print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") - return "" # 返回空字符串 - return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) + return "" + + return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix, + ) except Exception as e: print(f"Error in extract_pages_twice: {e}") - return "" # 返回空字符串 + return "" # def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):