diff --git a/flask_app/general/doubao.py b/flask_app/general/doubao.py index f3f1562..119c782 100644 --- a/flask_app/general/doubao.py +++ b/flask_app/general/doubao.py @@ -7,7 +7,7 @@ import requests from ratelimit import sleep_and_retry, limits from flask_app.general import table_ocr -from flask_app.general.file2markdown import convert_pdf_to_markdown +from flask_app.general.file2markdown import convert_file_to_markdown from flask_app.general.clean_pdf import extract_common_header, clean_page_content from flask_app.general.table_ocr import CommonOcr diff --git a/flask_app/general/file2markdown.py b/flask_app/general/file2markdown.py index 70e1351..1bfc1b0 100644 --- a/flask_app/general/file2markdown.py +++ b/flask_app/general/file2markdown.py @@ -40,8 +40,7 @@ class TextinOcr(object): } return requests.post(url, data=image, headers=headers, params=options) - -def convert_pdf_to_markdown(file_path): +def convert_file_to_markdown(file_path): output_folder=os.path.dirname(os.path.abspath(file_path)) app_id=os.getenv("TEXTIN_APP_ID") app_key=os.getenv("TEXTIN_APP_KEY") @@ -75,6 +74,6 @@ def convert_pdf_to_markdown(file_path): if __name__ == "__main__": # file_path=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf" - file_path=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0516-001-招标文件_procurement.pdf" - res=convert_pdf_to_markdown(file_path) + file_path=r"C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\ztbfile_notice.docx" + res=convert_file_to_markdown(file_path) print(res) \ No newline at end of file diff --git a/flask_app/general/insert_del_pagemark.py b/flask_app/general/insert_del_pagemark.py new file mode 100644 index 0000000..0b31b26 --- /dev/null +++ b/flask_app/general/insert_del_pagemark.py @@ -0,0 +1,93 @@ +import os +import re +from io import BytesIO +from PyPDF2 import PdfReader, PdfWriter +from docx import Document +from reportlab.pdfgen import canvas +from reportlab.lib.units import cm + +def insert_mark(input_pdf_path): + try: + # 构建输出文件路径,与输入文件同目录,名称为 invalid_added.pdf + input_dir = os.path.dirname(input_pdf_path) + output_pdf_path = os.path.join(input_dir, "invalid_added.pdf") + + # 打开输入的PDF文件 + with open(input_pdf_path, 'rb') as file: + pdf_reader = PdfReader(file) + pdf_writer = PdfWriter() + + total_pages = len(pdf_reader.pages) + + # 遍历每一页 + for page_num in range(total_pages): + page = pdf_reader.pages[page_num] + pdf_writer.add_page(page) + + # 创建一个内存中的PDF,用于存放带有文本的空白页 + packet = BytesIO() + # 获取当前页面的宽度和高度 + page_width = float(page.mediabox.width) + page_height = float(page.mediabox.height) + # 使用reportlab创建一个新的PDF页面 + c = canvas.Canvas(packet, pagesize=(page_width, page_height)) + + # 计算文本的位置(单位:点,1厘米 ≈ 28.35点) + x_position = 2.3 * cm + y_position = page_height - 0.5 * cm # 从顶部开始计算,因此用页面高度减去上边距 + + # 绘制文本,使用 (page_num + 1) 作为索引 + c.setFont("Helvetica", 12) # 设置字体和大小 + c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]") + + # 完成绘制 + c.save() + + # 将内存中的PDF读入PyPDF2 + packet.seek(0) + new_pdf = PdfReader(packet) + blank_page = new_pdf.pages[0] + + # 将带有文本的空白页添加到写入器 + pdf_writer.add_page(blank_page) + + # 将所有页面写入输出的PDF文件 + with open(output_pdf_path, 'wb') as output_file: + pdf_writer.write(output_file) + print("invalid_file added successfully!") + return output_pdf_path + + except Exception as e: + print(f"发生错误: {e}") + return "" + + +def delete_mark(docx_path): + """ + 删除docx文档中的所有标记 + :param docx_path: docx文件路径 + """ + docx = Document(docx_path) + 
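+    # Walk every paragraph of the converted docx:
+    #   1) a paragraph whose text contains "[$$index_mark_" is a page marker added by
+    #      insert_mark() and is removed outright;
+    #   2) after a marker is removed, the next paragraph whose XML carries a w:sectPr
+    #      element is the section break left behind by converting that extra marker
+    #      page to docx, so it is removed as well to avoid a blank page.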
find_flag = False + for para in docx.paragraphs: + # 匹配标记: [$$index_mark_X$$] + if "[$$index_mark_" in para.text: + para._element.getparent().remove(para._element) # 删标记 + find_flag = True + if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符 + para._element.getparent().remove(para._element) + find_flag = False + + # 获取文件路径信息 + import os + dir_path = os.path.dirname(docx_path) + new_file_path = os.path.join(dir_path, 'invalid_del.docx') + + # 保存修改后的文档 + docx.save(new_file_path) + +if __name__ == '__main__': + input=r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\ztbfile_tobidders_notice_part2.pdf' + # add_blank_pages_v2(input) + doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx' + delete_mark(doc_path) \ No newline at end of file diff --git a/flask_app/general/insert_pagenum.py b/flask_app/general/insert_pagenum.py deleted file mode 100644 index 3284b95..0000000 --- a/flask_app/general/insert_pagenum.py +++ /dev/null @@ -1,53 +0,0 @@ -import PyPDF2 -from reportlab.lib.units import cm -from reportlab.pdfgen import canvas -from io import BytesIO - -def add_blank_pages_v2(input_pdf_path, output_pdf_path): - # 打开输入的PDF文件 - with open(input_pdf_path, 'rb') as file: - pdf_reader = PyPDF2.PdfReader(file) - pdf_writer = PyPDF2.PdfWriter() - - total_pages = len(pdf_reader.pages) - - # 遍历每一页 - for page_num in range(total_pages): - page = pdf_reader.pages[page_num] - pdf_writer.add_page(page) - - # 创建一个内存中的PDF,用于存放带有文本的空白页 - packet = BytesIO() - # 获取当前页面的宽度和高度 - page_width = float(page.mediabox.width) - page_height = float(page.mediabox.height) - # 使用reportlab创建一个新的PDF页面 - c = canvas.Canvas(packet, pagesize=(page_width, page_height)) - - # 计算文本的位置(单位:点,1厘米 ≈ 28.35点) - x_position = 2.3 * cm - y_position = page_height - 0.5 * cm # 从顶部开始计算,因此用页面高度减去上边距 - - # 绘制文本,使用 (page_num + 1) 作为索引 - c.setFont("Helvetica", 12) # 设置字体和大小 - c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]") - - # 完成绘制 - c.save() - - # 将内存中的PDF读入PyPDF2 - packet.seek(0) - new_pdf = PyPDF2.PdfReader(packet) - blank_page = new_pdf.pages[0] - - # 将带有文本的空白页添加到写入器 - pdf_writer.add_page(blank_page) - - # 将所有页面写入输出的PDF文件 - with open(output_pdf_path, 'wb') as output_file: - pdf_writer.write(output_file) - -if __name__ == '__main__': - input=r'C:\Users\Administrator\Downloads\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.pdf' - output=r'C:\Users\Administrator\Downloads\output.pdf' - add_blank_pages_v2(input,output) \ No newline at end of file diff --git a/flask_app/general/商务技术评分提取.py b/flask_app/general/商务技术评分提取.py index 19fa5ea..063de2f 100644 --- a/flask_app/general/商务技术评分提取.py +++ b/flask_app/general/商务技术评分提取.py @@ -221,8 +221,8 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type): query = ( """根据该文档,你判断它是否有关于技术评分或商务评分或投标报价的具体的评分及要求如果有,返回'是',否则返回'否'。 要求与指南: - 1. 竞争性磋商文件通常无评分要求 - 2. 评分要求主要以表格形式呈现,且会有评分因素及评分要求。 + 1. 评分要求主要以表格形式呈现,且有评分因素及评分要求、标准。 + 2. 竞争性磋商文件通常无评分要求,但若满足'1.'的内容,也请返回'是'。 3. 仅返回'是'或'否',不需要其他解释或内容。 """ ) # 应对竞争性谈判这种无评分要求的情况 @@ -242,7 +242,7 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type): 要求与指南: 1. 请首先定位评分细则的表格,不要回答有关资格审查的内容,也不要从评标办法正文中提取回答 - 2. 你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体 + 2. 你无需将表格的单元格内的内容进行拆分,需要将它视为一个整体,尤其是 3. '评分'的键值不能是一个范围数字,如'0-5分',应该是一个具体数字,如'5分',或者是一个定性的指标如'合格制' 4. 如果该招标活动有多个包,则最外层键名为对应的包名,否则最外层键名为各大评分项 5. 
若表格中商务和技术评分混合一起,请根据实际表格内容进行准确分类。 @@ -405,7 +405,7 @@ def combine_evaluation_standards(evaluation_method_path,invalid_path,zb_type): # 执行 user_query 相关的逻辑 return run_second_qeury(file_id) else: - judge_res,file_id=run_first_query(invalid_path) #调用 + judge_res,file_id=run_first_query(invalid_path) #调用invalid_path看看有没有评分项 if '是' in judge_res: # 执行 user_query 相关的逻辑 return run_second_qeury(file_id) diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index 5e51ace..952d418 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -7,6 +7,8 @@ from flask_app.general.doubao import doubao_model, generate_full_user_query from flask_app.general.通用功能函数 import process_string_list from collections import OrderedDict from docx import Document +from flask_app.general.insert_del_pagemark import insert_mark +from flask_app.general.format_change import pdf2docx # 只读取前附表中的最后一列(省钱,但容易漏内容) @@ -75,53 +77,92 @@ def preprocess_paragraphs(paragraphs): # 定义两个新的正则表达式模式 pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*') pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*') - #排除遇到表格、填空的情况 + # 定义列表项的模式 + list_item_pattern = re.compile( + r'^\s*(' + r'[(\(]\d+[)\)]|' # 匹配:(1) 或 (1) + r'[A-Za-z]\.\s*|' # 匹配:A. 或 b. + r'[一二三四五六七八九十]+、|' # 匹配:一、二、三、 + r'第[一二三四五六七八九十百零]+[章节部分节]|' # 匹配:第x章,第x部分,第x节 + r'[A-Za-z]\d+(?:\.\d+)*[\s\.、.)\)]?|' # 匹配:A1.2 等 + r'\d+(?:\.\d+)+[\s\.、.)\)]?(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|' # 匹配:数字序号如1.1 1.1.1 + r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|' # 数字后空格,空格后非指定关键字 + r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])' # 数字后直接跟顿号或点号 + r')' + ) + + # 是否存在连续超过指定长度的空白字符序列: 排除遇到表格、填空的情况 def has_long_spaces(text, max_space_count=5): return any(len(space) > max_space_count for space in re.findall(r'\s+', text)) + + # 正则表达式用于检测页面标记 + pattern_marker = re.compile(r'\$\$index_mark_\d+\$\$') + + # 辅助函数:查找上一个非空且非标记的段落 + def find_prev_text(current_index): + for i in range(current_index - 1, -1, -1): + text = paragraphs[i].text.strip() + if text and not pattern_marker.search(text): + return text, i + return '', -1 + + # 辅助函数:查找下一个非空且非标记的段落 + def find_next_text(current_index): + for i in range(current_index + 1, len(paragraphs)): + text = paragraphs[i].text.strip() + if text and not pattern_marker.search(text): + return text, i + return '', -1 + while index < len(paragraphs): current_text = paragraphs[index].text.strip() # 去除当前段落的前后空白 + + # 检查当前段落是否为页面标记 + if pattern_marker.search(current_text): + # 动态查找前一个非空段落 + prev_text, prev_index = find_prev_text(index) + # 动态查找后一个非空段落 + next_text, next_index = find_next_text(index) + + # 应用现有的合并逻辑 + if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text): + if not prev_text.endswith(('。', '!', '?')): # ',', ',', 先注释了,如果逗号,可能还没结束。 + # 检查后一个段落是否为列表项 + if not list_item_pattern.match(next_text) and len(prev_text) > 30: + # 合并前后段落 + merged_text = prev_text + ' ' + next_text # 为了可读性添加空格 + if prev_index < len(paragraphs): + # 移除 processed 中的前一个段落 + if processed and processed[-1] == prev_text: + processed.pop() + # 添加合并后的文本 + processed.append(merged_text) + + # 跳过标记以及前后所有空白段落,直到 next_index + index = next_index + 1 + continue # 继续下一个循环 + + # 如果不满足合并条件,跳过标记及其周围的空白段落 + # 计算下一个需要处理的索引 + # 从当前 index 向下,跳过所有连续的空白段落和标记 + skip_index = index + 1 + while skip_index < len(paragraphs): + skip_text = paragraphs[skip_index].text.strip() + if skip_text == '' or pattern_marker.search(skip_text): + skip_index += 1 + else: + break + index = skip_index + continue # 
继续下一个循环 + # 检查当前段落是否匹配任一排除模式 - if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text)<8: + if (pattern_numbered.match(current_text) or pattern_parentheses.match(current_text)) and len(current_text) < 8: # 如果匹配,则跳过当前段落,不添加到processed列表中 index += 1 continue - # 检查当前段落是否为空 - if current_text == '': - # 确保有前一个和后一个段落 - if 0 < index < len(paragraphs) - 1: - prev_text = paragraphs[index - 1].text.strip() # 获取前一个段落的文本 - next_text = paragraphs[index + 1].text.strip() # 获取后一个段落的文本 - - # **新增部分:检查前一个段落和后一个段落都非空** 内部没有超过5个连续空格 - if prev_text and next_text and not has_long_spaces(prev_text) and not has_long_spaces(next_text): - # 检查前一个段落的文本是否不以标点符号结尾 - if not prev_text.endswith((',', ',', '。', '!', '?')): - # 定义列表项的模式 - list_item_pattern = r'^\s*([(\(]\d+[)\)]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)' - - # 检查后一个段落是否以列表模式开头 - is_next_list = re.match(list_item_pattern, next_text) - - # 如果后一个段落不是列表项,且前一个段落长度大于30 - if not is_next_list and len(prev_text) > 30: - # 合并前一个和后一个段落的文本 - merged_text = prev_text + ' ' + next_text # 为了可读性添加空格 - - if processed: - # 更新处理后的最后一个段落 - processed[-1] = merged_text - else: - # 如果列表为空,直接添加合并后的文本 - processed.append(merged_text) - - # 跳过下一个段落,因为它已经被合并 - index += 2 - continue - # 如果没有满足条件,不进行合并,跳过当前空段落 - else: - # 非空段落,添加到处理后的列表中 - processed.append(current_text) + # 如果当前段落不是标记且不匹配排除模式,则添加到处理后的列表中 + processed.append(current_text) index += 1 return processed @@ -366,9 +407,9 @@ def extract_table_with_keywords(data, keywords, follow_up_keywords): # 2. 分割句子,保证句子完整性(按标点符号和序号分割) split_sentences = re.split( r'(?<=[。!?!?\?])|' # 在中文句号、感叹号、问号或分号后面分割 - r'(?=\d+(?:[..]\d+)+)(?!\s*[号条款节章项例页段部步点年月日时分秒个元万])|' # 在类似1.1 1.1.1 的数字序号前分割 - r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元万]))|' # 数字后面跟空格且空格后面不是指定关键字时分割 - r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元万])|' # 在数字后直接跟顿号、半角点号或全角点号时分割 + r'(?=\d+(?:[..]\d+)+)(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|' # 在类似1.1 1.1.1 的数字序号前分割 + r'(?=\d+\s(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万]))|' # 数字后面跟空格且空格后面不是指定关键字时分割 + r'(?=\d+[、..])(?!\s*[号条款节章项例页段部步点年月日时分秒个元千万])|' # 在数字后直接跟顿号、半角点号或全角点号时分割 r'(?=[A-Za-z][..]\s*)|' # 在字母加点(如A.、a.)前分割 r'(?=[A-Za-z]+\s*\d+\s*(?:[..]\s*\d+)*)|' # 在可选字母加数字或多级编号前分割 r'(?=[一二三四五六七八九十]+、)', # 在中文数字加顿号(如一、二、)前分割 @@ -464,48 +505,52 @@ def extract_values_if_contains(data, includes): def handle_query(file_path, user_query, output_file, result_key, keywords): try: excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "本人保证:", "我方"] - follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下','其\s*他\s*情\s*形\s*:','其\s*他\s*情\s*形\s*:'] + follow_up_keywords = [ + r'情\s*形\s*之\s*一', + r'情\s*况\s*之\s*一', + r'下\s*列', + r'以\s*下', + r'其\s*他.*?情\s*形\s*[::]' + ] extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords) # 字典结果 all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes) # 列表 - print(all_texts1) - print(all_texts2) - # - # # table_data_list=read_docx_last_column(file_path) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息' - # table_data_list = read_tables_from_docx(file_path) - # all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords) - # qianwen_txt = all_texts1 + all_tables1 - # # Proceed only if there is content to write - # selected_contents = set() # 使用 set 去重 - # if qianwen_txt: - # with open(output_file, 'w', encoding='utf-8') as file: - # counter = 1 - # for content in qianwen_txt: - # file.write(f"{counter}. {content}\n") - # file.write("..............." 
+ '\n') - # counter += 1 - # user_query = generate_full_user_query(output_file, user_query) - # model_ans=doubao_model(user_query) #豆包 - # # file_id = upload_file(output_file) - # # model_ans = qianwen_long(file_id, user_query) - # # model_ans = qianwen_long_text(file_id, user_query) - # num_list = process_string_list(model_ans) - # print(result_key + "选中的序号:" + str(num_list)) - # - # for index in num_list: - # if index - 1 < len(qianwen_txt): - # content = qianwen_txt[index - 1] - # selected_contents.add(content) - # - # # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 - # selected_contents.update(all_texts2) - # selected_contents.update(all_tables2) - # - # # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 - # if selected_contents: - # res = {result_key: list(selected_contents)} - # else: - # res = {result_key: ""} - # return res + + # table_data_list=read_docx_last_column(file_path) #从投标人须知前附表中提取信息生成列表data,每个元素为'一行信息' + table_data_list = read_tables_from_docx(file_path) + all_tables1, all_tables2 = extract_table_with_keywords(table_data_list, keywords, follow_up_keywords) + qianwen_txt = all_texts1 + all_tables1 + # Proceed only if there is content to write + selected_contents = set() # 使用 set 去重 + if qianwen_txt: + with open(output_file, 'w', encoding='utf-8') as file: + counter = 1 + for content in qianwen_txt: + file.write(f"{counter}. {content}\n") + file.write("..............." + '\n') + counter += 1 + user_query = generate_full_user_query(output_file, user_query) + model_ans=doubao_model(user_query) #豆包 + # file_id = upload_file(output_file) + # model_ans = qianwen_long(file_id, user_query) + # model_ans = qianwen_long_text(file_id, user_query) + num_list = process_string_list(model_ans) + print(result_key + "选中的序号:" + str(num_list)) + + for index in num_list: + if index - 1 < len(qianwen_txt): + content = qianwen_txt[index - 1] + selected_contents.add(content) + + # 无论 qianwen_txt 是否为空,都添加 all_texts2 和 all_tables2 的内容 + selected_contents.update(all_texts2) + selected_contents.update(all_tables2) + + # 如果 selected_contents 不为空,则返回结果,否则返回空字符串 + if selected_contents: + res = {result_key: list(selected_contents)} + else: + res = {result_key: ""} + return res except Exception as e: print(f"handle_query 在处理 {result_key} 时发生异常: {e}") return {result_key: ""} @@ -518,33 +563,42 @@ def handle_query(file_path, user_query, output_file, result_key, keywords): #"以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,每条信息规定了各方不得存在的情形,请回答:在这些信息中,主语是投标人或中标人或供应商或联合体投标各方或磋商小组的信息有哪些?不要返回主语是招标人或采购人或评标委员会的信息,请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,示例返回为[1,4,6],若情况不存在,返回[]。", -def combine_find_invalid(file_path, output_dir): +def combine_find_invalid(invalid_docpath, output_dir): os.makedirs(output_dir, exist_ok=True) queries = [ ( - r'否\s*决|无\s*效\s*投\s*标|无\s*效\s*文\s*件|文\s*件\s*无\s*效|无\s*效\s*响\s*应|无\s*效\s*报\s*价|无\s*效\s*标|视\s*为\s*无\s*效|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效', + r'否\s*决|' + r'无\s*效\s*投\s*标|' + r'无\s*效\s*文\s*件|' + r'(?:文\s*件|投\s*标)\s*[\u4e00-\u9fa5]?\s*(?:无|失)\s*效|' + r'无\s*效\s*响\s*应|' + r'无\s*效\s*报\s*价|' + r'无\s*效\s*标|' + r'视\s*为\s*无\s*效|' + r'被\s*拒\s*绝|' + r'予\s*以\s*拒\s*绝', """以下是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。 文本内容:{full_text} """, os.path.join(output_dir, "temp1.txt"), "否决和无效投标情形" ), - # ( - # r'废\s*标', - # """以下是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。 - # 文本内容:{full_text} - # 
""", - # os.path.join(output_dir, "temp2.txt"), - # "废标项" - # ), - # ( - # r'不\s*得|禁\s*止\s*投\s*标', - # """以下是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,每条信息规定了各方不得存在的情形,请回答:在这些信息中,投标人或中标人或供应商或联合体投标各方或磋商小组不得存在的情形或禁止投标的情形有哪些?不要返回主语是招标人或采购人或评标委员会的信息,请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,示例返回为[1,4,6],若情形不存在,返回[]。以下为需要考虑的注意事项:请返回包含实际内容的信息,若信息内容诸如'投标人不得存在的其他关联情形'这样笼统的表格,而未说明具体的情形,则无需添加这条信息。 - # 文本内容:{full_text} - # """, - # os.path.join(output_dir, "temp3.txt"), - # "不得存在的情形" - # ) + ( + r'废\s*标', + """以下是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。 + 文本内容:{full_text} + """, + os.path.join(output_dir, "temp2.txt"), + "废标项" + ), + ( + r'不\s*得|禁\s*止\s*投\s*标', + """以下是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,每条信息规定了各方不得存在的情形,请回答:在这些信息中,投标人或中标人或供应商或联合体投标各方或磋商小组不得存在的情形或禁止投标的情形有哪些?不要返回主语是招标人或采购人或评标委员会的信息,请你筛选所需的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,示例返回为[1,4,6],若情形不存在,返回[]。以下为需要考虑的注意事项:请返回包含实际内容的信息,若信息内容诸如'投标人不得存在的其他关联情形'这样笼统的表格,而未说明具体的情形,则无需添加这条信息。 + 文本内容:{full_text} + """, + os.path.join(output_dir, "temp3.txt"), + "不得存在的情形" + ) ] results = [] @@ -552,7 +606,7 @@ def combine_find_invalid(file_path, output_dir): with ThreadPoolExecutor() as executor: futures = [] for keywords, user_query, output_file, result_key in queries: - future = executor.submit(handle_query, file_path, user_query, output_file, result_key, keywords) + future = executor.submit(handle_query, invalid_docpath, user_query, output_file, result_key, keywords) futures.append((future, result_key)) # 保持顺序 time.sleep(0.5) # 暂停0.5秒后再提交下一个任务 @@ -584,9 +638,13 @@ if __name__ == '__main__': # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招标文件(实高电子显示屏)_tobidders_notice_part1.docx" # clause_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\77a48c63-f39f-419b-af2a-7b3dbf41b70b\\clause1.json" # doc_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\磋商文件(1).docx" - doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx' + # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx' + pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf' + output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp" - results = combine_find_invalid(doc_path, output_dir) + invalid_added=insert_mark(pdf_path) + invalid_added_docx=pdf2docx(invalid_added) + results = combine_find_invalid(invalid_added_docx, output_dir) end_time = time.time() print("Results:", json.dumps(results, ensure_ascii=False, indent=4)) print("Elapsed time:", str(end_time - start_time)) \ No newline at end of file diff --git a/flask_app/general/通用功能函数.py b/flask_app/general/通用功能函数.py index 4601962..9e12881 100644 --- a/flask_app/general/通用功能函数.py +++ b/flask_app/general/通用功能函数.py @@ -9,7 +9,7 @@ from flask_app.general.多线程提问 import multi_threading from flask_app.general.通义千问long import upload_file from flask_app.工程标.判断是否分包等 import read_questions_from_judge -def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1): +def process_judge_questions(judge_file_path, chosen_numbers, file_id, baseinfo_list1): judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) judge_consortium = judge_consortium_bidding(baseinfo_list1) if judge_consortium: @@ -18,8 +18,8 @@ def process_judge_questions(judge_file_path, chosen_numbers, merged_baseinfo_pat 
"外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\"" ) judge_questions.append(judge_consortium_question) - file_id3 = upload_file(merged_baseinfo_path) - res2 = multi_threading(judge_questions, "", file_id3, 2) + # file_id3 = upload_file(merged_baseinfo_path) + res2 = multi_threading(judge_questions, "", file_id, 2) if not res2: print("基础信息整合: multi_threading error!") diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py index a580dd7..4cb1d3e 100644 --- a/flask_app/routes/工程标解析main.py +++ b/flask_app/routes/工程标解析main.py @@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor from docx import Document +from flask_app.general.insert_del_pagemark import insert_mark,delete_mark from flask_app.工程标.截取pdf import truncate_pdf_multiple from flask_app.general.merge_pdfs import merge_pdfs from flask_app.工程标.提取json工程标版 import convert_clause_to_json @@ -28,19 +29,19 @@ def get_global_logger(unique_id): # 创建全局线程池 executor = ThreadPoolExecutor() -def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id,logger): +def preprocess_files(output_folder, file_path, file_type,unique_id,logger): logger.info("starting 文件预处理...") logger.info("output_folder..." + output_folder) start_time=time.time() # 根据文件类型处理文件路径 if file_type == 1: # docx - docx_path = downloaded_file_path - pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + # docx_path = file_path + pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理 elif file_type == 2: # pdf - pdf_path = downloaded_file_path + pdf_path = file_path # docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库 elif file_type == 3: #doc - pdf_path=docx2pdf(downloaded_file_path) + pdf_path=docx2pdf(file_path) # docx_path=doc2docx(downloaded_file_path) else: logger.error("Unsupported file type provided. 
Preprocessing halted.") @@ -51,8 +52,6 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id,lo # 处理各个部分 tobidders_notice_table=truncate_files[0] #投标人须知前附表 - # tobidders_notice_table_docx = pdf2docx(tobidders_notice_table) # 投标人须知前附表转docx - # truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_folder) # 投标人须知前附表docx->json tobidders_notice = truncate_files[1] #投标人须知正文 @@ -63,14 +62,17 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id,lo invalid_path=truncate_files[5] # invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 - invalid_docpath=pdf2docx(invalid_path) + # invalid_docpath=pdf2docx(invalid_path) + invalid_added_pdf = insert_mark(invalid_path) + invalid_added_docx = pdf2docx(invalid_added_pdf) #有标记的invalid_path try: # 尝试加载 .docx 文件 - doc = Document(invalid_docpath) + doc = Document(invalid_added_docx) print("yes") except Exception as e: # 捕获异常并打印错误信息 - invalid_docpath=pdf2docx(pdf_path) + invalid_added_docx=pdf2docx(pdf_path) + invalid_deleted_docx=delete_mark(invalid_added_docx) #无标记的invalid_path merged_baseinfo_path=truncate_files[-1] more_path=[merged_baseinfo_path,tobidders_notice] @@ -82,9 +84,8 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id,lo # 返回包含预处理后文件路径的字典 return { - 'file_path': downloaded_file_path, - 'output_folder': output_folder, - 'invalid_path':invalid_path, + 'invalid_deleted_docx':invalid_deleted_docx, + 'invalid_added_docx': invalid_added_docx, 'notice_path':notice_path, 'tobidders_notice_table': tobidders_notice_table, 'tobidders_notice': tobidders_notice, @@ -92,22 +93,21 @@ def preprocess_files(output_folder, downloaded_file_path, file_type,unique_id,lo 'qualification': qualification, 'merged_baseinfo_path':merged_baseinfo_path, 'merged_baseinfo_path_more':merged_baseinfo_path_more, - 'clause_path': clause_path, - 'invalid_docpath': invalid_docpath + 'clause_path': clause_path } # 基本信息 -def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo_path_more, tobidders_notice, clause_path, logger): +def fetch_project_basic_info(invalid_deleted_docx, merged_baseinfo_path, merged_baseinfo_path_more, tobidders_notice, clause_path, logger): logger.info("starting 基础信息...") start_time = time.time() try: if not merged_baseinfo_path: - merged_baseinfo_path = invalid_path + merged_baseinfo_path = invalid_deleted_docx if not merged_baseinfo_path_more: - merged_baseinfo_path_more = invalid_path + merged_baseinfo_path_more = invalid_deleted_docx if not tobidders_notice: - tobidders_notice = invalid_path + tobidders_notice = invalid_deleted_docx basic_res = combine_basic_info(merged_baseinfo_path, merged_baseinfo_path_more, tobidders_notice, clause_path) result = basic_res end_time = time.time() @@ -119,19 +119,18 @@ def fetch_project_basic_info(invalid_path, merged_baseinfo_path, merged_baseinfo return result -def fetch_qualification_review(evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_path, merged_baseinfo_path, notice_path, logger): +def fetch_qualification_review(evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_deleted_docx, merged_baseinfo_path, notice_path, logger): logger.info("starting 资格审查...") start_time = time.time() try: if not notice_path: - notice_path = invalid_path + notice_path = invalid_deleted_docx if not evaluation_method: - evaluation_method = invalid_path + evaluation_method = invalid_deleted_docx if not merged_baseinfo_path: - 
merged_baseinfo_path = invalid_path + merged_baseinfo_path = invalid_deleted_docx review_standards_res = combine_review_standards( - evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_path, merged_baseinfo_path, notice_path - ) + evaluation_method, qualification, output_folder, tobidders_notice_table, clause_path, invalid_deleted_docx, merged_baseinfo_path, notice_path) result = review_standards_res end_time = time.time() logger.info(f"资格审查 done,耗时:{end_time - start_time:.2f} 秒") @@ -143,12 +142,12 @@ def fetch_qualification_review(evaluation_method, qualification, output_folder, # 评分细则 流式 -def fetch_evaluation_standards(invalid_path, evaluation_method,logger): +def fetch_evaluation_standards(invalid_deleted_docx, evaluation_method,logger): logger.info("starting 商务标和技术标...") start_time = time.time() if not evaluation_method: - evaluation_method = invalid_path - evaluation_standards_res = combine_evaluation_standards(evaluation_method,invalid_path,1) + evaluation_method = invalid_deleted_docx + evaluation_standards_res = combine_evaluation_standards(evaluation_method,invalid_deleted_docx,1) technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} end_time = time.time() @@ -160,11 +159,11 @@ def fetch_evaluation_standards(invalid_path, evaluation_method,logger): # 无效、废标项解析 -def fetch_invalid_requirements(invalid_docpath, output_folder, logger): +def fetch_invalid_requirements(invalid_added_docx, output_folder, logger): logger.info("starting 无效标与废标...") start_time = time.time() try: - find_invalid_res = combine_find_invalid(invalid_docpath, output_folder) + find_invalid_res = combine_find_invalid(invalid_added_docx, output_folder) result = find_invalid_res end_time = time.time() logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒") @@ -176,12 +175,12 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, logger): # 投标文件要求 -def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path_more, clause_path, logger): +def fetch_bidding_documents_requirements(invalid_deleted_docx, merged_baseinfo_path_more, clause_path, logger): logger.info("starting 投标文件要求...") start_time = time.time() try: if not merged_baseinfo_path_more: - merged_baseinfo_path_more = invalid_path + merged_baseinfo_path_more = invalid_deleted_docx selection = 1 fetch_bidding_documents_requirements_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection) result = {"投标文件要求": fetch_bidding_documents_requirements_json} @@ -194,12 +193,12 @@ def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path_more return result # 开评定标流程 -def fetch_bid_opening(invalid_path, merged_baseinfo_path_more, clause_path, logger): +def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path_more, clause_path, logger): logger.info("starting 开评定标流程...") start_time = time.time() try: if not merged_baseinfo_path_more: - merged_baseinfo_path_more = invalid_path + merged_baseinfo_path_more = invalid_deleted_docx selection = 2 fetch_bid_opening_json = extract_from_notice(merged_baseinfo_path_more, clause_path, selection) result = {"开评定标流程": fetch_bid_opening_json} @@ -213,27 +212,27 @@ def fetch_bid_opening(invalid_path, merged_baseinfo_path_more, clause_path, logg #分段返回 -def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id): +def engineering_bid_main(output_folder, file_path, file_type, unique_id): logger = 
get_global_logger(unique_id) # 预处理文件,获取处理后的数据 - processed_data = preprocess_files(output_folder, downloaded_file_path, file_type,unique_id,logger) + processed_data = preprocess_files(output_folder, file_path, file_type,unique_id,logger) if not processed_data: yield json.dumps({}) # 如果处理数据失败,返回空的 JSON with concurrent.futures.ThreadPoolExecutor() as executor: # 立即启动不依赖 knowledge_name 和 index 的任务 futures = { - 'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_path'] ,processed_data['merged_baseinfo_path'],processed_data['merged_baseinfo_path_more'], + 'base_info': executor.submit(fetch_project_basic_info,processed_data['invalid_deleted_docx'] ,processed_data['merged_baseinfo_path'],processed_data['merged_baseinfo_path_more'], processed_data['tobidders_notice'], processed_data['clause_path'],logger), 'qualification_review': executor.submit(fetch_qualification_review, processed_data['evaluation_method'], processed_data['qualification'], output_folder, processed_data['tobidders_notice_table'], - processed_data['clause_path'], processed_data['invalid_path'], + processed_data['clause_path'], processed_data['invalid_deleted_docx'], processed_data['merged_baseinfo_path'],processed_data['notice_path'],logger), - 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['invalid_path'],processed_data['evaluation_method'],logger), - 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],output_folder,logger), - 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path'],logger), - 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_path'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'],logger) + 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['invalid_deleted_docx'],processed_data['evaluation_method'],logger), + 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_added_docx'],output_folder,logger), + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_deleted_docx'], processed_data['merged_baseinfo_path_more'],processed_data['clause_path'],logger), + 'opening_bid': executor.submit(fetch_bid_opening,processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path_more'], processed_data['clause_path'],logger) } # 提前处理这些不依赖的任务,按完成顺序返回 diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index a02d400..cb4b363 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -1,7 +1,11 @@ # 竞磋 竞谈 磋商 询价 邀请 单一来源 import json import time + +from docx import Document + from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx +from flask_app.general.insert_del_pagemark import insert_mark, delete_mark from flask_app.general.json_utils import transform_json_values from flask_app.货物标.基础信息解析main import combine_basic_info from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice @@ -30,14 +34,14 @@ def preprocess_files(output_folder, file_path, file_type,logger): logger.info("output_folder..." 
+ output_folder) # 根据文件类型处理文件路径 if file_type == 1: # docx - docx_path = file_path - pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理 + # docx_path = file_path + pdf_path = docx2pdf(file_path) # 将docx转换为pdf以供后续处理 elif file_type == 2: # pdf pdf_path = file_path - docx_path = pdf2docx(pdf_path) + # docx_path = pdf2docx(pdf_path) elif file_type == 3: # doc pdf_path = docx2pdf(file_path) - docx_path = doc2docx(file_path) + # docx_path = doc2docx(file_path) else: logger.error("Unsupported file type provided. Preprocessing halted.") return None @@ -47,7 +51,19 @@ def preprocess_files(output_folder, file_path, file_type,logger): # 处理各个部分 invalid_path=pdf_path - invalid_docpath = docx_path # docx截取无效标部分 + + invalid_added_pdf = insert_mark(invalid_path) + invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path + try: + # 尝试加载 .docx 文件 + doc = Document(invalid_added_docx) + print("yes") + except Exception as e: + # 捕获异常并打印错误信息 + invalid_added_docx=pdf2docx(pdf_path) + invalid_deleted_docx = delete_mark(invalid_added_docx) # 无标记的invalid_path + + # invalid_docpath = invalid_added_docx # docx截取无效标部分 procurement_path = truncate_files[5] # 采购需求 evaluation_method_path = truncate_files[1] # 评标办法 qualification_path = truncate_files[2] # 资格审查 @@ -61,27 +77,26 @@ def preprocess_files(output_folder, file_path, file_type,logger): # 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象 return { - 'invalid_path': invalid_path, + 'invalid_deleted_docx': invalid_deleted_docx, + 'invalid_added_docx': invalid_added_docx, 'output_folder': output_folder, 'procurement_path': procurement_path, 'evaluation_method_path': evaluation_method_path, 'qualification_path': qualification_path, 'notice_path': notice_path, - # 'knowledge_future': future_knowledge, # 返回 Future 对象 'clause_path': clause_path, - 'invalid_docpath': invalid_docpath, 'merged_baseinfo_path': merged_baseinfo_path } -def fetch_project_basic_info(invalid_path, merged_baseinfo_path, procurement_path, clause_path, logger): +def fetch_project_basic_info(invalid_deleted_docx, merged_baseinfo_path, procurement_path, clause_path, logger): logger.info("starting 基础信息...") start_time = time.time() try: if not merged_baseinfo_path: - merged_baseinfo_path = invalid_path + merged_baseinfo_path = invalid_deleted_docx if not procurement_path: - procurement_path = invalid_path - basic_res = combine_basic_info(merged_baseinfo_path, procurement_path, clause_path, invalid_path) + procurement_path = invalid_deleted_docx + basic_res = combine_basic_info(merged_baseinfo_path, procurement_path, clause_path, invalid_deleted_docx) base_info, good_list = post_process_baseinfo(basic_res, logger) result = base_info, good_list end_time = time.time() @@ -93,11 +108,13 @@ def fetch_project_basic_info(invalid_path, merged_baseinfo_path, procurement_pat return result -def fetch_qualification_review(invalid_path, qualification_path, notice_path, logger): +def fetch_qualification_review(invalid_deleted_docx, qualification_path, notice_path, logger): logger.info("starting 资格审查...") start_time = time.time() try: - review_standards_res = combine_qualification_review(invalid_path, qualification_path, notice_path) + if not notice_path: + notice_path=invalid_deleted_docx + review_standards_res = combine_qualification_review(invalid_deleted_docx, qualification_path, notice_path) result = review_standards_res end_time = time.time() logger.info(f"资格审查 done,耗时:{end_time - start_time:.2f} 秒") @@ -107,12 +124,12 @@ def fetch_qualification_review(invalid_path, qualification_path, notice_path, lo result 
= {"资格审查": {}} return result -def fetch_evaluation_standards(invalid_path, evaluation_method_path,logger): +def fetch_evaluation_standards(invalid_deleted_docx, evaluation_method_path,logger): logger.info("starting 商务评分和技术评分...") start_time = time.time() if not evaluation_method_path: - evaluation_method_path = invalid_path - evaluation_standards_res = combine_evaluation_standards(evaluation_method_path,invalid_path,2) + evaluation_method_path = invalid_deleted_docx + evaluation_standards_res = combine_evaluation_standards(evaluation_method_path,invalid_deleted_docx,2) technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})} commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})} end_time = time.time() @@ -122,11 +139,11 @@ def fetch_evaluation_standards(invalid_path, evaluation_method_path,logger): "commercial_standards": commercial_standards } -def fetch_invalid_requirements(invalid_docpath, output_folder, logger): +def fetch_invalid_requirements(invalid_added_docx, output_folder, logger): logger.info("starting 无效标与废标...") start_time = time.time() try: - find_invalid_res = combine_find_invalid(invalid_docpath, output_folder) + find_invalid_res = combine_find_invalid(invalid_added_docx, output_folder) result = find_invalid_res end_time = time.time() logger.info(f"无效标与废标 done,耗时:{end_time - start_time:.2f} 秒") @@ -136,10 +153,10 @@ def fetch_invalid_requirements(invalid_docpath, output_folder, logger): result = {"无效标与废标": {}} return result -def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path, clause_path, logger): +def fetch_bidding_documents_requirements(invalid_deleted_docx, merged_baseinfo_path, clause_path, logger): logger.info("starting 投标文件要求...") if not merged_baseinfo_path: - merged_baseinfo_path = invalid_path + merged_baseinfo_path = invalid_deleted_docx start_time = time.time() selection = 1 try: @@ -155,10 +172,10 @@ def fetch_bidding_documents_requirements(invalid_path, merged_baseinfo_path, cla # 开评定标流程 -def fetch_bid_opening(invalid_path, merged_baseinfo_path, clause_path, logger): +def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path, clause_path, logger): logger.info("starting 开评定标流程...") if not merged_baseinfo_path: - merged_baseinfo_path = invalid_path + merged_baseinfo_path = invalid_deleted_docx start_time = time.time() selection = 2 try: @@ -218,16 +235,16 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): with concurrent.futures.ThreadPoolExecutor() as executor: # 立即启动不依赖 knowledge_name 和 index 的任务 futures = { - 'evaluation_standards': executor.submit(fetch_evaluation_standards,processed_data['invalid_path'], + 'evaluation_standards': executor.submit(fetch_evaluation_standards,processed_data['invalid_deleted_docx'], processed_data['evaluation_method_path'],logger), - 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'], + 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_added_docx'], output_folder,logger), - 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_path'],processed_data['merged_baseinfo_path'], + 'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'], processed_data['clause_path'],logger), - 'opening_bid': executor.submit(fetch_bid_opening, 
processed_data['invalid_path'],processed_data['merged_baseinfo_path'],processed_data['clause_path'],logger), - 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_path'],processed_data['merged_baseinfo_path'], + 'opening_bid': executor.submit(fetch_bid_opening, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'],processed_data['clause_path'],logger), + 'base_info': executor.submit(fetch_project_basic_info, processed_data['invalid_deleted_docx'],processed_data['merged_baseinfo_path'], processed_data['procurement_path'],processed_data['clause_path'],logger), - 'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_path'], + 'qualification_review': executor.submit(fetch_qualification_review, processed_data['invalid_deleted_docx'], processed_data['qualification_path'], processed_data['notice_path'],logger), } diff --git a/flask_app/工程标/基础信息整合工程标.py b/flask_app/工程标/基础信息整合工程标.py index a2e6065..d58f377 100644 --- a/flask_app/工程标/基础信息整合工程标.py +++ b/flask_app/工程标/基础信息整合工程标.py @@ -156,11 +156,11 @@ def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders """ # baseinfo_prompt_file_path=r'D:\flask_project\flask_app\static\提示词\基本信息工程标qianwen-long.txt' baseinfo_prompt_file_path = 'flask_app/static/提示词/基本信息工程标qianwen-long.txt' - file_id1 = upload_file(merged_baseinfo_path) + file_id = upload_file(merged_baseinfo_path) questions = read_questions_from_file(baseinfo_prompt_file_path) more_query = "请你根据招标文件信息,回答以下问题:是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 是否需要递交投标保证金?是否需要提交履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件',是否允许分包','是否递交投标保证金','是否提交履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在矛盾信息,请回答'未知'。" questions.append(more_query) - baseinfo_results = multi_threading(questions, "", file_id1, 2) + baseinfo_results = multi_threading(questions, "", file_id, 2) baseinfo_list1 = [clean_json_string(res) for _, res in baseinfo_results] if baseinfo_results else [] chosen_numbers, merged = merge_json_to_list(baseinfo_list1.pop(),tobidders_notice) baseinfo_list1_copy = copy.deepcopy(baseinfo_list1) @@ -170,7 +170,7 @@ def combine_basic_info(merged_baseinfo_path,merged_baseinfo_path_more, tobidders with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: # 提交两个任务 - future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, merged_baseinfo_path, baseinfo_list1) + future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, file_id, baseinfo_list1) future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice) #只问tobidders_notice future3 = executor.submit(extract_from_notice, merged_baseinfo_path_more, clause_path, 3) # 新增的多线程任务 diff --git a/flask_app/工程标/资格评审.py b/flask_app/工程标/资格评审.py index daaf3ac..2888f51 100644 --- a/flask_app/工程标/资格评审.py +++ b/flask_app/工程标/资格评审.py @@ -202,15 +202,6 @@ def process_qualification(qualification_review, qualification_path, invalid_path # results2 = multi_threading(ques, "", file_id2, 2) # 资格评审表,调用qianwen-long # res_list = [clean_json_string(res) for _, res in results2] if results2 else [] return res - # if res_list: - # 生成外键是'资格评审'的字典 - # merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审') - # consortium_dict = get_consortium_dict(merged_baseinfo_path) - # updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) - # return updated_qualify_json - # else: - # print("资格评审: 无法获取大模型结果,返回空值") - # return {"资格评审": ""} else: 
print("资格评审: type2") return get_all_dict(invalid_path) or {"资格评审": ""} diff --git a/flask_app/货物标/基础信息解析main.py b/flask_app/货物标/基础信息解析main.py index da0afe0..7e9df11 100644 --- a/flask_app/货物标/基础信息解析main.py +++ b/flask_app/货物标/基础信息解析main.py @@ -147,7 +147,7 @@ def get_base_info(merged_baseinfo_path,clause_path): # judge_file_path ='D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题货物标.txt' with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor: # 提交两个任务 - future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, merged_baseinfo_path, + future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, file_id, baseinfo_list) future2 = executor.submit(extract_from_notice, merged_baseinfo_path, clause_path, 3) # 新增的多线程任务 diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index 30c80f1..f283f29 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -6,7 +6,7 @@ import time from collections import defaultdict from copy import deepcopy -from flask_app.general.file2markdown import convert_pdf_to_markdown +from flask_app.general.file2markdown import convert_file_to_markdown from flask_app.general.format_change import pdf2docx from flask_app.general.多线程提问 import multi_threading from flask_app.general.通义千问long import qianwen_long, upload_file diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index 1de4708..7ed9e66 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -3,7 +3,7 @@ import json import time from flask_app.general.doubao import pdf2txt -from flask_app.general.file2markdown import convert_pdf_to_markdown +from flask_app.general.file2markdown import convert_file_to_markdown from flask_app.general.format_change import pdf2docx from flask_app.货物标.技术参数要求提取 import get_technical_requirements from flask_app.general.通义千问long import upload_file @@ -26,7 +26,7 @@ def fetch_procurement_reqs(procurement_path, invalid_path): return DEFAULT_PROCUREMENT_REQS.copy() try: - processed_filepath = convert_pdf_to_markdown(procurement_path) # 转markdown格式 + processed_filepath = convert_file_to_markdown(procurement_path) # 转markdown格式 # processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: diff --git a/flask_app/货物标/资格审查main.py b/flask_app/货物标/资格审查main.py index 0302b2a..4ec0a85 100644 --- a/flask_app/货物标/资格审查main.py +++ b/flask_app/货物标/资格审查main.py @@ -634,9 +634,6 @@ def combine_qualification_review(invalid_path, qualification_path, notice_path): if notice_path: future = executor.submit(process_notice, notice_path) future_to_key[future] = "申请人资格要求" - else: - future = executor.submit(process_notice, invalid_path) - future_to_key[future] = "申请人资格要求" # 收集结果(按完成顺序) for future in as_completed(future_to_key):