From f8be23deaacc9b2dd0ce8b24c34bf22d9bde179a Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 26 Sep 2024 10:22:03 +0800 Subject: [PATCH] =?UTF-8?q?9.25=20=E8=B5=84=E6=A0=BC=E5=AE=A1=E6=9F=A5?= =?UTF-8?q?=E8=BF=98=E5=B7=AE=E8=B7=B3=E8=BD=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/main/start_up.py | 2 +- .../main/投标人须知正文条款提取成json文件.py | 4 +- .../投标人须知正文条款提取成json文件.py | 141 ++++++++++++++++++ flask_app/货物标/提示词/prompt1.txt | 2 +- flask_app/货物标/货物标截取pdf.py | 2 +- flask_app/货物标/资格审查main.py | 2 +- 6 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 flask_app/货物标/投标人须知正文条款提取成json文件.py diff --git a/flask_app/main/start_up.py b/flask_app/main/start_up.py index d70df61..9b11239 100644 --- a/flask_app/main/start_up.py +++ b/flask_app/main/start_up.py @@ -147,7 +147,7 @@ def process_and_stream(file_url): for outer_key, inner_dict in parsed_data.items(): if isinstance(inner_dict, dict): combined_data.update(inner_dict) - + logger.info(json.dumps(combined_data, ensure_ascii=False,indent=4)) # 等待所有数据都处理完后,发送整合后的完整数据 complete_response = { 'message': 'Combined data', diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py index ac36918..c2bd3fa 100644 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -196,13 +196,13 @@ def post_process_json(json_file_path): #处理一级标题如'5.1'过长的内 if __name__ == "__main__": # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' - file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf' + file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件(广水市教育局封闭管理)_qualification1.pdf' # start_word = "投标人须知正文" # end_phrases = [ # r'^第[一二三四五六七八九十]+章\s+评标办法', r'^评标办法前附表', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', # r'^附表:', r'^附表一:', r'^附录:', r'^附录一:', r'^附件:', r'^附件一:', r'^附表:', r'^附表一:', # ] - output_folder = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp' + output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output3\\tmp' try: output_path = convert_clause_to_json(file_path,output_folder) print(f"Final JSON result saved to: {output_path}") diff --git a/flask_app/货物标/投标人须知正文条款提取成json文件.py b/flask_app/货物标/投标人须知正文条款提取成json文件.py new file mode 100644 index 0000000..3962254 --- /dev/null +++ b/flask_app/货物标/投标人须知正文条款提取成json文件.py @@ -0,0 +1,141 @@ +import json +import docx +import re +import os +from PyPDF2 import PdfReader +from flask_app.main.截取pdf import clean_page_content, extract_common_header + +def extract_text_from_docx(file_path): + doc = docx.Document(file_path) + return '\n'.join([para.text for para in doc.paragraphs]) + + +def extract_text_from_pdf(file_path): + # 从PDF文件中提取文本 + common_header = extract_common_header(file_path) + pdf_document = PdfReader(file_path) + text = "" + # 遍历每一页 + for page in pdf_document.pages: + # 提取当前页面的文本 + page_text = page.extract_text() if page.extract_text() else "" + # 清洗页面文本 + page_text = clean_page_content(page_text, common_header) + # 将清洗后的文本添加到总文本中 + text += page_text + "\n" + return text + +def extract_section(text, start_pattern, end_phrases): + # 查找开始模式 + start_match = re.search(start_pattern, text) + if not start_match: + return "" # 如果没有找到匹配的开始模式,返回空字符串 + start_index = start_match.end() # 从匹配的结束位置开始 + + # 初始化结束索引为文本总长度 + end_index = len(text) + + # 遍历所有结束短语,查找第一个出现的结束短语 + for phrase in end_phrases: + match = re.search(phrase, text[start_index:], flags=re.MULTILINE) + if match: + end_index = start_index + match.start() # 更新结束索引为匹配到的开始位置 + break # 找到第一个匹配后立即停止搜索 + + # 提取并返回从开始模式后到结束模式前的内容 + return text[start_index:end_index] + +def should_add_newline(content, keywords, max_length=20): + content_str = ''.join(content).strip() + return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length + +def handle_content_append(current_content, line_content, append_newline, keywords): + if append_newline: + if should_add_newline(current_content, keywords): + current_content.append('\n') # 添加换行符 + append_newline = False + current_content.append(line_content) + return append_newline + +#对二级标题如x.x进行额外处理:如果当前处理内容包含keywords中的内容,则必须保留换行符/如果当前内容字数大于20,不保留换行。 +def parse_text_by_heading(text): + keywords = ['包含', '以下'] + data = {} + current_key = None + current_content = [] + append_newline = False + + lines = text.split('\n') + for i, line in enumerate(lines): + line_stripped = line.strip() + # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 + match = re.match(r'^(?