import json
import os
import re

import docx
import fitz
from PyPDF2 import PdfReader

from flask_app.货物标.截取pdf货物标版 import clean_page_content, extract_common_header


def extract_text_from_docx(file_path):
    """Return the full text of a .docx file, paragraphs joined by newlines."""
    doc = docx.Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)


# PyPDF2-based extraction (a fitz-based variant existed as commented-out code
# and has been removed; see version control if it is needed again).
def extract_text_from_pdf(file_path, start_word, end_pattern):
    """Extract the span of a PDF's text between two regex anchors.

    Pages are cleaned of the repeating page header (via clean_page_content /
    extract_common_header) and concatenated with newlines.

    :param file_path: path to the PDF file.
    :param start_word: regex searched on the FIRST page only; text before the
        first match is discarded.
    :param end_pattern: regex searched on the LAST page only; text from the
        last match onward is discarded.
    :return: the cleaned, concatenated text of all pages.
    """
    common_header = extract_common_header(file_path)
    pdf_document = PdfReader(file_path)
    all_pages_text = []
    start_index = None
    last_page_index = len(pdf_document.pages) - 1
    for i, page in enumerate(pdf_document.pages):
        # Fix: the original called page.extract_text() twice per page
        # (`x if x else ""` with two separate calls); `or ""` also maps a
        # None/empty result to "".
        page_text = page.extract_text() or ""
        cleaned_text = clean_page_content(page_text, common_header)
        # Locate the start anchor on the first page only.
        if i == 0 and start_index is None:
            start_match = re.search(start_word, cleaned_text, re.MULTILINE)
            if start_match:
                start_index = start_match.start()
                cleaned_text = cleaned_text[start_index:]
        # Truncate at the LAST occurrence of the end anchor on the last page.
        if i == last_page_index:
            matches = list(re.finditer(end_pattern, cleaned_text, re.MULTILINE))
            if matches:
                cleaned_text = cleaned_text[:matches[-1].start()]
        all_pages_text.append(cleaned_text)
    return "\n".join(all_pages_text)
def compare_headings(current, new):
    """Return True when heading *new* follows *current* in outline order.

    Both arguments are dotted numeric headings such as "3.1.2"; non-numeric
    segments are ignored.  When the shared prefix is equal, a heading with
    more levels is treated as a new sub-section.
    """
    def numeric_parts(heading):
        # Keep only the purely numeric segments, e.g. "3.1.a" -> [3, 1].
        return [int(part) for part in heading.split('.') if part.isdigit()]

    old_seq = numeric_parts(current)
    new_seq = numeric_parts(new)
    for old_part, new_part in zip(old_seq, new_seq):
        if new_part != old_part:
            return new_part > old_part
    # Equal common prefix: the deeper heading counts as the newer one.
    return len(new_seq) > len(old_seq)


def should_add_newline(content, keywords, max_length=20):
    """Decide whether a newline belongs before the upcoming content.

    True when the buffered content is short (<= *max_length* characters once
    joined and stripped) or contains any of *keywords*.
    """
    joined = ''.join(content).strip()
    if len(joined) <= max_length:
        return True
    return any(keyword in joined for keyword in keywords)


def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    """Append *line_content* to *current_content* in place.

    When *append_newline* is set, a single '\\n' may be inserted first (see
    should_add_newline); the flag is then cleared.  Inside a special section
    every appended line keeps its own trailing line break.

    :return: the updated append_newline flag.
    """
    if append_newline:
        append_newline = False
        if should_add_newline(current_content, keywords):
            current_content.append('\n')
    current_content.append(line_content)
    if in_special_section:
        current_content.append('\n')
    return append_newline


# Newline-preservation logic, in short: for a second-level heading (e.g. 1.1)
# whose following content contains a trigger keyword or is short (<= 20
# characters), a '\n' is inserted ahead of the content and kept inside
# current_content; it is flushed into `data` together with that content once
# the next heading is reached.  This also resolves Chinese numerals ('一',
# '二') appearing mid-body: if '一' already exists as a key the line is
# treated as body text, otherwise it starts a new heading.


# Main entry point: split raw clause text into a {heading: content} dict.
def parse_text_by_heading(text):
    keywords = ['包含', '以下']
    data = {}
    current_key = None                # current numeric heading, e.g. "3.1"
    current_key_chinese = None        # current Chinese-numeral heading
    current_value_chinese = None      # content paired with the Chinese heading
    current_content = []              # buffered body lines for current_key
    append_newline = False
    skip_subheadings = False
    last_main_number = None
    temp_title = None                 # title seen before its number, held temporarily
    pattern_numbered = re.compile(r'^\s*([一二三四五六七八九十]{1,2})\s*、\s*')
    pattern_parentheses = re.compile(r'^\s*[((]\s*([一二三四五六七八九十]{1,2})\s*[))]\s*')
    initial_heading_pattern = None
    special_section_keywords = ['文件的组成', '文件的构成']  # special-section markers
    in_special_section = False        # True while inside such a section
    lines = text.split('\n')
# NOTE(review): the two lines below are the interior of parse_text_by_heading
# as received.  The text is collapsed and appears CORRUPTED: the numbered
# heading regex is truncated at "re.match(r'^(?" and the condition that
# follows ("... current_key_num and new_key_num - current_key_num <= 5))")
# has lost its left-hand side, so the original numbered-heading matching
# logic cannot be reconstructed from this view.  Left byte-identical —
# recover the intact function from version control before editing.
check_year_pattern(line): # 检查是否为年份模式,如 '2014年' line_no_spaces = line.replace(' ', '') return re.match(r'^\d{4}\s*年', line_no_spaces) is not None for i, line in enumerate(lines): line_stripped = line.strip().replace('.', '.') # 如果在一个章节内,跳过年份模式的行 if check_year_pattern(line_stripped) and current_key is not None: current_content.append(line_stripped) continue # **首先检查是否进入特殊章节** if any(keyword in line_stripped for keyword in special_section_keywords): in_special_section = True # 匹配带数字的标题,例如 '12.1 内容' match = re.match(r'^(? current_key_num and new_key_num - current_key_num <= 5)): # 处理临时标题与新主标题的关联 if temp_title: main_number = new_key_candidate.split('.')[0] data[f"{main_number}."] = temp_title temp_title = None # 重置临时标题 # 开始新的标题 if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') current_key = new_key_candidate current_content = [line_content] append_newline = True last_main_number = new_key_candidate.rstrip('.') else: # 将当前行视为当前标题的内容 append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section) else: if not skip_subheadings and not in_special_section: # 增加 in_special_section 判断 numbered_match = pattern_numbered.match(line_stripped) parenthesis_match = pattern_parentheses.match(line_stripped) letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped) arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped) # 判断当前行是否匹配了任何标题模式 if numbered_match or parenthesis_match or letter_match or arabic_match: # 如果初始标题模式尚未设置,则记录当前匹配的标题模式 if initial_heading_pattern is None: if numbered_match: initial_heading_pattern = 'numbered' elif parenthesis_match: initial_heading_pattern = 'parentheses' elif letter_match: initial_heading_pattern = 'letter' elif arabic_match: initial_heading_pattern = 'arabic' # 确定当前匹配的标题模式 if numbered_match: current_heading_pattern = 'numbered' elif parenthesis_match: current_heading_pattern = 
'parentheses' elif letter_match: current_heading_pattern = 'letter' elif arabic_match: current_heading_pattern = 'arabic' # 如果当前标题模式与初始标题模式一致,创建新的键值对 if current_heading_pattern == initial_heading_pattern: # 保存之前的 key 的内容 if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') if current_key_chinese is not None: data[current_key_chinese] = current_value_chinese current_key_chinese = None # 处理匹配到的标题 if current_heading_pattern == 'numbered': current_key_chinese = numbered_match.group(1) current_value_chinese = line_stripped[numbered_match.end():].lstrip('..、,').replace(' ', '') elif current_heading_pattern == 'parentheses': current_key_chinese = parenthesis_match.group(1) current_value_chinese = line_stripped[parenthesis_match.end():].lstrip('..、,').replace(' ', '') elif current_heading_pattern == 'letter': # 字母标题处理 letter_key, letter_value = letter_match.groups() letter_to_chinese = { 'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五', 'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十', 'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五' } current_key_chinese = letter_to_chinese.get(letter_key, letter_key) current_value_chinese = letter_value.lstrip('..、,').replace(' ', '') elif current_heading_pattern == 'arabic': arabic_key, arabic_value = arabic_match.groups() current_key = arabic_key.replace('、', '.') current_content = [arabic_value] append_newline = True last_main_number = current_key.rstrip('.') continue else: # 当前标题模式与初始模式不一致,将该行视为内容 if line_stripped: append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section) else: # 未匹配到任何标题模式,将该行视为内容 if line_stripped: append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,in_special_section) else: # 在特殊章节中,所有内容都作为当前标题的内容 if line_stripped: append_newline = handle_content_append(current_content, line_stripped, append_newline, 
keywords,in_special_section) # 最后保存最后一个 key 对应的内容 if current_key is not None: content_string = ''.join(current_content).strip() data[current_key] = data.get(current_key, '') + content_string.replace(' ', '') if current_key_chinese is not None: data[current_key_chinese] = current_value_chinese if temp_title: # 处理任何未关联的临时标题 data["未分类标题"] = temp_title # 您可以根据需要选择合适的键名 return data #type=2时提取货物标的第一章招标公告时采用该逻辑 def parse_text_to_dict(text): """ 解析文本,根据大标题划分内容,生成字典。 参数: text (str): 要解析的文本。 返回: dict: 大标题作为键,内容作为值的字典。 """ # 正则表达式模式:匹配以一至十的汉字数字开头,后跟顿号和任意字符,且位于行首 pattern = re.compile(r'^([一二三四五六七八九十]+\s*、\s*.*)$', re.MULTILINE) # 使用 re.finditer 找到所有大标题的位置 matches = list(pattern.finditer(text)) result = {} for i, match in enumerate(matches): title = match.group(1).strip() # 获取大标题文本 start = match.end() # 内容的起始位置 if i + 1 < len(matches): end = matches[i + 1].start() # 下一个大标题的起始位置 else: end = len(text) # 最后一个大标题,内容到文本末尾 content = text[start:end].strip() # 获取内容并去除前后空白 # 规范化换行符,并移除每行开头和结尾的空白 content = content.replace('\r\n', '\n') # 统一换行符 content = re.sub(r'[ \t]+\n', '\n', content) # 移除行尾空白 content = re.sub(r'^[ \t]+|[ \t]+$', '', content, flags=re.MULTILINE) # 移除行首和行尾空白 content = clean_content(content) # 处理内容中的换行符 result[title] = content return result def clean_content(content): """ 处理内容中的换行符: - 保留在子项编号前的换行符。 - 保留在冒号 ':' 或全角冒号 ':' 前的第一个换行符。 - 移除其他位置的换行符,不留下额外的空格。 参数: content (str): 要处理的内容字符串。 返回: str: 处理后的内容字符串。 """ # 定义子项编号的正则模式,包括: # - 数字+点号+数字(如 1.1 或 1.1) # - 数字+顿号(如 2、) # - 点号+数字(如 .3 或 .3) # - 数字+右括号(如 1) 或 1)) # - 圆括号包围的数字(如 (5)) # - 全角圆括号包围的数字(如 (5)) # - 数字+点号(如 1. 或 1.) numbering_pattern = r'(?:\d+[..]\d+(?:[..]\d+)*|\d+、|[..]\d+|\d+[))]|\(\d+\)|(\d+)|\d+[..])' # 定义需要保留换行符的情况: # 1. 换行符前面是子项编号 # 2. 
# Returns "" when file_path does not exist.
def convert_clause_to_json(file_path, output_folder, type=1, suffix_counter="1.json"):
    """Extract clause text from a PDF and dump it to clause1/clause2.json.

    :param file_path: the source PDF; if missing, "" is returned.
    :param output_folder: directory for the JSON file (created if absent).
    :param type: 1 -> bidder-notice body (parse_text_by_heading);
        anything else -> tender announcement chapter (parse_text_to_dict).
        NOTE: the name shadows the builtin `type` but is kept for
        caller compatibility.
    :param suffix_counter: kept for interface compatibility (currently unused).
    :return: path of the written JSON file, or "" on a missing input file.
    """
    if not os.path.exists(file_path):
        print(f"The specified file does not exist: {file_path}")
        return ""
    if type == 1:
        start_word = r'^\s*(?:[((]?\s*[一二12]?\s*[))]?\s*[、..]*\s*)?(说\s*明|总\s*则)'
        end_pattern = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]+\s*)$'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_by_heading(text)
    else:
        start_word = r'^(?:第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书).*|.*(?:招标公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*)$'
        end_pattern = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
        text = extract_text_from_pdf(file_path, start_word, end_pattern)
        result = parse_text_to_dict(text)
    if not os.path.exists(output_folder):
        # Fix: exist_ok=True removes the check-then-create race of the
        # original; the existence check is kept only to preserve the log line.
        os.makedirs(output_folder, exist_ok=True)
        print(f"Created output folder: {output_folder}")
    file_name = "clause1.json" if type == 1 else "clause2.json"
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path


def process_folder(input_folder, output_folder):
    """Run convert_clause_to_json over every '*part2.pdf' in *input_folder*."""
    # Only regular files whose name ends with 'part2.pdf' are processed.
    files = [f for f in os.listdir(input_folder)
             if os.path.isfile(os.path.join(input_folder, f)) and f.endswith('part2.pdf')]
    for file_name in files:
        file_path = os.path.join(input_folder, file_name)
        # Pass the extension-less name through as the suffix argument.
        file_name_without_extension = os.path.splitext(file_name)[0]
        try:
            output_path = convert_clause_to_json(file_path, output_folder, 1, file_name_without_extension)
            print(f"Processed file: {file_name}, JSON saved to: {output_path}")
        except ValueError as e:
            print(f"Error processing {file_name}: {e}")
# TODO: PyPDF2 misses some text when reading
#   'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf'
# TODO: bidder-notice body: clause numbers may be out of order; consider
#   dropping the number-ordering check and treating any leading number as a
#   new key (e.g. the '2-招标文件' case, currently special-cased).
# TODO: 11.6 — '文件的组成' currently matches any line; consider matching only
#   numbered lines like '11.文件的组成'.  Still failing:
#   a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf
if __name__ == "__main__":
    file_path = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile_tobidders_notice_part2.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\\tmp'
    try:
        output_path = convert_clause_to_json(file_path, output_folder, 1)
        print(f"Final JSON result saved to: {output_path}")
    except ValueError as e:
        print("Error:", e)