diff --git a/flask_app/main/table_content_extraction.py b/flask_app/main/table_content_extraction.py index 75af5d0..104681b 100644 --- a/flask_app/main/table_content_extraction.py +++ b/flask_app/main/table_content_extraction.py @@ -107,6 +107,6 @@ def extract_tables_main(path, output_folder): if __name__ == "__main__": - path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\ztbfile_tobidders_notice_table.docx' - output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp" # 前附表json文件 + path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件(一中多媒体报告厅教学设备)_tobidders_notice_part1.docx' + output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp" # 前附表json文件 extract_tables_main(path, output_folder) diff --git a/flask_app/main/ttt.py b/flask_app/main/ttt.py deleted file mode 100644 index c2bd3fa..0000000 --- a/flask_app/main/ttt.py +++ /dev/null @@ -1,210 +0,0 @@ -import json -import docx -import re -import os -from PyPDF2 import PdfReader -from flask_app.main.截取pdf import clean_page_content,extract_common_header - -def extract_text_from_docx(file_path): - doc = docx.Document(file_path) - return '\n'.join([para.text for para in doc.paragraphs]) - - -def extract_text_from_pdf(file_path): - # 从PDF文件中提取文本 - common_header = extract_common_header(file_path) - pdf_document = PdfReader(file_path) - text = "" - # 遍历每一页 - for page in pdf_document.pages: - # 提取当前页面的文本 - page_text = page.extract_text() if page.extract_text() else "" - # 清洗页面文本 - page_text = clean_page_content(page_text, common_header) - # 将清洗后的文本添加到总文本中 - text += page_text+"\n" - return text - -def extract_section(text, start_pattern, end_phrases): - # 查找开始模式 - start_match = re.search(start_pattern, text) - if not start_match: - return "" # 如果没有找到匹配的开始模式,返回空字符串 - start_index = start_match.end() # 从匹配的结束位置开始 - - # 初始化结束索引为文本总长度 - end_index = len(text) - - # 遍历所有结束短语,查找第一个出现的结束短语 - for phrase in end_phrases: - match = re.search(phrase, text[start_index:], flags=re.MULTILINE) - if match: - end_index = start_index + match.start() # 更新结束索引为匹配到的开始位置 - break # 找到第一个匹配后立即停止搜索 - - # 提取并返回从开始模式后到结束模式前的内容 - return text[start_index:end_index] - -def compare_headings(current, new): - # 使用过滤来确保只处理非空且为数字的部分 - current_nums = [int(num) for num in current.split('.') if num.isdigit()] - new_nums = [int(num) for num in new.split('.') if num.isdigit()] - - # 比较数字序列以确定标题的层次关系 - for c, n in zip(current_nums, new_nums): - if n > c: - return True - elif n < c: - return False - - # 如果新标题有更多层次,认为是新的子章节 - return len(new_nums) > len(current_nums) - - -def should_add_newline(content, keywords, max_length=20): - content_str = ''.join(content).strip() - return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length - -def handle_content_append(current_content, line_content, append_newline, keywords): - if append_newline: - if should_add_newline(current_content, keywords): - current_content.append('\n') # 添加换行符 - append_newline = False - current_content.append(line_content) - return append_newline - -#对二级标题如x.x进行额外处理:如果当前处理内容包含keywords中的内容,则必须保留换行符/如果当前内容字数大于20,不保留换行。 -def parse_text_by_heading(text): - keywords = ['包含', '以下'] - data = {} - current_key = None - current_content = [] - append_newline = False - - lines = text.split('\n') - for i, line in enumerate(lines): - line_stripped = line.strip() - # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 - match = re.match(r'^(?1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据 -#对于同级的键,如果数量>1且键名都统一,那么将键名去掉,用列表保持它们的键值 -#对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存 +def test_append_newline(): + def check_append_newline(key): + append_newline = len(key.split('.')) == 2 + return append_newline -#zbtest20也有问题 -def contains_number_or_index(key, value): - # 判断值是否是数字或数字字符串 - is_number = isinstance(value, (int, float)) or (isinstance(value, str) and value.isdigit()) - # 判断键是否包含 "序号" - contains_index = '序号' in key - # 判断值中是否包含数字 - contains_digit = isinstance(value, str) and re.search(r'\d+', value) - # 判断值中是否包含中文字符 - contains_chinese = isinstance(value, str) and re.search(r'[\u4e00-\u9fff]', value) - # 如果值中包含数字但也有中文字符,则保留(返回 False) - if contains_digit and contains_chinese: - return False - # 如果值是数字或包含数字,且不包含中文字符,或者键包含 "序号",返回 True - return is_number or contains_index or contains_digit + # 测试用例 + test_cases = ["1.1", "1."] -#对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存 -#如果键名是"序号"或者键值中全是数字,删去序号 -def preprocess_dict(data): - if isinstance(data, dict): - if len(data) > 1: - # 检查是否所有值都是 "" 或 "/" - if all(v == "" or v == "/" for v in data.values()): - return list(data.keys()) - else: - processed = {} - for k, v in data.items(): - if not contains_number_or_index(k, v): - processed_v = preprocess_dict(v) - if processed_v != "": # 只添加非空值 - processed[k] = processed_v - return processed - else: - return {k: preprocess_dict(v) for k, v in data.items()} - elif isinstance(data, list): - return [preprocess_dict(item) for item in data] - else: - return data + for case in test_cases: + result = check_append_newline(case) + print(f"序号 '{case}': append_newline = {result}") - -# 测试代码 -#TODO:同一层全部都是数字才成功删除,没需求了 -input_data = { - "符合性审查": { - "说明": "1ha", - "www":"哈哈", - "审查标准": [ - { - "序号": 1, - "内容": "投标总报价超过项目(分包)预算金额或最高限价的;" - }, - { - "序号": 2, - "内容": "《投标书》、《法定代表人授权书》、《开标一览表(含明细)》未提供或不符合招标文件要求的;" - }, - { - "序号": 3, - "内容": "工期(服务期限)、质保期不符合招标文件要求的;" - }, - ] - } -} -pred=preprocess_dict(input_data) -print(json.dumps(pred, ensure_ascii=False, indent=4)) -# processed_data = process_dict(pred) -# print(json.dumps(processed_data, ensure_ascii=False, indent=4)) \ No newline at end of file +# 运行测试 +test_append_newline() \ No newline at end of file diff --git a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py new file mode 100644 index 0000000..e2ecb3a --- /dev/null +++ b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py @@ -0,0 +1,184 @@ +import json +import docx +import re +import os +from PyPDF2 import PdfReader +from flask_app.main.截取pdf import clean_page_content,extract_common_header + +def extract_text_from_docx(file_path): + doc = docx.Document(file_path) + return '\n'.join([para.text for para in doc.paragraphs]) + + +def extract_text_from_pdf(file_path, start_word, end_pattern): + # 从PDF文件中提取文本 + common_header = extract_common_header(file_path) + pdf_document = PdfReader(file_path) + all_pages_text = [] + start_index = None + # 处理所有页面 + for i, page in enumerate(pdf_document.pages): + page_text = page.extract_text() if page.extract_text() else "" + cleaned_text = clean_page_content(page_text, common_header) + + # 在第一页查找开始位置 + if i == 0 and start_index is None: + start_match = re.search(start_word, cleaned_text, re.MULTILINE) + if start_match: + start_index = start_match.start() + cleaned_text = cleaned_text[start_index:] + + # 在最后一页查找结束位置 + if i == len(pdf_document.pages) - 1: + for pattern in end_pattern: + matches = list(re.finditer(pattern, cleaned_text, re.MULTILINE)) + if matches: + end_index = matches[-1].start() + cleaned_text = cleaned_text[:end_index] + break + + all_pages_text.append(cleaned_text) + + # 合并所有页面的文本 + full_text = "\n".join(all_pages_text) + # print(full_text) + return full_text + +def compare_headings(current, new): + # 使用过滤来确保只处理非空且为数字的部分 + current_nums = [int(num) for num in current.split('.') if num.isdigit()] + new_nums = [int(num) for num in new.split('.') if num.isdigit()] + + # 比较数字序列以确定标题的层次关系 + for c, n in zip(current_nums, new_nums): + if n > c: + return True + elif n < c: + return False + + # 如果新标题有更多层次,认为是新的子章节 + return len(new_nums) > len(current_nums) + + +def should_add_newline(content, keywords, max_length=20): + content_str = ''.join(content).strip() + return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length + +def handle_content_append(current_content, line_content, append_newline, keywords): + if append_newline: + if should_add_newline(current_content, keywords): + current_content.append('\n') # 添加换行符 + append_newline = False + current_content.append(line_content) + return append_newline + +""" +保存换行符的具体逻辑: + +对于二级标题(如 1.1),如果其后的内容包含关键词或内容较短(<=20字符),会在内容前添加一个换行符。 +这个换行符会被保留在 current_content 列表中。 +当处理下一个标题时,之前的内容(包括可能存在的换行符)会被合并并保存到 data 字典中。 +""" +#提取json主函数 +def parse_text_by_heading(text): + keywords = ['包含', '以下'] + data = {} + current_key = None + current_content = [] + append_newline = False + + lines = text.split('\n') #['一、说明', '1.适用范围', '招标文件仅适用于第一章“投标邀请书”中所述项目的货物、工程及服务的采购。'] + for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n' + line_stripped = line.strip() + + # 匹配中文数字标题,如 "一、说明" + chinese_match = re.match(r'^([一二三四五六七八九十]+、)\s*(.+)$', line_stripped) + if chinese_match: + chinese_key, chinese_value = chinese_match.groups() + chinese_key = chinese_key.rstrip('、') # 移除顿号 + data[chinese_key] = chinese_value + current_key = None + current_content = [] + continue + + # 匹配形如 '1.1'、'2.2.3' 等至少包含一个点的标题,并确保其前后没有字母或括号 + match = re.match(r'^(? begin_page: start_page = i if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page: @@ -183,8 +187,9 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter pdf_document = PdfReader(pdf_path) exclusion_pattern = None if output_suffix == "tobidders_notice": + exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') start_page, mid_page, end_page = extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, - begin_page, common_header) + begin_page, common_header,exclusion_pattern) if start_page is None or mid_page is None or end_page is None: print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header) @@ -208,13 +213,15 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter print(f"Error processing {pdf_path}: {e}") return None -def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header): +def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern): start_page = None mid_page = None end_page = None for i, page in enumerate(pdf_document.pages): text = page.extract_text() or "" cleaned_text = clean_page_content(text, common_header) + if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None: + continue if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page: start_page = i if start_page is not None and mid_page is None and re.search( @@ -266,6 +273,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+' ) pdf_document = PdfReader(pdf_path) + exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') # 提取第一部分 start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, -1, common_header) if start_page1 is None or end_page1 is None: @@ -273,7 +281,7 @@ def extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, return None, None # 提取第二部分 start_page2 = end_page1 # 第二部分的开始页就是第一部分的结束页 - _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header) + _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,exclusion_pattern) if end_page2 is None: print(f"second: {output_suffix} 未找到第二部分的结束页在文件 {pdf_path} 中!") return None, None @@ -390,7 +398,7 @@ def truncate_pdf_multiple(input_path, output_folder): # TODO:交通智能系统和招标(1)(1)文件有问题 sele=4的时候excludsion有问题 if __name__ == "__main__": - input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目).pdf" + input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output4" # truncate_pdf_multiple(input_path,output_folder) selection = 4 # 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1和qualification2 4.投标人须知前附表