2.7 添加注释
This commit is contained in:
parent
50dd6dd3c8
commit
6f33d65c8f
26
README.md
26
README.md
@ -125,6 +125,32 @@ get_deviation.py、偏离表数据解析main.py用了process_functions_in_parall
|
||||
|
||||
三级是*截取pdf通用函数.py*
|
||||
|
||||
如何判断截取位置是否正确?查看output文件夹中的切分结果:逐个打开切分出的文件,检查是否切分准确。目前的逻辑主要是按大章切分,例如'招标公告'章节。
|
||||
|
||||
|
||||
|
||||
**如果切分不准确,如何定位正则表达式?**
|
||||
|
||||
首先判断当前是工程标解析还是货物标解析,即zb_type=1(工程标)还是zb_type=2(货物标)。
|
||||
|
||||
如果zb_type=2,即货物标解析,则由*截取pdf_main.py*调用*截取pdf货物标版.py*。如下图,selection=1代表截取'招标公告',如果招标公告没有切准,就在这块修改。这里可以发现get_notice是通用函数,即*截取pdf通用函数.py*中的get_notice函数,因此需要继续往该函数内部跳转。
|
||||
|
||||
若开头没截准,就改begin_pattern;若末尾没截准,就改end_pattern。
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
另外:在*截取pdf货物标版.py*中,还有extract_pages_twice函数。第一次没有切分到时,会运行该函数进行二次提取,这边又有一套begin_pattern和end_pattern。
|
||||
|
||||
|
||||
|
||||
**如何测试?**
|
||||
|
||||

|
||||
|
||||
输入pdf_path和你要切分的序号selection(selection=1代表切公告,以此类推),运行后可以查看切出来的效果如何。
|
||||
|
||||
|
||||
|
||||
**无效标和废标公共代码**
|
||||
|
@ -41,7 +41,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
return None
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
|
||||
print("切割出的文件:"+str(truncate_files))
|
||||
# print("切割出的文件:"+str(truncate_files))
|
||||
|
||||
# 处理各个部分
|
||||
notice_path=truncate_files[0] #招标公告
|
||||
|
@ -33,7 +33,8 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
return None
|
||||
|
||||
# 调用截取PDF多次
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')
|
||||
|
||||
# 处理各个部分
|
||||
invalid_path = truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标(投标文件格式\合同条款之前的内容)
|
||||
|
||||
|
@ -256,7 +256,7 @@ if __name__ == "__main__":
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
|
||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\新建文件夹"
|
||||
selection = 4 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
|
||||
selection = 4 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-invalid
|
||||
generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection, logger)
|
||||
print(generated_files)
|
||||
# print("生成的文件:", generated_files)
|
||||
|
@ -324,6 +324,6 @@ if __name__ == "__main__":
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"
|
||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||
selection = 6 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 6-invalid_path
|
||||
selection = 6 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
|
||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||
print(generated_files)
|
||||
|
BIN
md_files/18.png
Normal file
BIN
md_files/18.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 77 KiB |
BIN
md_files/19.png
Normal file
BIN
md_files/19.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 77 KiB |
BIN
md_files/20.png
Normal file
BIN
md_files/20.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 92 KiB |
Loading…
x
Reference in New Issue
Block a user