use-safetensors
#1
by
dolfim-ibm
- opened
- .gitattributes +0 -2
- README.md +73 -73
- model_artifacts/{tableformer/accurate/tableformer_accurate.safetensors → layout/beehive_v0.0.5_pt/model.pt} +2 -2
- model_artifacts/tableformer/{fast/tableformer_fast.safetensors → fat/otslp_all_standard_094_clean.check} +2 -2
- model_artifacts/tableformer/{accurate → fat}/tm_config.json +2 -1
- model_artifacts/tableformer/otslp_all_fast.check +3 -0
- model_artifacts/tableformer/{fast/tm_config.json → tm_config.json} +2 -1
.gitattributes
CHANGED
@@ -35,5 +35,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
model_artifacts/tableformer/otslp_all_fast.check filter=lfs diff=lfs merge=lfs -text
|
37 |
model_artifacts/tableformer/fat/otslp_all_standard_094_clean.check filter=lfs diff=lfs merge=lfs -text
|
38 |
-
model_artifacts/tableformer/accurate/otslp_all_standard_094_clean.check filter=lfs diff=lfs merge=lfs -text
|
39 |
-
model_artifacts/tableformer/fast/otslp_all_fast.check filter=lfs diff=lfs merge=lfs -text
|
|
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
model_artifacts/tableformer/otslp_all_fast.check filter=lfs diff=lfs merge=lfs -text
|
37 |
model_artifacts/tableformer/fat/otslp_all_standard_094_clean.check filter=lfs diff=lfs merge=lfs -text
|
|
|
|
README.md
CHANGED
@@ -1,73 +1,73 @@
|
|
1 |
-
---
|
2 |
-
license: cdla-permissive-2.0
|
3 |
-
---
|
4 |
-
|
5 |
-
# Docling Models
|
6 |
-
|
7 |
-
This page contains models that power the PDF document converion package [docling](https://github.com/DS4SD/docling).
|
8 |
-
|
9 |
-
## Layout Model
|
10 |
-
|
11 |
-
The layout model will take an image from a
|
12 |
-
|
13 |
-
| | human | MRCNN | MRCNN | FRCNN | YOLO |
|
14 |
-
|----------------|---------|---------|---------|---------|--------|
|
15 |
-
| | human | R50 | R101 | R101 | v5x6 |
|
16 |
-
| Caption | 84-89 | 68.4 | 71.5 | 70.1 | 77.7 |
|
17 |
-
| Footnote | 83-91 | 70.9 | 71.8 | 73.7 | 77.2 |
|
18 |
-
| Formula | 83-85 | 60.1 | 63.4 | 63.5 | 66.2 |
|
19 |
-
| List-item | 87-88 | 81.2 | 80.8 | 81.0 | 86.2 |
|
20 |
-
| Page-footer | 93-94 | 61.6 | 59.3 | 58.9 | 61.1 |
|
21 |
-
| Page-header | 85-89 | 71.9 | 70.0 | 72.0 | 67.9 |
|
22 |
-
| Picture | 69-71 | 71.7 | 72.7 | 72.0 | 77.1 |
|
23 |
-
| Section-header | 83-84 | 67.6 | 69.3 | 68.4 | 74.6 |
|
24 |
-
| Table | 77-81 | 82.2 | 82.9 | 82.2 | 86.3 |
|
25 |
-
| Text | 84-86 | 84.6 | 85.8 | 85.4 | 88.1 |
|
26 |
-
| Title | 60-72 | 76.7 | 80.4 | 79.9 | 82.7 |
|
27 |
-
| All | 82-83 | 72.4 | 73.5 | 73.4 | 76.8 |
|
28 |
-
|
29 |
-
## TableFormer
|
30 |
-
|
31 |
-
The tableformer model will identify the structure of the table, starting from an image of a table. It uses the predicted table regions of the layout model to identify the tables. Tableformer has SOTA table structure identification,
|
32 |
-
|
33 |
-
| Model (TEDS) | Simple table | Complex table | All tables |
|
34 |
-
| ------------ | ------------ | ------------- | ---------- |
|
35 |
-
| Tabula | 78.0 | 57.8 | 67.9 |
|
36 |
-
| Traprange | 60.8 | 49.9 | 55.4 |
|
37 |
-
| Camelot | 80.0 | 66.0 | 73.0 |
|
38 |
-
| Acrobat Pro | 68.9 | 61.8 | 65.3 |
|
39 |
-
| EDD | 91.2 | 85.4 | 88.3 |
|
40 |
-
| TableFormer | 95.4 | 90.1 | 93.6 |
|
41 |
-
|
42 |
-
## References
|
43 |
-
|
44 |
-
```
|
45 |
-
@techreport{Docling,
|
46 |
-
author = {Deep Search Team},
|
47 |
-
month = {8},
|
48 |
-
title = {{Docling Technical Report}},
|
49 |
-
url={https://arxiv.org/abs/2408.09869},
|
50 |
-
eprint={2408.09869},
|
51 |
-
doi = "10.48550/arXiv.2408.09869",
|
52 |
-
version = {1.0.0},
|
53 |
-
year = {2024}
|
54 |
-
}
|
55 |
-
|
56 |
-
@article{doclaynet2022,
|
57 |
-
title = {DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis},
|
58 |
-
doi = {10.1145/3534678.353904},
|
59 |
-
url = {https://arxiv.org/abs/2206.01062},
|
60 |
-
author = {Pfitzmann, Birgit and Auer, Christoph and Dolfi, Michele and Nassar, Ahmed S and Staar, Peter W J},
|
61 |
-
year = {2022}
|
62 |
-
}
|
63 |
-
|
64 |
-
@InProceedings{TableFormer2022,
|
65 |
-
author = {Nassar, Ahmed and Livathinos, Nikolaos and Lysak, Maksym and Staar, Peter},
|
66 |
-
title = {TableFormer: Table Structure Understanding With Transformers},
|
67 |
-
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
68 |
-
month = {June},
|
69 |
-
year = {2022},
|
70 |
-
pages = {4614-4623},
|
71 |
-
doi = {https://doi.org/10.1109/CVPR52688.2022.00457}
|
72 |
-
}
|
73 |
-
```
|
|
|
1 |
+
---
|
2 |
+
license: cdla-permissive-2.0
|
3 |
+
---
|
4 |
+
|
5 |
+
# Docling Models
|
6 |
+
|
7 |
+
This page contains models that power the PDF document conversion package [docling](https://github.com/DS4SD/docling).
|
8 |
+
|
9 |
+
## Layout Model
|
10 |
+
|
11 |
+
The layout model takes an image of a page and applies an RT-DETR model to find the different layout components. It currently detects the labels: Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title. As a reference (from the DocLayNet paper), this is the performance of standard object detection methods on the DocLayNet dataset compared to human evaluation,
|
12 |
+
|
13 |
+
| | human | MRCNN | MRCNN | FRCNN | YOLO |
|
14 |
+
|----------------|---------|---------|---------|---------|--------|
|
15 |
+
| | human | R50 | R101 | R101 | v5x6 |
|
16 |
+
| Caption | 84-89 | 68.4 | 71.5 | 70.1 | 77.7 |
|
17 |
+
| Footnote | 83-91 | 70.9 | 71.8 | 73.7 | 77.2 |
|
18 |
+
| Formula | 83-85 | 60.1 | 63.4 | 63.5 | 66.2 |
|
19 |
+
| List-item | 87-88 | 81.2 | 80.8 | 81.0 | 86.2 |
|
20 |
+
| Page-footer | 93-94 | 61.6 | 59.3 | 58.9 | 61.1 |
|
21 |
+
| Page-header | 85-89 | 71.9 | 70.0 | 72.0 | 67.9 |
|
22 |
+
| Picture | 69-71 | 71.7 | 72.7 | 72.0 | 77.1 |
|
23 |
+
| Section-header | 83-84 | 67.6 | 69.3 | 68.4 | 74.6 |
|
24 |
+
| Table | 77-81 | 82.2 | 82.9 | 82.2 | 86.3 |
|
25 |
+
| Text | 84-86 | 84.6 | 85.8 | 85.4 | 88.1 |
|
26 |
+
| Title | 60-72 | 76.7 | 80.4 | 79.9 | 82.7 |
|
27 |
+
| All | 82-83 | 72.4 | 73.5 | 73.4 | 76.8 |
|
28 |
+
|
29 |
+
## TableFormer
|
30 |
+
|
31 |
+
The tableformer model will identify the structure of the table, starting from an image of a table. It uses the predicted table regions of the layout model to identify the tables. Tableformer has SOTA table structure identification,
|
32 |
+
|
33 |
+
| Model (TEDS) | Simple table | Complex table | All tables |
|
34 |
+
| ------------ | ------------ | ------------- | ---------- |
|
35 |
+
| Tabula | 78.0 | 57.8 | 67.9 |
|
36 |
+
| Traprange | 60.8 | 49.9 | 55.4 |
|
37 |
+
| Camelot | 80.0 | 66.0 | 73.0 |
|
38 |
+
| Acrobat Pro | 68.9 | 61.8 | 65.3 |
|
39 |
+
| EDD | 91.2 | 85.4 | 88.3 |
|
40 |
+
| TableFormer | 95.4 | 90.1 | 93.6 |
|
41 |
+
|
42 |
+
## References
|
43 |
+
|
44 |
+
```
|
45 |
+
@techreport{Docling,
|
46 |
+
author = {Deep Search Team},
|
47 |
+
month = {8},
|
48 |
+
title = {{Docling Technical Report}},
|
49 |
+
url={https://arxiv.org/abs/2408.09869},
|
50 |
+
eprint={2408.09869},
|
51 |
+
doi = "10.48550/arXiv.2408.09869",
|
52 |
+
version = {1.0.0},
|
53 |
+
year = {2024}
|
54 |
+
}
|
55 |
+
|
56 |
+
@article{doclaynet2022,
|
57 |
+
title = {DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis},
|
58 |
+
doi = {10.1145/3534678.3539043},
|
59 |
+
url = {https://arxiv.org/abs/2206.01062},
|
60 |
+
author = {Pfitzmann, Birgit and Auer, Christoph and Dolfi, Michele and Nassar, Ahmed S and Staar, Peter W J},
|
61 |
+
year = {2022}
|
62 |
+
}
|
63 |
+
|
64 |
+
@InProceedings{TableFormer2022,
|
65 |
+
author = {Nassar, Ahmed and Livathinos, Nikolaos and Lysak, Maksym and Staar, Peter},
|
66 |
+
title = {TableFormer: Table Structure Understanding With Transformers},
|
67 |
+
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
68 |
+
month = {June},
|
69 |
+
year = {2022},
|
70 |
+
pages = {4614-4623},
|
71 |
+
doi = {10.1109/CVPR52688.2022.00457}
|
72 |
+
}
|
73 |
+
```
|
model_artifacts/{tableformer/accurate/tableformer_accurate.safetensors → layout/beehive_v0.0.5_pt/model.pt}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b752ab5d4493457f4da5355952c68573559b5f6f091ff77f2f422b4386634743
|
3 |
+
size 201557903
|
model_artifacts/tableformer/{fast/tableformer_fast.safetensors → fat/otslp_all_standard_094_clean.check}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eb5ee88f5e411e2a7925837c05de5236bee1934569433b4ab58371ffba3f1da6
|
3 |
+
size 212845885
|
model_artifacts/tableformer/{accurate → fat}/tm_config.json
RENAMED
@@ -61,7 +61,8 @@
|
|
61 |
"padding": false,
|
62 |
"padding_size": 50,
|
63 |
"disable_post_process": false,
|
64 |
-
"profiling": false
|
|
|
65 |
},
|
66 |
"debug": {
|
67 |
"save_debug_images": false
|
|
|
61 |
"padding": false,
|
62 |
"padding_size": 50,
|
63 |
"disable_post_process": false,
|
64 |
+
"profiling": false,
|
65 |
+
"device_mode": "auto"
|
66 |
},
|
67 |
"debug": {
|
68 |
"save_debug_images": false
|
model_artifacts/tableformer/otslp_all_fast.check
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3299cbfd5071aa8414e1fdc5d2703f1061f557d28c741ae28c0a9098a5f67872
|
3 |
+
size 145516093
|
model_artifacts/tableformer/{fast/tm_config.json → tm_config.json}
RENAMED
@@ -61,7 +61,8 @@
|
|
61 |
"padding": false,
|
62 |
"padding_size": 50,
|
63 |
"disable_post_process": false,
|
64 |
-
"profiling": false
|
|
|
65 |
},
|
66 |
"debug": {
|
67 |
"save_debug_images": false
|
|
|
61 |
"padding": false,
|
62 |
"padding_size": 50,
|
63 |
"disable_post_process": false,
|
64 |
+
"profiling": false,
|
65 |
+
"device_mode": "auto"
|
66 |
},
|
67 |
"debug": {
|
68 |
"save_debug_images": false
|