PubTabNet is a large dataset for image-based table recognition, containing 568k+ images of tabular data annotated with the corresponding HTML representation of the tables.
The dataset is open sourced by IBM Research Australia and is available to download freely on the IBM Developer Data Asset Exchange.
This notebook can be found on GitHub and Watson Studio.
# importing prerequisites
import sys
import requests
import tarfile
import json
import numpy as np
from os import path
from PIL import Image
from PIL import ImageFont, ImageDraw
from glob import glob
from matplotlib import pyplot as plt
%matplotlib inline
Since the dataset is large (~12GB), here we download only a small subset of the data and extract it.
# Download the example archive into the current working directory.
fname = 'examples.tar.gz'
url = 'https://dax-cdn.cdn.appdomain.cloud/dax-pubtabnet/1.0.0/' + fname
r = requests.get(url)
# Fail loudly on an HTTP error instead of saving an error page as the archive.
r.raise_for_status()
# Use a context manager so the file is flushed and closed before extraction.
with open(fname, 'wb') as archive_file:
    archive_file.write(r.content)

# Extracting the dataset
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
with tarfile.open(fname) as tar:
    tar.extractall()

# Verifying the file was extracted properly
data_path = "examples/"
path.exists(data_path)
In this section, we visualize the raw image and extract its HTML annotation from the JSON file. We then render the table using Jupyter Notebook's built-in HTML capabilities.
# Helper function to read in tables from the annotations
import re
from bs4 import BeautifulSoup as bs
def format_html(img):
    """Build a prettified HTML document from the tokenized annotation of img.

    The structure tokens in ``img['html']['structure']['tokens']`` are joined
    and wrapped in a minimal HTML page. The joined tokens of each entry in
    ``img['html']['cells']`` are then inserted, in order, into the empty
    ``<td ...></td>`` pairs found in that page.

    Args:
        img: Annotation dict for one image, holding the token lists above.

    Returns:
        The HTML document as a string, formatted with ``prettify()``.

    Raises:
        AssertionError: If the number of empty ``<td>`` pairs differs from
            the number of cell entries.
    """
    html_string = '''<html>
<head>
<meta charset="UTF-8">
<style>
table, th, td {
border: 1px solid black;
font-size: 10px;
}
</style>
</head>
<body>
<table frame="hsides" rules="groups" width="100%%">
%s
</table>
</body>
</html>''' % ''.join(img['html']['structure']['tokens'])

    # Each match is an empty cell: group 1 is the opening tag, group 2 the
    # closing tag, and the cell content belongs between them.
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    cells = [''.join(c['tokens']) for c in img['html']['cells']]

    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; otherwise zip() below would silently truncate.
    if len(cell_nodes) != len(cells):
        raise AssertionError(
            'Number of cells defined in tags does not match the length of cells'
        )

    # Assemble the filled document in one pass instead of re-slicing the
    # whole string for every cell.
    pieces = []
    cursor = 0
    for node, cell in zip(cell_nodes, cells):
        pieces.append(html_string[cursor:node.end(1)])
        pieces.append(cell)
        cursor = node.start(2)
    pieces.append(html_string[cursor:])
    html_string = ''.join(pieces)

    # prettify the html; name the parser explicitly so the result does not
    # depend on which optional parsers happen to be installed.
    soup = bs(html_string, 'html.parser')
    return soup.prettify()
# Loading the json annotations
# JSON text is UTF-8; name the encoding so the platform default is not used.
with open('examples/PubTabNet_Examples.json', 'r', encoding='utf-8') as fp:
    annotations = json.load(fp)

# Inspecting the annotations
annotations.keys()
annotations['images'][0].keys()
annotations['images'][0]

# Showing the raw image
from IPython.display import Image as displayImage
filename = annotations['images'][0]['filename']
displayImage("examples/"+filename)

# Extracting the HTML for the table from the annotation
html_string = format_html(annotations['images'][0])
print(html_string)

# Rendering the above HTML in Jupyter Notebook for a more readable format
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML(html_string))