This notebook explores the WikiText-103 dataset. WikiText-103 consists of thousands of articles extracted from the set of verified ‘Good’ and ‘Featured’ articles on Wikipedia. The full collection of articles yields over 100 million tokens. The dataset is used as a common benchmark for long-term dependency language modeling. WikiText-103 is available to download freely on the IBM Developer Data Asset Exchange: WikiText-103 Dataset. This notebook can be found on Watson Studio: WikiText-103-Notebook.
from IPython.display import clear_output
# Download & load required python packages
!pip install wordcloud
import hashlib
import re
import sys
import tarfile
from collections import Counter, defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import requests
from IPython.display import Image
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
nltk.download('all')
clear_output()
Let's download the dataset from the Data Asset Exchange content delivery network and extract the tarball.
# Download the dataset
fname = 'wikitext-103.tar.gz'
url = 'https://dax-cdn.cdn.appdomain.cloud/dax-wikitext-103/1.0.1/' + fname
r = requests.get(url)
# Fail fast on HTTP errors; otherwise an error page would be written to disk
# and only surface later as a confusing checksum mismatch.
r.raise_for_status()
Path(fname).write_bytes(r.content)

# Verify the file was downloaded properly by comparing sha512 checksums
sha512sum = 'c8186919aa1840af6b734ea41abc580574ea8efe2fafda220f5d01002464d17566d84be5199b875136c9593f0e0678fb5d7c84bb2231de8b4151cb9c83fa2109'
sha512sum_computed = hashlib.sha512(Path(fname).read_bytes()).hexdigest()
# Stop the notebook on a corrupt download instead of just displaying False.
if sha512sum != sha512sum_computed:
    raise ValueError('Checksum mismatch: the downloaded archive is corrupt or incomplete')
sha512sum == sha512sum_computed

# Extract the dataset
# NOTE(review): extractall() trusts the archive's member paths; the DAX
# tarball is a known source, but consider tarfile's extraction filters
# (Python 3.12+) for untrusted archives.
with tarfile.open(fname) as tar:
    tar.extractall()
Let's read our data into Python lists.
# Read train, val, and test sets into string objects.
# Specify UTF-8 explicitly: Path.read_text() otherwise uses the platform's
# default encoding, which fails on these files on e.g. Windows (cp1252).
train_data = Path('wikitext-103/wiki.train.tokens').read_text(encoding='utf-8')
val_data = Path('wikitext-103/wiki.valid.tokens').read_text(encoding='utf-8')
test_data = Path('wikitext-103/wiki.test.tokens').read_text(encoding='utf-8')
# Store regular expression pattern to search for wikipedia article headings.
# A top-level heading in WikiText looks like ' \n \n = Title = \n \n ' (a
# single '=' pair); the capturing group makes re.split() keep the headings.
heading_pattern = '( \n \n = [^=]*[^=] = \n \n )'

def _headings_and_articles(split_parts):
    """Return (headings, articles) from re.split(heading_pattern, ...) output.

    Odd indices of the split hold the captured heading strings; even indices
    from 2 onward hold the article bodies. The slice [7:-7] strips the
    7-character ' \\n \\n = ' prefix and ' = \\n \\n ' suffix captured with
    each heading. Factored out so the train/val/test cells stay in sync.
    """
    headings = [part[7:-7] for part in split_parts[1::2]]
    articles = list(split_parts[2::2])
    return headings, articles

# Split out train headings and articles
train_split = re.split(heading_pattern, train_data)
train_headings, train_articles = _headings_and_articles(train_split)
# Split out validation headings and articles
val_split = re.split(heading_pattern, val_data)
val_headings, val_articles = _headings_and_articles(val_split)
# Split out test headings and articles
test_split = re.split(heading_pattern, test_data)
test_headings, test_articles = _headings_and_articles(test_split)
Let's visualize a subset of our data to learn a little more about its contents.
# How many Wikipedia articles does the training split contain?
len(train_headings)
# Peek at one article: its heading plus the first 118 characters of its body.
sample_idx = 110
print('Heading: ', train_headings[sample_idx])
print('Article sample: ', train_articles[sample_idx][:118])
# Remove casing, punctuation, special characters, and stop words and also
# lemmatize the words on a subset of the first 110 articles in the train data.
# Strip the literal '<unk>' placeholder tokens *before* removing punctuation:
# the previous combined pattern '[^ a-zA-Z0-9]|unk' also deleted the letters
# 'unk' inside ordinary words (e.g. 'drunk' -> 'dr', 'unknown' -> 'nown'),
# corrupting the word counts that feed the word cloud.
my_new_text = re.sub(r'[^ a-zA-Z0-9]', '', re.sub(r'<unk>', '', train_data[:2010011]))
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
word_tokens = word_tokenize(my_new_text.lower())
filtered_sentence = (w for w in word_tokens if w not in stop_words)
normalized = " ".join(lemma.lemmatize(word) for word in filtered_sentence)
# Now we can create a word cloud from the normalized training text built above.
# NOTE(review): this variable shadows the imported 'wordcloud' module; kept
# as-is in case later cells reference it by this name.
wordcloud = WordCloud(max_font_size=60).generate(normalized)
# Render the cloud on a large canvas with axes hidden (it's an image, not a plot).
plt.figure(figsize=(16,12))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
Let's test some sentences from our training data by feeding them to the Model Asset Exchange's Named Entity Tagger API. This API tags every token in a sentence with a named entity tag.
# Store MAX NER API endpoint
api = 'http://max-named-entity-tagger.codait-prod-41208c73af8fca213512856c7a09db52-0000.us-east.containers.appdomain.cloud/'
predict_endpoint = 'model/predict'
url = api + predict_endpoint

# Select three Wikipedia article headings to generate NER tags for
strings = [train_headings[0], train_headings[14], train_headings[86]]

# For each article heading, send the sentence to the API, print the status code, and print the API result
for s in strings:
    file_form = {"text": s}
    # A timeout keeps the notebook from hanging indefinitely if the service is down.
    r = requests.post(url=url, json=file_form, timeout=30)
    print('\nStatus code: ', r.status_code)
    # Only parse JSON on success; an error response may not be JSON and
    # would raise an opaque decode error here.
    if r.ok:
        result = r.json()
        print(result)
    else:
        print(r.text)