This notebook explores the COVID-19 Questions dataset. The dataset contains categorized questions that were frequently asked by the public during the COVID-19 pandemic.
The dataset is freely available for download from the IBM Developer Data Asset eXchange: COVID-19 Questions. This notebook can also be found on Watson Studio: COVID-19 Questions Notebook.
# Import Packages
import pandas as pd
import nltk
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import plotly.express as px
import requests
import tarfile
from os import path
# Download the dataset archive from the IBM Data Asset eXchange CDN
fname = 'covid-19-questions.tar.gz'
url = 'https://dax-cdn.cdn.appdomain.cloud/dax-covid-19-questions/1.0.0/' + fname
r = requests.get(url)
# Fail fast on a bad HTTP response instead of silently writing an error page to disk
r.raise_for_status()
# Use a context manager so the file handle is always closed (the original
# open(...).write(...) left the handle open until garbage collection)
with open(fname, 'wb') as archive:
    archive.write(r.content)
# Extracting the dataset. The context manager guarantees the archive is
# closed even if extractall() raises (the original tar.close() would be
# skipped on an exception).
with tarfile.open(fname) as tar:
    tar.extractall()
# Verifying the file was extracted properly — evaluates to True when the
# extracted directory exists (displayed as notebook cell output)
data_path = "covid-19-questions/"
path.exists(data_path)
# Load the tab-separated questions file into a DataFrame.
# Columns seen below: 'utterance' (the question text) and 'label' (its category).
data = pd.read_csv('covid-19-questions/covid_19_questions.tsv', sep='\t')
# shape of the data — (rows, columns), shown as cell output
data.shape
# Display first 5 rows
data.head()
# Get total number of unique labels
data['label'].nunique()
# Get unique labels with counts
data['label'].value_counts()
# Get unique labels as percentage strings, e.g. '12.3%'
df_label = data['label'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
df_label = df_label.reset_index(name='counts')
df_label.head()
df_label.info()
# Convert the percentage strings back to fractions: strip the trailing '%'
# and divide by 100, mapping a bare '-' to NaN.
# BUG FIX: the original used np.nan, but numpy is never imported anywhere in
# this notebook, so this line raised NameError; float('nan') is the stdlib
# equivalent and needs no extra import.
df_label['counts'] = df_label['counts'].apply(lambda x: float('nan') if x in ['-'] else x[:-1]).astype(float)/100
df_label.info()
# Render the counts column as a two-decimal percentage with the index hidden.
# NOTE(review): Styler.hide_index() was deprecated in pandas 1.4 and removed
# in 2.0 — on newer pandas use .hide(axis='index'); left as-is to keep the
# notebook working on the pandas version it was written for.
format_dict = {'counts': '{:.2%}'}
df_label.style.format(format_dict).hide_index()
# Prepare for the `Case_Count` vocabulary analysis: tokenize every question
# and reduce each token to its Porter stem.
stemmer = PorterStemmer()
# Split each question into word tokens (apply can take word_tokenize directly;
# no lambda wrapper is needed)
data['tokenized'] = data['utterance'].apply(word_tokenize)
# Stem every token of every tokenized question
data['stemmed'] = data['tokenized'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
data.head()
# Keep only the rows labelled Case_Count
case_count_df = data.loc[data['label'] == 'Case_Count']
case_count_df.shape
# Accumulate the distinct tokens across all Case_Count questions.
# An explicit loop replaces the original apply(results.update), which built
# and discarded a Series of None purely for its side effect.
results = set()
for tokens in case_count_df['tokenized']:
    results.update(tokens)
print(results)