The dataset contains:
The released data file has 4 columns:
| Label (-/P/C/?) | Concept | Category/List Page Title | URL |
|---|---|---|---|
| ... | ... | ... | ... |
| ... | ... | ... | ... |
| ... | ... | ... | ... |
For each category, the label is one of the following:
import requests # External dependency: pip install requests
import tarfile
# Download the dataset archive and unpack it into the current directory.
url_base = 'https://dax.cdn.appdomain.cloud/dax-wikipedia-category-stance'
version = '1.0.2'
fname = 'wikipedia-category-stance.tar.gz'
url = "{}/{}/{}".format(url_base, version, fname)

# Without an explicit timeout, requests waits indefinitely on an
# unresponsive server.
r = requests.get(url, timeout=60)
if not r.ok:
    print("There are some errors when downloading {}".format(url))
    # Stop here instead of saving the error response: writing it to disk
    # would only make the extraction step below fail with a misleading
    # tarfile error. Raises requests.HTTPError for 4xx/5xx responses.
    r.raise_for_status()

# Save the archive bytes next to the notebook/script.
with open(fname, 'wb') as f:
    f.write(r.content)

# Extract the dataset; 'r:*' lets tarfile detect the compression itself.
# NOTE(review): extractall() trusts the member paths stored in the archive.
# On Python versions that support extraction filters, consider passing
# filter='data' if the download source is not fully trusted.
with tarfile.open(fname, 'r:*') as archive:
    archive.extractall()
import os
import pandas as pd # External dependency: pip install pandas
# Load the extracted CSV into a DataFrame and preview its first rows.
data_path = "WikipediaCategoriesResults.csv"

# Report an unreadable file up front; the load below is still attempted
# either way, so a missing file will also raise from pandas.
is_readable = os.access(data_path, os.R_OK)
if not is_readable:
    print("Failed to read the target file: {}".format(data_path))

data = pd.read_csv(filepath_or_buffer=data_path)

# As a bare expression this only shows output in an interactive/notebook
# session; it previews the first 20 rows.
data.head(n=20)