The dataset consists of 100 discussion threads crawled from the Ubuntu Forums. Each message in each thread is assigned a dialog label from the following eight classes: question, repeat question, clarification, further details, solution, positive feedback, negative feedback, and junk.
The dataset was open sourced by IBM Research India and is available to download freely on the IBM Developer Data Asset Exchange.
# importing prerequisites
!pip install xmltodict
import collections
import os
import tarfile
from xml.parsers.expat import ExpatError

import requests
import xmltodict
Next, we will download the dataset and extract it.
# Download the compressed dataset from the IBM DAX CDN.
fname = 'forum_datasets.tar.gz'
url = 'https://dax-cdn.cdn.appdomain.cloud/dax-forum/1.0.7/' + fname
r = requests.get(url)
# Fail fast on an HTTP error (404/500) instead of writing an error page to disk.
r.raise_for_status()
# Context manager guarantees the archive file handle is closed after writing.
with open(fname, 'wb') as archive:
    archive.write(r.content)
# Extracting the dataset; `with` closes the tarfile even if extraction fails.
with tarfile.open(fname) as tar:
    tar.extractall()
# Verifying the archive was extracted properly.
data_path = "forum_datasets/"
os.path.exists(data_path)
The extracted data will be in the folder forum_datasets/Ubuntu, which contains 100 XML files. Each file may include an original text description and its corresponding responses (but not all of them do).
# Walk the Ubuntu threads and collect, for each parsable file, the initial
# post's class label and its text content into parallel lists.
path = 'forum_datasets/Ubuntu/'
label = []
content_txt = []
for root, _dirs, files in os.walk(path):
    for file in files:
        # Match real .xml files only; `'.xml' in file` would also match
        # names like "foo.xml.bak".
        if not file.endswith('.xml'):
            continue
        with open(os.path.join(root, file), encoding="utf-8", errors='ignore') as fl:
            xml_content = fl.read()
        try:
            rlt = xmltodict.parse(xml_content)
            label.append(rlt['Thread']['InitPost']['Class'])
            content_txt.append(rlt['Thread']['InitPost']['icontent'])
        except (ExpatError, KeyError, TypeError):
            # Skip files that are malformed XML or lack the expected
            # Thread/InitPost structure; a bare `except:` would also
            # swallow KeyboardInterrupt/SystemExit.
            pass
# Re-parse the most recently read file and inspect its initial post.
rlt = xmltodict.parse(xml_content)
# Example tokenlist sentence Ordered Dictionary fields
print("\n" + str(rlt['Thread']['InitPost']))
# Example tokenlist sentence Ordered Dictionary fields
# (top-level keys sorted alphabetically for a stable view)
od = collections.OrderedDict(sorted(rlt.items(), key=lambda kv: kv[0]))
od
# Collect the (1-shifted) class of every post in every thread so we can
# plot the distribution over all categories.
path = 'forum_datasets/Ubuntu/'
label = []
content_txt = []
categories = []
for root, _dirs, files in os.walk(path):
    for file in files:
        if not file.endswith('.xml'):
            continue
        with open(os.path.join(root, file), encoding="utf-8", errors='ignore') as fl:
            xml_content = fl.read()
        try:
            rlt = xmltodict.parse(xml_content)
            od = collections.OrderedDict(sorted(rlt.items(), key=lambda kv: kv[0]))
            posts = od['Thread']['Post']
            # xmltodict yields a single dict (not a list) when a thread has
            # exactly one <Post>; the old `posts[i]` indexing raised on that
            # case and silently dropped the whole file. Normalize to a list.
            if isinstance(posts, dict):
                posts = [posts]
            for post in posts:
                # Shift classes from 0-7 to 1-8 for the histogram axis.
                categories.append(int(post['Class']) + 1)
        except (ExpatError, KeyError, TypeError, ValueError):
            # Malformed XML or missing/non-numeric Class field — skip file.
            pass
# Plot the histogram of all categories
import matplotlib.pyplot as plt

plt.hist(categories, bins=8, density=True)
plt.xlabel('Categories')
plt.ylabel('percentage of all posts %')
plt.show()