Data Source: https://developer.ibm.com/exchanges/data/all/claim-sentences-search/
Claims are short phrases that an argument aims to prove. The goal of the Claim Sentence Search task is to detect sentences containing claims in a large corpus, given a debatable topic or motion. The dataset contains the results of the q_mc query (sentences that contain a given topic, as described in the paper), comprising 1.49M sentences. In addition, it contains a claim sentence test set of 2.5k top-predicted sentences from our model, along with their labels. The sentences were retrieved from the 2017 Wikipedia dump.
from IPython.display import clear_output
!pip install wordcloud
clear_output()
!pip install gensim
clear_output()
!pip install --user -U nltk
clear_output()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
stemmer = SnowballStemmer('english')
nltk.download('wordnet')
!wget https://dax-cdn.cdn.appdomain.cloud/dax-claim-sentences-search/1.0.2/claim-sentences-search.tar.gz -O claim-sentences-search.tar.gz && tar -zxf claim-sentences-search.tar.gz; rm claim-sentences-search.tar.gz
!ls
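The exact file names inside the archive are not listed above, so before loading anything it can help to discover the extracted CSVs; a minimal sketch using the standard library, assuming the archive extracts CSV files:
import glob
# Recursively list extracted CSV files rather than assuming their names
for path in sorted(glob.glob('**/*.csv', recursive=True)):
    print(path)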
# Load the training data
train_data = pd.read_csv('q_mc_train.csv')
# Shape of the training data
train_data.shape
# Training data column names
train_data.columns
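Before aggregating, it is worth eyeballing a few raw rows; a quick pandas preview:
# Preview the column dtypes and the first few rows
print(train_data.dtypes)
train_data.head()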
Reference: Ran Levy, Ben Bogin, Shai Gretz, Ranit Aharonov and Noam Slonim. "Towards an Argumentative Content Search Engine Using Weak Supervision". COLING 2018.
# Get unique topic id
print(sorted(pd.unique(train_data['id'])))
# Number of unique ids
len(pd.unique(train_data['id']))
# Get unique topics
sorted(pd.unique(train_data['topic']))
# Number of unique topics
len(pd.unique(train_data['topic']))
# Number of data points under specific id and topic
train_data.groupby(['id','topic']).size().reset_index().rename(columns={0:'count'}).head()
# Total number of words across all sentences
train_data['sentence'].apply(lambda x: len(x.split(' '))).sum()
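The aggregate total hides the spread of sentence lengths; a quick per-sentence sketch:
# Summary statistics of words per sentence
word_counts = train_data['sentence'].str.split().str.len()
print(word_counts.describe())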
# Get unique main content
main_content = pd.unique(train_data['mc'])
main_content
# Bar chart of sentence counts per main concept (mc)
plt.figure(figsize=(20,10))
train_data.mc.value_counts().plot(kind='bar');
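With this many main concepts the bar chart gets crowded; the same counts can be read off numerically, for example the ten most frequent:
# Ten most frequent main concepts by sentence count
print(train_data['mc'].value_counts().head(10))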
HTTP Cookie topic
# Get all data points under the topic 'HTTP cookie'
http_cookies_data = train_data[train_data['mc'] == 'HTTP cookie']
# Total data points under the topic
http_cookies_data.shape
http_cookies_data.head()
# Total number of words
print(http_cookies_data['sentence'].apply(lambda x: len(x.split(' '))).sum())
def input_preprocess(inp_sentence):
    # Tokenize and lowercase, then drop stop words and tokens of length <= 3
    processed_result = []
    for token in gensim.utils.simple_preprocess(inp_sentence):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            processed_result.append(token)
    return processed_result
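A quick sanity check of the function on a made-up sentence (the example text is ours, not drawn from the corpus):
# Stop words and tokens of length <= 3 are dropped; everything is lowercased
input_preprocess("Cookies are often used to track users across many different websites.")
# expected, roughly: ['cookies', 'track', 'users', ...]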
# Removing stop words from the sentences
processed_http_cookies = http_cookies_data['sentence'].map(input_preprocess)
# Flatten the per-sentence token lists into a single list of tokens
token_http_cookies = []
for i in processed_http_cookies:
    for j in i:
        token_http_cookies.append(j)
# Number of unique words after preprocessing
len(set(token_http_cookies))
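Beyond the distinct-token count, the most frequent tokens under this topic are easy to sketch with the standard library:
from collections import Counter
# Ten most common tokens in the 'HTTP cookie' sentences
print(Counter(token_http_cookies).most_common(10))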
unique_tokens_http = set(token_http_cookies)
unique_string = " ".join(unique_tokens_http)
wordcloud = WordCloud(width = 1000, height = 500, background_color="white").generate(unique_string)
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
plt.close()
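If the figure should persist beyond the notebook session, the WordCloud object can also be written straight to disk (the file name here is arbitrary):
# Save the word cloud as a PNG next to the notebook
wordcloud.to_file('http_cookie_wordcloud.png')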
# Pre-process documents to remove stop words
processed_docs = train_data['sentence'].map(input_preprocess)
processed_docs[:5]
# Create a dictionary that maps between words and integer ids
dictionary = gensim.corpora.Dictionary(processed_docs)
# Print the first few (id, word) pairs
count = 0
for k, v in dictionary.items():
    print(k, v)
    count += 1
    if count > 10:
        break
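The mapping also works in the other direction via token2id; the token queried below is a guess and may not have survived preprocessing:
# Look up the integer id assigned to a specific token, if present
print(dictionary.token2id.get('cookie', 'not in dictionary'))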
# Total number of unique words in the dictionary
print('Total number of words in the dictionary:', len(dictionary))
# Filter the dictionary by document frequency: drop tokens appearing in fewer
# than 15 documents or in more than 50% of all documents, then keep at most
# the 10,000 most frequent of the remaining tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=10000)
print('Total number of words in the dictionary after filtering:', len(dictionary))
# Convert each document into the bag-of-words format: a list of (token_id, token_count) tuples.
# Each tuple indicates that the token with the given id appears the given number of times in the document.
# For example, (7, 1) means the token with id 7 appears once.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[100]
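To make a bag-of-words vector human-readable, each id can be mapped back through the dictionary; a minimal sketch for the same document:
# Decode (token_id, count) pairs back into (token, count) pairs
[(dictionary[token_id], count) for token_id, count in bow_corpus[100]]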
# LDA model
# Parameters: the bag-of-words corpus, the number of requested latent topics,
# the dictionary mapping word ids to words, and the number of passes through the corpus.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=1)
# Print the top words of each learned topic
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
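The trained model can also score an unseen sentence; the sentence below is our own example, pushed through the same preprocessing and dictionary:
# Infer the topic mixture of a new sentence
new_sentence = "Websites store cookies to remember user preferences."
new_bow = dictionary.doc2bow(input_preprocess(new_sentence))
print(lda_model.get_document_topics(new_bow))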