Data Source: https://developer.ibm.com/exchanges/data/all/mention-detection-benchmark/
The goal of Mention Detection is to map entities/concepts mentioned in text to the correct concept in a knowledge base. The dataset contains 3000 sentences annotated with mentions.
Dataset Overview:
The 3000 sentences are divided as follows.
– 1000 sentences taken from Wikipedia articles that discuss various topics, such as those in Debatabase (http://idebate.org/debatabase)
– 1000 sentences taken from professional speakers discussing some of those topics. Each of these sentences has two forms (thus resulting in 2000 sentences): the output of an Automatic Speech Recognition (ASR) engine, and a cleansed manual transcription of it.
In this notebook, we are going to explore the following datasets: Wiki-dev, Trans-dev, and ASR-dev.
import pandas as pd
import os
import numpy as np
import glob
# Get the data
!wget https://dax-cdn.cdn.appdomain.cloud/dax-mention-detection/1.0.2/mention-detection.tar.gz -O mention-detection.tar.gz && tar -zxf mention-detection.tar.gz; rm mention-detection.tar.gz
--2020-03-26 23:18:29--  https://dax-cdn.cdn.appdomain.cloud/dax-mention-detection/1.0.2/mention-detection.tar.gz
Resolving dax-cdn.cdn.appdomain.cloud (dax-cdn.cdn.appdomain.cloud)... 23.194.112.82, 23.194.112.88, 2600:1404:4400::17c7:31e0, ...
Connecting to dax-cdn.cdn.appdomain.cloud (dax-cdn.cdn.appdomain.cloud)|23.194.112.82|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 731975 (715K) [application/x-gzip]
Saving to: ‘mention-detection.tar.gz’

100%[======================================>] 731,975  --.-K/s  in 0.04s

2020-03-26 23:18:29 (19.0 MB/s) - ‘mention-detection.tar.gz’ saved [731975/731975]
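If `wget` is not available in your environment, the same download and extraction can be done from Python with the standard library alone; a minimal sketch mirroring the shell one-liner above:

import os
import tarfile
import urllib.request

url = 'https://dax-cdn.cdn.appdomain.cloud/dax-mention-detection/1.0.2/mention-detection.tar.gz'
archive = 'mention-detection.tar.gz'

# Download the archive, unpack it into the current directory, then remove it,
# mirroring the wget / tar / rm one-liner above
urllib.request.urlretrieve(url, archive)
with tarfile.open(archive, 'r:gz') as tar:
    tar.extractall()
os.remove(archive)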
ls
ADJECTIVES.xlsx LEXICON_BG.txt LICENSE.txt SEMANTIC_CLASSES.xlsx attribution/ LEXICON_UG.txt README.txt topics.csv data/ LICENSE ReleaseNotes.txt
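The archive also ships metadata alongside the data/ directory. As a quick look, topics.csv can be loaded with pandas; this assumes it is a standard comma-separated file, since its exact layout is not documented here.

# Peek at the topics metadata shipped with the archive (a quick look only; layout is an assumption)
topics = pd.read_csv('topics.csv')
topics.head()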
# Read each .ann annotation file and its matching .txt sentence, returning one row per annotated mention
def file_read(ann_directory, ann_data_files, txt_directory):
    # list to hold the data read from all files
    complete_data = []
    file_count = 0
    for file in ann_data_files:
        with open(os.path.join(ann_directory, file), 'r') as f:
            # running count of .ann files read; used as the sentence id
            file_count += 1
            # Get the corresponding sentence from the matching .txt file
            txt_file_name = file.split('.')[0] + '.txt'
            with open(os.path.join(txt_directory, txt_file_name), 'r') as t_f:
                sentence = t_f.readlines()[0]
            # Get details about each annotated entity for the sentence
            lines = f.readlines()
            for l in lines:
                # each row starts with the sentence id and the sentence text
                int_list = [file_count, sentence]
                # split the annotation line on '|||'
                split_data = l.split('|||')
                for i in range(len(split_data)):
                    if i != 1 and i != (len(split_data) - 1):
                        int_list.append(split_data[i])
                    elif i == 1:
                        # keep only the last segment of the entity URI
                        int_list.append(split_data[i].split("/")[-1])
                    elif i == (len(split_data) - 1):
                        # last field: strip the trailing newline
                        int_list.append(split_data[i].rstrip('\n'))
                complete_data.append(int_list)
    return complete_data
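To make the parsing concrete, here is a minimal sketch of what `file_read` does to a single annotation line. The field layout (token ||| entity URI ||| start offset ||| end offset) is inferred from the logic above, and the sample line and URI are made up for illustration.

sample_line = "marketing|||http://en.wikipedia.org/wiki/Marketing|||14|||23\n"  # hypothetical line

fields = sample_line.split('|||')
token = fields[0]                    # surface form as it appears in the sentence
entity = fields[1].split('/')[-1]    # keep only the last segment of the entity URI
token_start = fields[2]              # character offset where the mention starts
token_end = fields[-1].rstrip('\n')  # character offset where the mention ends
print(token, entity, token_start, token_end)  # -> marketing Marketing 14 23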
# Sentence file directory
wiki_txt_directory = 'data/Wiki-dev/paragraphs/500-rows'
# Get all ann file list
wiki_ann_directory = 'data/Wiki-dev/labeled/500-rows'
wiki_ann_data_files = [f for f in os.listdir(wiki_ann_directory) if os.path.isfile(os.path.join(wiki_ann_directory, f)) and f.split('.')[1] == 'ann']
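Since `glob` is already imported, the same list of annotation files could be collected by matching on the extension directly (the ordering may differ from `os.listdir`, which does not matter here); a sketch:

# Equivalent listing of the .ann files via glob; basename keeps only the file names
wiki_ann_data_files_glob = [os.path.basename(p)
                            for p in glob.glob(os.path.join(wiki_ann_directory, '*.ann'))]
len(wiki_ann_data_files_glob)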
# List of ann files
wiki_ann_data_files[0:5]
['987_7483320_2027_2133.ann', '683_860924_1515_1618.ann', '803_118767_18052_18154.ann', '984_24661_13068_13232.ann', '221_2213921_12189_12428.ann']
# number of valid ann files
len(wiki_ann_data_files)
500
Create a list containing the following fields: sentence_id, sentence, Token, Entity, Token_Start, and Token_End.
# Create list of all relevant data
wiki_data = file_read(wiki_ann_directory, wiki_ann_data_files, wiki_txt_directory)
# Convert list to dataframe where each row is identified by a token
wiki_df = pd.DataFrame(np.array(wiki_data).reshape(len(wiki_data),6), columns = list(['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']))
wiki_df.shape
(3218, 6)
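As an aside, the np.array(...).reshape(...) step is not strictly needed: every row returned by `file_read` already has six fields, so the list can be passed to `pd.DataFrame` directly (this also keeps sentence_id as an integer instead of converting everything to strings via the NumPy array). A sketch:

# Build the same table directly from the list of rows
cols = ['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']
wiki_df_alt = pd.DataFrame(wiki_data, columns=cols)
wiki_df_alt.shape  # expected to match wiki_df.shape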
# Unique entities in wiki
len(pd.unique(wiki_df['Entity']))
2024
# Sentence file directory
trans_txt_directory = 'data/Trans-dev/paragraphs/500-rows'
# Get all ann file list
trans_ann_directory = 'data/Trans-dev/labeled/500-rows'
trans_ann_data_files = [f for f in os.listdir(trans_ann_directory) if os.path.isfile(os.path.join(trans_ann_directory, f)) and f.split('.')[1] == 'ann']
# List of ann files
trans_ann_data_files[0:5]
['2183_1807323334_109_337.ann', '1065_3692018576_4654_4777.ann', '2221_154264590_137_274.ann', '2183_1571213081_1924_2021.ann', '1963_1330765595_648_786.ann']
# Number of valid ann files
len(trans_ann_data_files)
500
# Create list of all relevant data
trans_data = file_read(trans_ann_directory, trans_ann_data_files, trans_txt_directory)
# Convert list to dataframe where each row is identified by a token
trans_df = pd.DataFrame(np.array(trans_data).reshape(len(trans_data),6), columns = list(['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']))
trans_df.shape
(3530, 6)
# Unique entities in trans
len(pd.unique(trans_df['Entity']))
1295
# Sentence file directory
asr_txt_directory = 'data/ASR-dev/paragraphs/500-rows'
# Get all ann file list
asr_ann_directory = 'data/ASR-dev/labeled/500-rows'
asr_ann_data_files = [f for f in os.listdir(asr_ann_directory) if os.path.isfile(os.path.join(asr_ann_directory, f)) and f.split('.')[1] == 'ann']
# List of ann files
asr_ann_data_files[0:5]
['2183_1807323334_109_337.ann', '1065_3692018576_4654_4777.ann', '2221_154264590_137_274.ann', '2183_1571213081_1924_2021.ann', '1963_1330765595_648_786.ann']
# Number of valid ann files
len(asr_ann_data_files)
500
# Create list of all relevant data
asr_data = file_read(asr_ann_directory, asr_ann_data_files, asr_txt_directory)
# Convert list to dataframe where each row is identified by a token
asr_df = pd.DataFrame(np.array(asr_data).reshape(len(asr_data),6), columns = list(['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']))
asr_df.shape
(3530, 6)
# Unique entities in ASR
len(pd.unique(asr_df['Entity']))
1295
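Note that the ASR-dev file names listed above are identical to the Trans-dev ones, which is expected if the two splits are parallel (each manual transcription has an ASR counterpart). A quick check that they cover exactly the same files:

# Should print True if Trans-dev and ASR-dev annotate the same set of files
print(set(trans_ann_data_files) == set(asr_ann_data_files))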
dfs = [wiki_df, trans_df, asr_df]
combined_df = pd.concat(dfs)
combined_df.head()
| | sentence_id | sentence | Token | Entity | Token_Start | Token_End |
|---|---|---|---|---|---|---|
| 0 | 1 | The victim of marketing in this case is the in... | marketing | Marketing | 14 | 23 |
| 1 | 1 | The victim of marketing in this case is the in... | buyer | Buyer | 53 | 58 |
| 2 | 1 | The victim of marketing in this case is the in... | right to self-determination | Self-determination | 65 | 92 |
| 3 | 1 | The victim of marketing in this case is the in... | infringed | Patent_infringement | 96 | 105 |
| 4 | 2 | As a general rule, economic wind generators re... | general rule | Rule_of_thumb | 5 | 17 |
# shape of combined df
combined_df.shape
(10278, 6)
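One caveat with `pd.concat` here is that the original row indices are kept, so the combined index is not unique, and the origin of each row is lost. A sketch that tags every row with its split and rebuilds the index (the split labels are just names chosen for illustration):

# Tag each row with its source split and reset the index of the combined frame
combined_labeled = pd.concat(
    [df.assign(source=name) for df, name in zip(dfs, ['wiki', 'trans', 'asr'])],
    ignore_index=True)
combined_labeled['source'].value_counts()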
# Number of unique entities in the combined dataframe
len(pd.unique(combined_df['Entity']))
2854
# Get the top 10 most frequently occurring entities
grouped = combined_df.groupby(['Entity']).size().reset_index(name='counts')
grouped.sort_values('counts', ascending=False)[0:10]
| | Entity | counts |
|---|---|---|
| 1868 | People | 209 |
| 414 | Child | 109 |
| 2469 | State_(polity) | 73 |
| 1100 | Government | 67 |
| 598 | Country | 65 |
| 2602 | Thought | 63 |
| 244 | Belief | 62 |
| 1152 | Harm | 60 |
| 1836 | Parent | 58 |
| 681 | Democracy | 58 |
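The same top-10 ranking can be obtained more compactly with `value_counts`, which already sorts in descending order:

# Frequency of each entity across all three splits, top 10 only
combined_df['Entity'].value_counts().head(10)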