Data Source: https://developer.ibm.com/exchanges/data/all/mention-detection-benchmark/
The goal of Mention Detection is to map entities/concepts mentioned in text to the correct concept in a knowledge base. The dataset contains 3000 sentences annotated with mentions.
Dataset Overview:
The 3000 sentences are divided as follows.
– 1000 sentences taken from Wikipedia articles that discuss various topics, such as those in Debatabase (http://idebate.org/debatabase)
– 1000 sentences taken from professional speakers discussing some of those topics. Each of these sentences has two forms (thus resulting in 2000 sentences): the output of an Automatic Speech Recognition (ASR) engine, and a cleansed manual transcription of it.
In this notebook, we are going to explore the following datasets: Wiki-dev, Trans-dev, and ASR-dev.
import pandas as pd
import os
import numpy as np
import glob
# Get the data
!wget https://dax-cdn.cdn.appdomain.cloud/dax-mention-detection/1.0.2/mention-detection.tar.gz -O mention-detection.tar.gz && tar -zxf mention-detection.tar.gz; rm mention-detection.tar.gz
--2020-03-26 23:18:29--  https://dax-cdn.cdn.appdomain.cloud/dax-mention-detection/1.0.2/mention-detection.tar.gz
Resolving dax-cdn.cdn.appdomain.cloud (dax-cdn.cdn.appdomain.cloud)... 23.194.112.82, 23.194.112.88, 2600:1404:4400::17c7:31e0, ...
Connecting to dax-cdn.cdn.appdomain.cloud (dax-cdn.cdn.appdomain.cloud)|23.194.112.82|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 731975 (715K) [application/x-gzip]
Saving to: ‘mention-detection.tar.gz’

100%[======================================>] 731,975  --.-K/s  in 0.04s

2020-03-26 23:18:29 (19.0 MB/s) - ‘mention-detection.tar.gz’ saved [731975/731975]
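If `wget` is not available in your environment, the same download and extraction can be done from Python with the standard library alone; a minimal sketch mirroring the shell one-liner above:

import os
import tarfile
import urllib.request

url = 'https://dax-cdn.cdn.appdomain.cloud/dax-mention-detection/1.0.2/mention-detection.tar.gz'
archive = 'mention-detection.tar.gz'

# Download the archive, unpack it into the current directory, then remove it,
# mirroring the wget / tar / rm one-liner above
urllib.request.urlretrieve(url, archive)
with tarfile.open(archive, 'r:gz') as tar:
    tar.extractall()
os.remove(archive)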
ls
ADJECTIVES.xlsx LEXICON_BG.txt LICENSE.txt SEMANTIC_CLASSES.xlsx attribution/ LEXICON_UG.txt README.txt topics.csv data/ LICENSE ReleaseNotes.txt
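The archive also ships metadata alongside the data/ directory. As a quick look, topics.csv can be loaded with pandas; this assumes it is a standard comma-separated file, since its exact layout is not documented here.

# Peek at the topics metadata shipped with the archive (a quick look only; layout is an assumption)
topics = pd.read_csv('topics.csv')
topics.head()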
# Read each .ann annotation file and its matching .txt sentence, returning one row per annotated mention
def file_read(ann_directory, ann_data_files, txt_directory):
    # list to hold the data read from all files
    complete_data = []
    file_count = 0
    for file in ann_data_files:
        with open(os.path.join(ann_directory, file), 'r') as f:
            # running count of .ann files read; used as the sentence id
            file_count += 1
            # Get the corresponding sentence from the matching .txt file
            txt_file_name = file.split('.')[0] + '.txt'
            with open(os.path.join(txt_directory, txt_file_name), 'r') as t_f:
                sentence = t_f.readlines()[0]
            # Get details about each annotated entity for the sentence
            lines = f.readlines()
            for l in lines:
                # each row starts with the sentence id and the sentence text
                int_list = [file_count, sentence]
                # split the annotation line on '|||'
                split_data = l.split('|||')
                for i in range(len(split_data)):
                    if i != 1 and i != (len(split_data) - 1):
                        int_list.append(split_data[i])
                    elif i == 1:
                        # keep only the last segment of the entity URI
                        int_list.append(split_data[i].split("/")[-1])
                    elif i == (len(split_data) - 1):
                        # last field: strip the trailing newline
                        int_list.append(split_data[i].rstrip('\n'))
                complete_data.append(int_list)
    return complete_data
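To make the parsing concrete, here is a minimal sketch of what `file_read` does to a single annotation line. The field layout (token ||| entity URI ||| start offset ||| end offset) is inferred from the logic above, and the sample line and URI are made up for illustration.

sample_line = "marketing|||http://en.wikipedia.org/wiki/Marketing|||14|||23\n"  # hypothetical line

fields = sample_line.split('|||')
token = fields[0]                    # surface form as it appears in the sentence
entity = fields[1].split('/')[-1]    # keep only the last segment of the entity URI
token_start = fields[2]              # character offset where the mention starts
token_end = fields[-1].rstrip('\n')  # character offset where the mention ends
print(token, entity, token_start, token_end)  # -> marketing Marketing 14 23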
# Sentence file directory
wiki_txt_directory = 'data/Wiki-dev/paragraphs/500-rows'
# Get all ann file list
wiki_ann_directory = 'data/Wiki-dev/labeled/500-rows'
wiki_ann_data_files = [f for f in os.listdir(wiki_ann_directory) if os.path.isfile(os.path.join(wiki_ann_directory, f)) and f.split('.')[1] == 'ann']
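Since `glob` is already imported, the same list of annotation files could be collected by matching on the extension directly (the ordering may differ from `os.listdir`, which does not matter here); a sketch:

# Equivalent listing of the .ann files via glob; basename keeps only the file names
wiki_ann_data_files_glob = [os.path.basename(p)
                            for p in glob.glob(os.path.join(wiki_ann_directory, '*.ann'))]
len(wiki_ann_data_files_glob)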
# List of ann files
wiki_ann_data_files[0:5]
['987_7483320_2027_2133.ann', '683_860924_1515_1618.ann', '803_118767_18052_18154.ann', '984_24661_13068_13232.ann', '221_2213921_12189_12428.ann']
# number of valid ann files
len(wiki_ann_data_files)
500
Create a list containing the following fields: sentence_id, sentence, Token, Entity, Token_Start, and Token_End.
# Create list of all relevant data
wiki_data = file_read(wiki_ann_directory, wiki_ann_data_files, wiki_txt_directory)
# Convert list to dataframe where each row is identified by a token
wiki_df = pd.DataFrame(np.array(wiki_data).reshape(len(wiki_data),6), columns = list(['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']))
wiki_df.shape
(3218, 6)
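As an aside, the np.array(...).reshape(...) step is not strictly needed: every row returned by `file_read` already has six fields, so the list can be passed to `pd.DataFrame` directly (this also keeps sentence_id as an integer instead of converting everything to strings via the NumPy array). A sketch:

# Build the same table directly from the list of rows
cols = ['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']
wiki_df_alt = pd.DataFrame(wiki_data, columns=cols)
wiki_df_alt.shape  # expected to match wiki_df.shape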
# Unique entities in wiki
len(pd.unique(wiki_df['Entity']))
2024
# Sentence file directory
trans_txt_directory = 'data/Trans-dev/paragraphs/500-rows'
# Get all ann file list
trans_ann_directory = 'data/Trans-dev/labeled/500-rows'
trans_ann_data_files = [f for f in os.listdir(trans_ann_directory) if os.path.isfile(os.path.join(trans_ann_directory, f)) and f.split('.')[1] == 'ann']
# List of ann files
trans_ann_data_files[0:5]
['2183_1807323334_109_337.ann', '1065_3692018576_4654_4777.ann', '2221_154264590_137_274.ann', '2183_1571213081_1924_2021.ann', '1963_1330765595_648_786.ann']
# Number of valid ann files
len(trans_ann_data_files)
500
# Create list of all relevant data
trans_data = file_read(trans_ann_directory, trans_ann_data_files, trans_txt_directory)
# Convert list to dataframe where each row is identified by a token
trans_df = pd.DataFrame(np.array(trans_data).reshape(len(trans_data),6), columns = list(['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']))
trans_df.shape
(3530, 6)
# Unique entities in trans
len(pd.unique(trans_df['Entity']))
1295
# Sentence file directory
asr_txt_directory = 'data/ASR-dev/paragraphs/500-rows'
# Get all ann file list
asr_ann_directory = 'data/ASR-dev/labeled/500-rows'
asr_ann_data_files = [f for f in os.listdir(asr_ann_directory) if os.path.isfile(os.path.join(asr_ann_directory, f)) and f.split('.')[1] == 'ann']
# List of ann files
asr_ann_data_files[0:5]
['2183_1807323334_109_337.ann', '1065_3692018576_4654_4777.ann', '2221_154264590_137_274.ann', '2183_1571213081_1924_2021.ann', '1963_1330765595_648_786.ann']
# Number of valid ann files
len(asr_ann_data_files)
500
# Create list of all relevant data
asr_data = file_read(asr_ann_directory, asr_ann_data_files, asr_txt_directory)
# Convert list to dataframe where each row is identified by a token
asr_df = pd.DataFrame(np.array(asr_data).reshape(len(asr_data),6), columns = list(['sentence_id', 'sentence', 'Token', 'Entity', 'Token_Start', 'Token_End']))
asr_df.shape
(3530, 6)
# Unique entities in ASR
len(pd.unique(asr_df['Entity']))
1295
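Note that the ASR-dev file names listed above are identical to the Trans-dev ones, which is expected if the two splits are parallel (each manual transcription has an ASR counterpart). A quick check that they cover exactly the same files:

# Should print True if Trans-dev and ASR-dev annotate the same set of files
print(set(trans_ann_data_files) == set(asr_ann_data_files))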
dfs = [wiki_df, trans_df, asr_df]
combined_df = pd.concat(dfs)
combined_df.head()
| | sentence_id | sentence | Token | Entity | Token_Start | Token_End |
|---|---|---|---|---|---|---|
| 0 | 1 | The victim of marketing in this case is the in... | marketing | Marketing | 14 | 23 |
| 1 | 1 | The victim of marketing in this case is the in... | buyer | Buyer | 53 | 58 |
| 2 | 1 | The victim of marketing in this case is the in... | right to self-determination | Self-determination | 65 | 92 |
| 3 | 1 | The victim of marketing in this case is the in... | infringed | Patent_infringement | 96 | 105 |
| 4 | 2 | As a general rule, economic wind generators re... | general rule | Rule_of_thumb | 5 | 17 |
# shape of combined df
combined_df.shape
(10278, 6)
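One caveat with `pd.concat` here is that the original row indices are kept, so the combined index is not unique, and the origin of each row is lost. A sketch that tags every row with its split and rebuilds the index (the split labels are just names chosen for illustration):

# Tag each row with its source split and reset the index of the combined frame
combined_labeled = pd.concat(
    [df.assign(source=name) for df, name in zip(dfs, ['wiki', 'trans', 'asr'])],
    ignore_index=True)
combined_labeled['source'].value_counts()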
# Number of unique entities in the combined dataframe
len(pd.unique(combined_df['Entity']))
2854
# Get the top 10 most frequently occurring entities
grouped = combined_df.groupby(['Entity']).size().reset_index(name='counts')
grouped.sort_values('counts', ascending=False)[0:10]
| | Entity | counts |
|---|---|---|
| 1868 | People | 209 |
| 414 | Child | 109 |
| 2469 | State_(polity) | 73 |
| 1100 | Government | 67 |
| 598 | Country | 65 |
| 2602 | Thought | 63 |
| 244 | Belief | 62 |
| 1152 | Harm | 60 |
| 1836 | Parent | 58 |
| 681 | Democracy | 58 |
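The same top-10 ranking can be obtained more compactly with `value_counts`, which already sorts in descending order:

# Frequency of each entity across all three splits, top 10 only
combined_df['Entity'].value_counts().head(10)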