The Reporting Carrier On-Time Performance Dataset contains information on approximately 200 million domestic US flights reported to the United States Bureau of Transportation Statistics. The dataset contains basic information about each flight (such as date, time, departure airport, arrival airport) and, if applicable, the amount of time the flight was delayed and information about the reason for the delay. This dataset can be used to predict the likelihood of a flight arriving on time.
In this notebook we explore the Airline On-Time Performance dataset, working with the smaller `airline_2m` archive rather than the full dataset. The dataset can be obtained for free from the IBM Developer Data Asset Exchange.
import requests
import tarfile
from os import path
# Downloading the dataset
fname = 'airline_2m.tar.gz'
url = 'https://dax-cdn.cdn.appdomain.cloud/dax-airline/1.0.1/' + fname
# Stream the archive to disk in chunks instead of holding the whole
# response body in memory, and fail loudly on an HTTP error so that an
# error page is never saved as if it were the archive.
with requests.get(url, stream=True, timeout=60) as r:
    r.raise_for_status()
    with open(fname, 'wb') as archive:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            archive.write(chunk)
# Extracting the dataset
# The context manager closes the archive even if extraction fails.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on archives from a trusted source.
with tarfile.open(fname) as tar:
    tar.extractall()
# Verifying the file was extracted properly
data_path = "airline_2m.csv"
path.exists(data_path)
# Load the extracted CSV into a pandas DataFrame
import pandas as pd

# Columns that are read as plain strings rather than left to pandas'
# type inference (presumably because they are sparsely populated and
# would otherwise be inferred inconsistently -- confirm against the data).
string_columns = ('Div1Airport', 'Div1TailNum', 'Div2Airport', 'Div2TailNum')
column_types = {column: str for column in string_columns}

# The file is decoded as ISO-8859-1.
df = pd.read_csv(data_path, encoding="ISO-8859-1", dtype=column_types)

# Show the first 5 rows as a quick sanity check of the load
df.head()
from matplotlib import pyplot as plt
# Take a smaller sample of the dataset for easier plotting
# (used by the scatter plots and the arrival-delay histogram below)
df_sample = df.sample(n=500)

# Histogram of flight distance; note this one uses the full DataFrame,
# not the sample. Each plot gets its own figure so that successive plots
# do not draw on top of each other when the code runs as a script.
plt.figure()
plt.hist(x=df['Distance'], bins='auto')
plt.xlabel('Distance')
plt.ylabel('Frequency')

# Generate scatterplot of flight distance vs time
plt.figure()
plt.scatter('Distance', 'ActualElapsedTime', data=df_sample)
plt.xlabel('Distance')
plt.ylabel('Time')

# Scatterplot of arrival delay by year
plt.figure()
plt.scatter('Year', 'ArrDelay', data=df_sample)
plt.xlabel('Year')
plt.ylabel('Arrival Delay')

# Histogram of arrival delay in the sample
# Set missing values to 0
arvl_delay = df_sample['ArrDelay'].fillna(0)
plt.figure()
plt.hist(x=arvl_delay, bins='auto')
# The plotted quantity is arrival delay, so label the axis accordingly
plt.xlabel('Arrival Delay')
plt.ylabel('Frequency')