In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
# Survey responses for the CS 361 questionnaire described below.
survey_csv = "117612.csv"
df = pd.read_csv(survey_csv)

CS 361 students answered the survey below.

Q1: What's your name? (student names removed for privacy)

Q2: We will roll a 6-sided die. Guess how many rolls it will take to see the first 6.

Q3: If the prize were $100, what's the most you would bet (in dollars) to play this game?

Q4: Have you ever been to Las Vegas?

Mean, standard deviation and variance

In [2]:
# Side-by-side histograms of the raw answers, both on 10-wide bins covering 0..100.
ten_wide_bins = np.arange(0, 101, 10)
raw_histogram_panels = (
    # (column, x-axis label, title, bar colour)
    ("Q2:Rolls", "Rolls", "Histogram of rolls", "r"),
    ("Q3:Bet", "Bet", "Histogram of bet", "b"),
)
for slot, (column, axis_label, heading, shade) in enumerate(raw_histogram_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(df[column], bins=ten_wide_bins, color=shade, ec="k")
    plt.xlabel(axis_label)
    plt.ylabel("Count")
    plt.title(heading)
# left/right lie outside [0, 1] -- presumably to make the two panels wider in the inline output.
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)
In [3]:
# Report location and spread for each survey question.
# pandas' .std()/.var() use their defaults here (sample estimators, ddof=1).
for heading, column in (("ROLLS", "Q2:Rolls"), ("BETS", "Q3:Bet")):
    answers = df[column]
    print(heading)
    print("Mean:", answers.mean())
    print("Standard deviation:", answers.std())
    print("Variance:", answers.var())
ROLLS
Mean: 6.166666666666667
Standard deviation: 11.081066884106576
Variance: 122.79004329004341
BETS
Mean: 16.51641025641026
Standard deviation: 21.584371569200158
Variance: 465.885096037296

Standard coordinates

In [4]:
# Standard coordinates: subtract the mean and divide by the standard deviation.
rolls_raw = df["Q2:Rolls"]
bets_raw = df["Q3:Bet"]
rolls_standardized = (rolls_raw - rolls_raw.mean()) / rolls_raw.std()
bets_standardized = (bets_raw - bets_raw.mean()) / bets_raw.std()

standardized_panels = (
    # (values, x-axis label, title, bar colour)
    (rolls_standardized, "Rolls", "Histogram of standardized rolls", "r"),
    (bets_standardized, "Bet", "Histogram of standardized bet", "b"),
)
for slot, (scores, axis_label, heading, shade) in enumerate(standardized_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(scores, bins=10, color=shade, ec="k")
    plt.xlabel(axis_label)
    plt.ylabel("Count")
    plt.title(heading)
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Median and interquartile range

In [5]:
# Robust summaries (median and interquartile range) for each survey question.
for heading, column in (("ROLLS", "Q2:Rolls"), ("BETS", "Q3:Bet")):
    answers = df[column]
    print(heading)
    print("Median:", answers.median())
    print("Interquartile range:", stats.iqr(answers))
ROLLS
Median: 4.0
Interquartile range: 2.0
BETS
Median: 10.0
Interquartile range: 13.4175

Box plots and outliers

In [6]:
# One box plot per survey question, drawn side by side.
box_plot_panels = (
    # (column, tick label, title)
    ("Q2:Rolls", "rolls", "Box plot of rolls"),
    ("Q3:Bet", "bet", "Box plot of bet"),
)
for slot, (column, tick_label, heading) in enumerate(box_plot_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.boxplot(df[column], labels=[tick_label])
    plt.title(heading)
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Removing outliers

In [7]:
def iqr_fences(values, whisker=1.5):
    """Return the (lower, upper) fences of the 1.5*IQR outlier rule.

    Parameters
    ----------
    values : array-like of numbers
        Sample the quartiles are computed from.
    whisker : float, default 1.5
        Multiple of the interquartile range added beyond each quartile.

    Returns
    -------
    tuple of float
        ``(Q1 - whisker * IQR, Q3 + whisker * IQR)``.
    """
    first_quartile, third_quartile = np.percentile(values, [25, 75])
    spread = third_quartile - first_quartile
    return first_quartile - whisker * spread, third_quartile + whisker * spread


# Fences for both questions are computed on the full survey, so removing a
# response because of one answer does not shift the threshold for the other.
rolls_low, rolls_high = iqr_fences(df["Q2:Rolls"])
bets_low, bets_high = iqr_fences(df["Q3:Bet"])

# Keep a response only when both answers lie inside their fences. The lower
# fence is enforced as well as the upper one, so unusually small answers are
# treated the same way as unusually large ones.
rolls_in_range = (df["Q2:Rolls"] >= rolls_low) & (df["Q2:Rolls"] <= rolls_high)
bets_in_range = (df["Q3:Bet"] >= bets_low) & (df["Q3:Bet"] <= bets_high)
df_no_outliers = df[rolls_in_range & bets_in_range]
df_no_outliers.count()
Out[7]:
Q2:Rolls    59
Q3:Bet      59
Q4:Vegas    59
dtype: int64
In [ ]:
# Histograms of the answers that survived outlier removal.
trimmed_panels = (
    # (column, bins, x-axis label, title, bar colour)
    # Rolls: unit-wide bins with edges at 0.5, 1.5, ..., 8.5 so each bin is centred on an integer.
    ("Q2:Rolls", np.arange(0.5, 9.5, 1), "Rolls", "Histogram of rolls, outliers removed", "r"),
    ("Q3:Bet", 8, "Bet", "Histogram of bet, outliers removed", "b"),
)
for slot, (column, bin_spec, axis_label, heading, shade) in enumerate(trimmed_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(df_no_outliers[column], bins=bin_spec, color=shade, ec="k")
    plt.xlabel(axis_label)
    plt.ylabel("Count")
    plt.title(heading)
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Sensitivity of summary statistics to outliers

In [8]:
# Compare each summary statistic after outlier removal with its value on the full survey.
summary_statistics = (
    # (printed label, function mapping a column to the statistic)
    ("Mean:", lambda answers: answers.mean()),
    ("Standard deviation:", lambda answers: answers.std()),
    ("Median:", lambda answers: answers.median()),
    ("Interquartile range:", stats.iqr),
)
for heading, column in (("ROLLS", "Q2:Rolls"), ("BETS", "Q3:Bet")):
    print(heading)
    for label, summarize in summary_statistics:
        print(label, summarize(df_no_outliers[column]), "(used to be", summarize(df[column]), ")")
ROLLS
Mean: 3.847457627118644 (used to be 6.166666666666667 )
Standard deviation: 1.5064637665971705 (used to be 11.081066884106576 )
Median: 4.0 (used to be 4.0 )
Interquartile range: 1.5 (used to be 2.0 )
BETS
Mean: 9.665762711864408 (used to be 16.51641025641026 )
Standard deviation: 7.391423548996136 (used to be 21.584371569200158 )
Median: 10.0 (used to be 10.0 )
Interquartile range: 12.5 (used to be 13.4175 )

Visualizing relationships in data

Stock trading

Data from Yahoo Finance (https://finance.yahoo.com/quote/FDX/history and https://finance.yahoo.com/quote/UPS/history)

In [9]:
# Adjusted closing prices, one CSV per ticker; each "Adj Close" column is
# renamed after its ticker so the two can sit side by side in one frame.
fdx = pd.read_csv("FDX.csv").rename(columns={"Adj Close": "FDX"}).filter(["Date", "FDX"])
ups = pd.read_csv("UPS.csv").rename(columns={"Adj Close": "UPS"}).filter(["Date", "UPS"])

# Pair the two series by trading date instead of by row position, so a missing
# or extra row in either file cannot silently shift one ticker against the
# other. validate="one_to_one" raises if a date is duplicated in either file.
stock_data = fdx.merge(ups, on="Date", how="inner", validate="one_to_one")

# No infer_datetime_format flag: it is deprecated since pandas 2.0 and the
# dates parse without it.
stock_data["Date"] = pd.to_datetime(stock_data["Date"])
stock_data.tail()
Out[9]:
Date FDX UPS
161 2018-08-22 246.940002 121.870003
162 2018-08-23 244.470001 121.709999
163 2018-08-24 245.020004 122.739998
164 2018-08-27 248.899994 123.849998
165 2018-08-28 247.630005 122.750000

Plotting time series data

In [10]:
# Both tickers on one wide figure; x positions are row numbers, labelled only at the two ends.
plt.figure(figsize=(12, 5))
line_styles = (
    # (ticker column, line colour, line style)
    ("FDX", "mediumorchid", "--"),
    ("UPS", "chocolate", "-"),
)
for ticker, shade, stroke in line_styles:
    plt.plot(stock_data[ticker], color=shade, linestyle=stroke, label=ticker)
plt.legend()
plt.xticks([0, 165], ["Jan 2", "Aug 28"])
plt.ylabel("Price in USD")
plt.title("Daily stock prices in 2018: FDX and UPS")
Out[10]:
Text(0.5,1,'Daily stock prices in 2018: FDX and UPS')

Standardization

In [11]:
# Work on a copy so the raw prices in stock_data stay available for later cells.
stock_data_standardized = stock_data.copy()
for ticker in ("FDX", "UPS"):
    prices = stock_data[ticker]
    # Standard coordinates: centre on the mean, scale by the standard deviation.
    stock_data_standardized[ticker] = (prices - prices.mean()) / prices.std()
In [12]:
# Same figure as the raw price plot, but in standard coordinates so the two
# tickers share a common scale.
plt.figure(figsize=(12,5))
plt.plot(stock_data_standardized.FDX, color="mediumorchid", linestyle="--", label='FDX')
plt.plot(stock_data_standardized.UPS, color="chocolate", linestyle="-", label='UPS')
plt.legend()
plt.xticks([0, 165], ["Jan 2", "Aug 28"])
# Standardized values are unitless (standard deviations from the mean), not dollars.
plt.ylabel("Standardized price")
plt.title("Standardized daily stock prices in 2018: FDX and UPS")
Out[12]:
Text(0.5,1,'Standardized daily stock prices in 2018: FDX and UPS')

Scatter plots

In [13]:
# Left: raw prices in dollars. Right: the same points in standard coordinates.
plt.subplot(1, 2, 1)
plt.scatter(stock_data.FDX, stock_data.UPS, color="red")
plt.title("Scatter plot of UPS vs FDX stock prices in 2018")
plt.xlabel("FDX stock price (USD)")
plt.ylabel("UPS stock price (USD)")
plt.subplot(1, 2, 2)
plt.scatter(stock_data_standardized.FDX, stock_data_standardized.UPS, color="blue")
plt.title("Scatter plot of standardized UPS vs FDX stock prices in 2018")
# The standardized axes are unitless, so they must not be labelled in USD.
plt.xlabel("Standardized FDX stock price")
plt.ylabel("Standardized UPS stock price")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)

Correlation coefficient

In [14]:
# Correlate only the two price columns. stock_data also holds a datetime
# "Date" column, and DataFrame.corr() stopped dropping non-numeric columns
# silently in pandas 2.0 (numeric_only now defaults to False).
print(stock_data[["FDX", "UPS"]].corr())
          FDX       UPS
FDX  1.000000  0.747794
UPS  0.747794  1.000000

Back to Rolls and Bets

In [15]:
# Bet against rolls, for the full survey (left) and with outliers removed (right).
scatter_panels = (
    # (data frame, point colour, title)
    (df, "red", "Scatter plot of Bet vs Rolls"),
    (df_no_outliers, "blue", "Scatter plot of Bet vs Rolls, outliers removed"),
)
for slot, (responses, shade, heading) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.scatter(responses["Q2:Rolls"], responses["Q3:Bet"], color=shade)
    plt.title(heading)
    plt.xlabel("Rolls")
    plt.ylabel("Bet")
plt.subplots_adjust(bottom=0, top=1, left=-0.5, right=1.5)
In [16]:
# Correlate only the two numeric answers. The survey frame also carries the
# non-numeric "Q4:Vegas" column, which DataFrame.corr() no longer drops
# silently in pandas 2.0+ (it raises instead), so select the columns explicitly.
answer_columns = ["Q2:Rolls", "Q3:Bet"]
print("WITH OUTLIERS")
print(df[answer_columns].corr())
print("WITHOUT OUTLIERS")
print(df_no_outliers[answer_columns].corr())
WITH OUTLIERS
          Q2:Rolls    Q3:Bet
Q2:Rolls  1.000000 -0.101062
Q3:Bet   -0.101062  1.000000
WITHOUT OUTLIERS
          Q2:Rolls    Q3:Bet
Q2:Rolls  1.000000  0.000065
Q3:Bet    0.000065  1.000000