The first thing you need to do is to download this file: mp01.zip. It has the following content:

- submitted.py: Your homework. Edit, and then submit to Gradescope.
- mp01_notebook.ipynb: This is a Jupyter notebook to help you debug. You can completely ignore it if you want, although you might find that it gives you useful instructions.
- grade.py: Once your homework seems to be working, you can test it by typing python grade.py, which will run the tests in tests/test_visible.py.
- tests/test_visible.py: This file contains about half of the unit tests that Gradescope will run in order to grade your homework. If you can get a perfect score on these tests, then you should also get a perfect score on the additional hidden tests that Gradescope uses.
- solution.json: This file contains the solutions for the visible test cases, in JSON format. If the instructions are confusing you, please look at this file to see if it can help to clear up your confusion.
- data: This directory contains the data.
- reader.py: This is an auxiliary program that you can use to read the data.
- requirements.txt: This tells you which Python packages you need to have installed in order to run grade.py. You can install all of those packages by typing pip install -r requirements.txt or pip3 install -r requirements.txt.
This file (mp01_notebook.ipynb
) will walk you through the whole MP, giving you instructions and debugging tips as you go.
Reading the data
There are two types of data: visible data (provided to you), and hidden data (available only to the autograder on Gradescope). If you get your code working for the visible data, it should also work for the hidden data.
The visible dataset consists of 500 emails, a subset of the Enron-Spam dataset provided by Ion Androutsopoulos. MP02 will use a larger portion of the same dataset.
In order to help you load the data, we provide you with a utility script called reader.py. To use it, you will need to install nltk. It should be possible for you to do this by running the following code block:
!pip install nltk
Requirement already satisfied: nltk in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (3.8.1)
Requirement already satisfied: tqdm in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from nltk) (4.65.0)
Requirement already satisfied: joblib in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from nltk) (1.2.0)
Requirement already satisfied: regex>=2021.8.3 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from nltk) (2023.8.8)
Requirement already satisfied: click in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from nltk) (8.1.7)
Most of the code we provide, and most of the template code that you need to fill in, will be documented using docstrings so you can find information about each function by using help
:
import reader
help(reader)
Help on module reader:

NAME
    reader - This file is responsible for providing functions for reading the files

FUNCTIONS
    loadDir(dirname, stemming, lower_case, use_tqdm=True)
        Loads the files in the folder and returns a list of lists of words from the text in each file.

        Parameters:
        name (str): the directory containing the data
        stemming (bool): if True, use NLTK's stemmer to remove suffixes
        lower_case (bool): if True, convert letters to lowercase
        use_tqdm (bool, default:True): if True, use tqdm to show status bar

        Output:
        texts (list of lists): texts[m][n] is the n'th word in the m'th email
        count (int): number of files loaded

    loadFile(filename, stemming, lower_case)
        Load a file, and returns a list of words.

        Parameters:
        filename (str): the directory containing the data
        stemming (bool): if True, use NLTK's stemmer to remove suffixes
        lower_case (bool): if True, convert letters to lowercase

        Output:
        x (list): x[n] is the n'th word in the file

DATA
    bad_words = {'aed', 'eed', 'oed'}
    porter_stemmer = <PorterStemmer>
    tokenizer = RegexpTokenizer(pattern='\\w+', gaps=False, disc...ty=True...

FILE
    /Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring24/mp01/src/reader.py
Well, that's pretty straightforward. Let's use it to load the data directory.
import importlib
importlib.reload(reader)
texts, count = reader.loadDir('data',False,False)
100%|███████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 2285.73it/s]
print("There were",count,"files loaded")
There were 500 files loaded
print("The first file contained the following words:",texts[0])
The first file contained the following words: ['Subject', 'done', 'new', 'sitara', 'desk', 'request', 'ref', 'cc', '20000813', 'carey', 'per', 'scott', 's', 'request', 'below', 'the', 'following', 'business', 'unit', 'aka', 'desk', 'id', 'portfolio', 'was', 'added', 'to', 'global', 'production', 'and', 'unify', 'development', 'test', 'production', 'and', 'stage', 'please', 'copy', 'to', 'the', 'other', 'global', 'environments', 'thanks', 'dick', 'x', '3', '1489', 'updated', 'in', 'global', 'production', 'environment', 'gcc', 'code', 'desc', 'p', 'ent', 'subenti', 'data', '_', 'cd', 'ap', 'data', '_', 'desc', 'code', '_', 'id', 'a', 'sit', 'deskid', 'imcl', 'a', 'ena', 'im', 'cleburne', '9273', 'from', 'scott', 'mills', '08', '30', '2000', '08', '27', 'am', 'to', 'samuel', 'schott', 'hou', 'ect', 'ect', 'richard', 'elwood', 'hou', 'ect', 'ect', 'debbie', 'r', 'brackett', 'hou', 'ect', 'ect', 'judy', 'rose', 'hou', 'ect', 'ect', 'vanessa', 'schulte', 'corp', 'enron', 'enron', 'david', 'baumbach', 'hou', 'ect', 'ect', 'daren', 'j', 'farmer', 'hou', 'ect', 'ect', 'dave', 'nommensen', 'hou', 'ect', 'ect', 'donna', 'greif', 'hou', 'ect', 'ect', 'shawna', 'johnson', 'corp', 'enron', 'enron', 'russ', 'severson', 'hou', 'ect', 'ect', 'cc', 'subject', 'new', 'sitara', 'desk', 'request', 'this', 'needs', 'to', 'be', 'available', 'in', 'production', 'by', 'early', 'afternoon', 'sorry', 'for', 'the', 'short', 'notice', 'srm', 'x', '33548']
Joint, Conditional, and Marginal Distributions
In this week's MP, we will work with the following two random variables:
- $X_1=$ the number of times that word1 occurs in a text
- $X_2=$ the number of times that word2 occurs in a text
... where you can specify word1 and word2 as parameters of the function. In this section, we will compute the joint, conditional, and marginal distributions of $X_1$ and $X_2$. These will be estimated from the available data using the following formulas, where $N(X_1=x_1,X_2=x_2)$ is the number of texts in the dataset that contain $x_1$ instances of word1 and $x_2$ instances of word2:
Joint distribution:

$$P(X_1=x_1,X_2=x_2)=\frac{N(X_1=x_1,X_2=x_2)}{\sum_{x_1}\sum_{x_2} N(X_1=x_1,X_2=x_2)}$$

Marginal distributions:

$$P(X_1=x_1)=\sum_{x_2} P(X_1=x_1,X_2=x_2)$$

$$P(X_2=x_2)=\sum_{x_1} P(X_1=x_1,X_2=x_2)$$

Conditional distribution:

$$P(X_2=x_2|X_1=x_1)=\frac{P(X_1=x_1,X_2=x_2)}{P(X_1=x_1)}$$
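To make these formulas concrete, here is a tiny worked example. The count table below is made up for illustration; it is not taken from the MP data.

import numpy as np

# Hypothetical counts over 5 texts: N[x1, x2] = number of texts that contain
# x1 copies of word1 and x2 copies of word2.
N = np.array([[3, 1],
              [0, 1]])

Pjoint = N / N.sum()          # joint:       [[0.6, 0.2], [0.0, 0.2]]
P1 = Pjoint.sum(axis=1)       # marginal X1: [0.8, 0.2]
P2 = Pjoint.sum(axis=0)       # marginal X2: [0.6, 0.4]
Pcond = Pjoint / P1[:, None]  # conditional: [[0.75, 0.25], [0.0, 1.0]]

At this point, we'll load the file submitted.py.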
The file submitted.py is the only part of your work that the autograder will see. The only purpose of this notebook is to help you debug submitted.py. Once you have revised submitted.py enough to make this notebook work, go to the command line and type python grade.py. Once that command returns without errors, you can go ahead and submit your file submitted.py to the autograder. You can submit to the autograder as often as you want, but it will save you trouble if you debug as much as you can on your local machine before you submit to the autograder.
We will use importlib
in order to reload your submitted.py
over and over again. That way, every time you make a modification in submitted.py
, you can just re-run the corresponding block of this notebook, and it will reload submitted.py
with your modified code.
Since the file is called submitted.py, Python considers it to contain a module called submitted. As shown below, you can read the module's docstring by printing submitted.__doc__. You can also type help(submitted) to get a lot of information about the module, including its docstring, a list of all the functions it defines, and all of their docstrings. For more about docstrings, see, for example, https://www.python.org/dev/peps/pep-0257/.
import submitted
import importlib
importlib.reload(submitted)
print(submitted.__doc__)
This is the module you'll submit to the autograder. There are several function definitions, here, that raise RuntimeErrors. You should replace each "raise RuntimeError" line with a line that performs the function specified in the function's docstring.
Now it's time for you to open submitted.py
, and start editing it. You can open it in another Jupyter window by choosing "Open from Path" from the "File" menu, and then typing submitted.py
. Alternatively, you can use any text editor.
Once you have it open, try editing the function marginal_distribution_of_word_counts
so that its functionality matches its docstring. Here is what its docstring says:
help(submitted.marginal_distribution_of_word_counts)
Help on function marginal_distribution_of_word_counts in module submitted:

marginal_distribution_of_word_counts(texts, word0)
    Parameters:
    texts (list of lists) - a list of texts; each text is a list of words
    word0 (str) - the word that you want to count

    Output:
    Pmarginal (numpy array of length cX0) - Pmarginal[x0] = P(X0=x0), where
      X0 is the number of times that word0 occurs in a document
      cX0-1 is the largest value of X0 observed in the provided texts
Edit marginal_distribution_of_word_counts so that it does the task specified in its docstring. When you get the code working, you can find how often the word "company" occurs in any given document: once, twice, three times, and so on. It turns out that only 2.4% of texts contain the word "company" exactly once, 0.2% contain it twice, and 0.2% contain it four times; the remaining 97.2% don't contain it at all.
importlib.reload(submitted)
Pmarginal = submitted.marginal_distribution_of_word_counts(texts, 'company')
print(Pmarginal)
[0.972 0.024 0.002 0. 0.002]
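If you're stuck, here is a minimal sketch of one possible approach (the function name is mine, and this is just an illustration, not necessarily the intended solution):

import numpy as np

def marginal_sketch(texts, word0):
    # Count occurrences of word0 in each text, then turn the histogram of
    # those counts into relative frequencies.
    counts = [text.count(word0) for text in texts]
    Pmarginal = np.zeros(max(counts) + 1)
    for c in counts:
        Pmarginal[c] += 1 / len(texts)
    return Pmarginal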
Now, edit the function conditional_distribution_of_word_counts
.
importlib.reload(submitted)
help(submitted.conditional_distribution_of_word_counts)
Help on function conditional_distribution_of_word_counts in module submitted:

conditional_distribution_of_word_counts(texts, word0, word1)
    Parameters:
    texts (list of lists) - a list of texts; each text is a list of words
    word0 (str) - the first word that you want to count
    word1 (str) - the second word that you want to count

    Outputs:
    Pcond (numpy array, shape=(cX0,cX1)) - Pcond[x0,x1] = P(X1=x1|X0=x0), where
      X0 is the number of times that word0 occurs in a document
      cX0-1 is the largest value of X0 observed in the provided texts
      X1 is the number of times that word1 occurs in a document
      cX1-1 is the largest value of X1 observed in the provided texts

    CAUTION: If P(X0=x0) is zero, then P(X1=x1|X0=x0) should be np.nan.
Notice that, according to the definition of conditional probability, any probability conditioned on the event $X_0=3$ is undefined, because $P(X_0=3)=0$. In such cases, your code should return a value of np.nan
.
importlib.reload(submitted)
Pcond = submitted.conditional_distribution_of_word_counts(texts, "company", "sales")
print(Pcond)
[[0.97942387 0.01234568 0.00617284 0.00205761]
 [0.83333333 0.16666667 0.         0.        ]
 [1.         0.         0.         0.        ]
 [       nan        nan        nan        nan]
 [1.         0.         0.         0.        ]]
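Here is a sketch of one way such a table could be computed, including the nan handling required by the docstring's CAUTION (again, an illustration under my own naming, not the official solution):

import numpy as np

def conditional_sketch(texts, word0, word1):
    c0 = [t.count(word0) for t in texts]
    c1 = [t.count(word1) for t in texts]
    Njoint = np.zeros((max(c0) + 1, max(c1) + 1))
    for x0, x1 in zip(c0, c1):
        Njoint[x0, x1] += 1                      # joint histogram of counts
    rowsums = Njoint.sum(axis=1, keepdims=True)
    with np.errstate(divide='ignore', invalid='ignore'):
        return Njoint / rowsums                  # rows whose rowsum is 0 become nan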
Finally, consider the function joint_distribution_of_word_counts
, which uses the marginal and conditional probability tables as inputs:
importlib.reload(submitted)
help(submitted.joint_distribution_of_word_counts)
Help on function joint_distribution_of_word_counts in module submitted:

joint_distribution_of_word_counts(Pmarginal, Pcond)
    Parameters:
    Pmarginal (numpy array of length cX0) - Pmarginal[x0] = P(X0=x0)
    Pcond (numpy array, shape=(cX0,cX1)) - Pcond[x0,x1] = P(X1=x1|X0=x0)

    Output:
    Pjoint (numpy array, shape=(cX0,cX1)) - Pjoint[x0,x1] = P(X0=x0, X1=x1)
      X0 is the number of times that word0 occurs in a given text,
      X1 is the number of times that word1 occurs in the same text.

    CAUTION: if P(X0=x0)=0, then P(X0=x0,X1=x1)=0, even if P(X1=x1|X0=x0)=np.nan.
importlib.reload(submitted)
Pjoint = submitted.joint_distribution_of_word_counts(Pmarginal,Pcond)
print(Pjoint)
[[0.952 0.012 0.006 0.002]
 [0.02  0.004 0.    0.   ]
 [0.002 0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.002 0.    0.    0.   ]]
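With numpy broadcasting, this one can be nearly a one-liner. A sketch (the nan_to_num call implements the docstring's CAUTION, zeroing the nan rows wherever Pmarginal is zero):

import numpy as np

def joint_sketch(Pmarginal, Pcond):
    # Product rule: P(X0=x0, X1=x1) = P(X0=x0) * P(X1=x1 | X0=x0).
    return np.nan_to_num(Pmarginal[:, None] * Pcond)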
Mean Vector and Covariance Matrix
In order to study mean, variance, and covariance, let's first find the joint distribution of a pair of words that occur more frequently. How about "a" and "the"? Amazingly, as the following code shows, there is a small nonzero probability that "a" occurs 19 times and "the" occurs 58 times in the same text!
importlib.reload(submitted)
Pa = submitted.marginal_distribution_of_word_counts(texts, 'a')
Pthe_given_a = submitted.conditional_distribution_of_word_counts(texts, 'a', 'the')
Pa_the = submitted.joint_distribution_of_word_counts(Pa, Pthe_given_a)
print("Here is the joint distribution:")
print(Pa_the)
print("\n It has size", Pa_the.shape)
Here is the joint distribution:
[[0.248 0.078 0.056 ... 0.    0.    0.   ]
 [0.036 0.028 0.026 ... 0.    0.    0.   ]
 [0.006 0.006 0.014 ... 0.    0.    0.   ]
 ...
 [0.    0.    0.    ... 0.    0.    0.   ]
 [0.    0.    0.    ... 0.    0.    0.   ]
 [0.    0.    0.    ... 0.    0.    0.002]]

 It has size (20, 59)
We can plot this distribution as a 2d plot using matplotlib. First, make sure you have matplotlib installed:
!pip install matplotlib
Requirement already satisfied: matplotlib in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (3.7.1)
Requirement already satisfied: fonttools>=4.22.0 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (4.25.0)
Requirement already satisfied: numpy>=1.20 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (1.26.3)
Requirement already satisfied: python-dateutil>=2.7 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: contourpy>=1.0.1 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (1.2.0)
Requirement already satisfied: pillow>=6.2.0 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (10.0.1)
Requirement already satisfied: packaging>=20.0 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (23.1)
Requirement already satisfied: pyparsing>=2.3.1 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: cycler>=0.10 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: six>=1.5 in /Users/jhasegaw/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
import numpy as np
import matplotlib.pyplot as plt

cX0, cX1 = Pa_the.shape
# indexing='ij' makes x[i,j]=i and y[i,j]=j, so the raveled coordinate grids
# line up element-for-element with Pa_the.ravel(); the default 'xy' indexing
# would pair each bar with the wrong probability.
x, y = np.meshgrid(np.arange(cX0), np.arange(cX1), indexing='ij')
base = np.zeros((cX0, cX1))   # all bars start at height zero
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111, projection='3d')
ax.bar3d(x.ravel(), y.ravel(), base.ravel(), 1, 1, Pa_the.ravel(), shade=True)
ax.set_title("Counts of the words /a/ and /the/ have the following joint pmf:")
ax.set_xlabel("$x_0=$ frequency of /a/")
ax.set_ylabel("$x_1=$ frequency of /the/")
ax.set_zlabel("$P(X_0=x_0,X_1=x_1)$")
Text(0.5, 0, '$P(X_0=x_0,X_1=x_1)$')
It looks like the mean vector will be pretty close to $\mu=[0,0]$. Let's find out.
importlib.reload(submitted)
help(submitted.mean_vector)
Help on function mean_vector in module submitted:

mean_vector(Pjoint)
    Parameters:
    Pjoint (numpy array, shape=(cX0,cX1)) - Pjoint[x0,x1] = P(X0=x0, X1=x1)

    Outputs:
    mu (numpy array, length 2) - the mean of the vector [X0, X1]
importlib.reload(submitted)
mu = submitted.mean_vector(Pa_the)
print(mu)
[1.364 4.432]
That's a bit of a surprise: the mean of $X_1$ is higher than the mean of $X_0$! That result wasn't obvious in the figure, unless you noticed that the maximum value of $X_1$ is 58, while the maximum value of $X_0$ is only 19.
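If your mean_vector doesn't reproduce this, note that each entry of $\mu$ is just a probability-weighted sum over the index values. Here is a sketch of that computation (an illustration, not necessarily the intended solution):

import numpy as np

def mean_sketch(Pjoint):
    x0 = np.arange(Pjoint.shape[0])
    x1 = np.arange(Pjoint.shape[1])
    # E[X0] weights the x0 values by the marginal of X0, and likewise for X1.
    return np.array([x0 @ Pjoint.sum(axis=1),
                     x1 @ Pjoint.sum(axis=0)])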
Now let's try to find the matrix of variances and covariances.
importlib.reload(submitted)
help(submitted.covariance_matrix)
Help on function covariance_matrix in module submitted:

covariance_matrix(Pjoint, mu)
    Parameters:
    Pjoint (numpy array, shape=(cX0,cX1)) - Pjoint[x0,x1] = P(X0=x0, X1=x1)
    mu (numpy array, length 2) - the mean of the vector [X0, X1]

    Outputs:
    Sigma (numpy array, shape=(2,2)) - matrix of variance and covariances of [X0,X1]
importlib.reload(submitted)
Sigma = submitted.covariance_matrix(Pa_the, mu)
print(Sigma)
[[ 4.891504  9.244752]
 [ 9.244752 41.601376]]
A few things to notice:
- The variance of $X_1$ is larger than the variance of $X_0$. This is because $X_1$ varies over a larger range than $X_0$, with nonzero probabilities throughout that range.
- The covariance of $X_0$ and $X_1$ is positive, meaning that a large value of $X_0$ tends to co-occur with a large value of $X_1$. Probably, this just means that long texts have larger counts of both the words "a" and "the".
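In case covariance_matrix is giving you trouble, recall that $\Sigma_{ij}=E[(X_i-\mu_i)(X_j-\mu_j)]$, which can be computed with an index grid. A sketch (illustration only):

import numpy as np

def covariance_sketch(Pjoint, mu):
    # x0[i,j]=i and x1[i,j]=j, so the grids line up with Pjoint's entries.
    x0, x1 = np.meshgrid(np.arange(Pjoint.shape[0]),
                         np.arange(Pjoint.shape[1]), indexing='ij')
    d0, d1 = x0 - mu[0], x1 - mu[1]              # deviations from the mean
    return np.array([[np.sum(d0 * d0 * Pjoint), np.sum(d0 * d1 * Pjoint)],
                     [np.sum(d0 * d1 * Pjoint), np.sum(d1 * d1 * Pjoint)]])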
A Function of Random Variables is a Random Variable
Finally, let's calculate a new random variable by taking a function of the random variables $X_0$ and $X_1$. Any function of random variables is a random variable, and its distribution is
$$P(f(X_0,X_1)=z)=\sum_{x_0,x_1:f(x_0,x_1)=z} P(X_0=x_0,X_1=x_1)$$

Let's read the docstring:
importlib.reload(submitted)
help(submitted.distribution_of_a_function)
Help on function distribution_of_a_function in module submitted:

distribution_of_a_function(Pjoint, f)
    Parameters:
    Pjoint (numpy array, shape=(cX0,cX1)) - Pjoint[x0,x1] = P(X0=x0, X1=x1)
    f (function) - f should be a function that takes two real-valued inputs, x0 and x1.
      The output, z=f(x0,x1), may be any hashable value (number, string, or even a tuple).

    Output:
    Pfunc (Counter) - Pfunc[z] = P(Z=z)
      Pfunc should be a collections.defaultdict or collections.Counter, so that
      previously unobserved values of z have a default setting of Pfunc[z]=0.
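Here is a sketch of how such a function might accumulate the sum from the formula above (an illustration under my own naming, not necessarily the intended solution):

from collections import Counter

def distribution_sketch(Pjoint, f):
    Pfunc = Counter()
    for x0 in range(Pjoint.shape[0]):
        for x1 in range(Pjoint.shape[1]):
            # add P(X0=x0, X1=x1) to P(Z=z) for z = f(x0, x1)
            Pfunc[f(x0, x1)] += Pjoint[x0, x1]
    return Pfunc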
You can read about the defaultdict and Counter data types at https://docs.python.org/3/library/collections.html. Basically, they are just dictionaries with a default value for any previously unseen keys.
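For example, a Counter returns 0 for any key it has never seen, which is exactly the default behavior the docstring asks for:

from collections import Counter

c = Counter()
c['spam'] += 0.25        # no need to initialize c['spam'] first
print(c['spam'])         # 0.25
print(c['never seen'])   # 0 (no KeyError)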
Let's create a new random variable whose value is a string, rather than being a number. Here is the function:
def f(x0,x1):
if x0<1 and x1 < 1:
return "Zero"
elif x0 < 2 and x1 < 2:
return "Small"
else:
return "Big"
print("f(0,0)=",f(0,0))
print("f(0,15)=",f(0,15))
print("f(1,1)=",f(1,1))
print("f(19,58)=",f(19,58))
f(0,0)= Zero
f(0,15)= Big
f(1,1)= Small
f(19,58)= Big
importlib.reload(submitted)
Pz = submitted.distribution_of_a_function(Pa_the, f)
print(Pz)
Counter({'Big': 0.6100000000000003, 'Zero': 0.24799999999999997, 'Small': 0.142})
As you can see, the random variable $f(X_0,X_1)$ has a 24.8% probability of being the string "Zero", a 14.2% probability of being the string "Small", and a 61% probability of being the string "Big". Let's plot this probability mass function.
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,2))
ax = fig.add_subplot(111)
Zvals = ['Zero','Small','Big']
ax.bar(np.arange(3), [Pz[z] for z in Zvals], tick_label=Zvals)
ax.set_xlabel('Instance value $z=f(x_0,x_1)$')
ax.set_ylabel('$P(f(X_0,X_1)=z)$')
ax.set_title('Probability Mass Function of a Function of Two Random Variables')
Text(0.5, 1.0, 'Probability Mass Function of a Function of Two Random Variables')
Grade your homework
If you've reached this point, and all of the above sections work, then you're ready to try grading your homework! Before you submit it to Gradescope, try grading it on your own machine. This will run some visible test cases (which you can read in tests/test_visible.py
), and compare the results to the solutions (which you can read in solution.json
).
The exclamation point (!) tells Jupyter to run the following as a shell command. Obviously you don't need to run the code this way; this usage is here just to remind you that you can also, if you wish, run this command in a terminal window.
!python grade.py
EE............
======================================================================
ERROR: test_extra (test_extra.TestStep)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring24/mp01/src/tests/test_extra.py", line 16, in test_extra
    hyp_p, hyp = extra.estimate_geometric(Pa)
  File "/Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring24/mp01/src/extra.py", line 13, in estimate_geometric
    raise RuntimeError("You need to write this")
RuntimeError: You need to write this

======================================================================
ERROR: test_extra (test_extra_hidden.TestStep)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring24/mp01/src/tests/test_extra_hidden.py", line 16, in test_extra
    hyp_p, hyp = extra.estimate_geometric(Pa)
  File "/Users/jhasegaw/Dropbox/mark/teaching/ece448/ece448labs/spring24/mp01/src/extra.py", line 13, in estimate_geometric
    raise RuntimeError("You need to write this")
RuntimeError: You need to write this

----------------------------------------------------------------------
Ran 14 tests in 0.431s

FAILED (errors=2)
If your code is working, then as shown above, the only error you get should be from the extra credit part (test_extra.py
).
If you got any other 'E' marks, it means that your code generated some runtime errors, and you need to debug those.
If you got any 'F' marks, it means that your code ran without errors, but that it generated results that are different from the solutions in solution.json. Try debugging those differences.
If neither of those things happened, and your result was a series of dots except for the one error associated with test_extra.py
, then your code works perfectly.
If you're not sure, you can try running grade.py with the -j option. This will produce a JSON results file, in which you should get a score of 50% if only the visible tests are available on your machine (the output shown below was generated on a machine that also had the hidden tests installed, so it reports 100).
!python grade.py -j
{ "tests": [ { "name": "test_extra (test_extra.TestStep)", "score": 0.0, "max_score": 5, "status": "failed", "output": "Test Failed: You need to write this\n" }, { "name": "test_extra (test_extra_hidden.TestStep)", "score": 0.0, "max_score": 5, "status": "failed", "output": "Test Failed: You need to write this\n" }, { "name": "test_cond (test_hidden.TestStep)", "score": 8, "max_score": 8, "status": "passed" }, { "name": "test_covariance (test_hidden.TestStep)", "score": 8, "max_score": 8, "status": "passed" }, { "name": "test_distribution_of_function (test_hidden.TestStep)", "score": 8, "max_score": 8, "status": "passed" }, { "name": "test_joint (test_hidden.TestStep)", "score": 9, "max_score": 9, "status": "passed" }, { "name": "test_marginal (test_hidden.TestStep)", "score": 9, "max_score": 9, "status": "passed" }, { "name": "test_mean (test_hidden.TestStep)", "score": 8, "max_score": 8, "status": "passed" }, { "name": "test_cond (test_visible.TestStep)", "score": 8, "max_score": 8, "status": "passed" }, { "name": "test_covariance (test_visible.TestStep)", "score": 8, "max_score": 8, "status": "passed" }, { "name": "test_distribution_of_function (test_visible.TestStep)", "score": 8, "max_score": 8, "status": "passed" }, { "name": "test_joint (test_visible.TestStep)", "score": 9, "max_score": 9, "status": "passed" }, { "name": "test_marginal (test_visible.TestStep)", "score": 9, "max_score": 9, "status": "passed" }, { "name": "test_mean (test_visible.TestStep)", "score": 8, "max_score": 8, "status": "passed" } ], "leaderboard": [], "visibility": "visible", "execution_time": "0.23", "score": 100.0 }
Now you should try uploading submitted.py
to Gradescope.
Gradescope will run the same visible tests that you just ran on your own machine, plus some additional hidden tests. It's possible that your code passes all the visible tests, but fails the hidden tests. If that happens, then it probably means that you hard-coded a number into your function definition, instead of using the input parameter that you were supposed to use. Debug by running your function with a variety of different input parameters, and see if you can get it to respond correctly in all cases.
Once your code works perfectly on Gradescope, with no errors, then you are done with the MP. Congratulations!
Extra Credit
On many of the machine problems (not all), extra credit of up to 10% will be available for doing a problem that goes a little bit beyond the material we've covered in lecture.
On MP01, for extra credit, let's model the frequency of a word as a geometric random variable. A geometric random variable, $Y$, is one whose pmf is given by $P(Y=y)=p(1-p)^y$ for all non-negative integer values of $y$, where $p$ is a parameter called the "success probability" or the "stopping probability."
In order to model an observed random variable ($X$) using a geometric random variable ($Y$), the easiest way to estimate the model is by calculating $E[X]$, then choosing the parameter $p$ so that $E[Y]=E[X]$. The mean of a geometric random variable is $E[Y]=\frac{1-p}{p}$.
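Setting $E[Y]=E[X]$ and solving $\frac{1-p}{p}=E[X]$ for $p$ gives

$$p=\frac{1}{1+E[X]}$$

For example, the mean count of the word "a" computed earlier was $E[X]=1.364$, so $p=1/2.364\approx 0.423$; you can check this against the output below.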
For extra credit, try estimating the parameter $p$ that matches the observed mean of a non-negative integer random variable. The template code is in extra.py
, which has just one function for you to complete, the function estimate_geometric
:
import extra, importlib
importlib.reload(extra)
help(extra.estimate_geometric)
Help on function estimate_geometric in module extra:

estimate_geometric(PX)
    @param:
    PX (numpy array of length cX): PX[x] = P(X=x), the observed probability mass function

    @return:
    p (scalar): the parameter of a matching geometric random variable
    PY (numpy array of length cX): PY[x] = P(Y=x), the first cX values of the pmf
      of a geometric random variable such that E[Y]=E[X].
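Using the relation $p=1/(1+E[X])$ derived above, a minimal sketch might look like this (illustration only):

import numpy as np

def estimate_geometric_sketch(PX):
    EX = np.arange(len(PX)) @ PX             # observed mean, E[X]
    p = 1 / (1 + EX)                         # choose p so that E[Y] = (1-p)/p = E[X]
    PY = p * (1 - p) ** np.arange(len(PX))   # first len(PX) values of the geometric pmf
    return p, PY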
When you have the code working, you can test it by finding a geometric distribution model for the number of occurrences of the word "a":
importlib.reload(extra)
p, PY = extra.estimate_geometric(Pa)
print('p=',p)
print('The first five entries in the model pmf are',PY[:5])
p= 0.4230118443316413
The first five entries in the model pmf are [0.42301184 0.24407282 0.14082713 0.08125559 0.04688351]
import matplotlib.pyplot as plt
fig, axs = plt.subplots(2,1,figsize=(14,4))
axs[0].bar(np.arange(len(Pa)), Pa)    # observed pmf of the count of "a"
axs[0].set_title('Observed probability mass function')
axs[0].set_ylabel('$P(X=x)$')
axs[1].bar(np.arange(len(PY)), PY)    # fitted geometric model
axs[1].set_title('Geometric distribution model')
axs[1].set_ylabel('$P(Y=x)$')
axs[1].set_xlabel('Instance value, $x$, of the number of occurrences of the word "a"')
fig.tight_layout()
You can test your extra credit by running grade.py
again.
!python grade.py
.......
----------------------------------------------------------------------
Ran 7 tests in 0.102s

OK
When that works, try uploading your file extra.py to Gradescope, under the heading MP01 Extra Credit.