The first thing you need to do is to download mp02.zip. Its content is similar to that of MP01.
This file (mp02_notebook.ipynb) will walk you through the whole MP, giving you instructions and debugging tips as you go.
The dataset in your template package consists of 10000 positive and 3000 negative movie reviews. It is a subset of the Stanford Movie Review Dataset, which was originally introduced by this paper. We have split this data set for you into 5000 development examples and 8000 training examples. The autograder also has a hidden set of test examples, generally similar to the development dataset.
The data folder is structured like this:
├─ train
│  ├─ neg
│  │  └─ 2000 negative movie reviews (text)
│  └─ pos
│     └─ 6000 positive movie reviews (text)
└─ dev
   ├─ neg
   │  └─ 1000 negative movie reviews (text)
   └─ pos
      └─ 4000 positive movie reviews (text)
In order to help you load the data, we provide you with a utility module called reader.py. It has two new functions that didn't exist in MP01:
import reader, importlib
importlib.reload(reader)
help(reader.loadTrain)
Help on function loadTrain in module reader:

loadTrain(dirname, stemming, lower_case, use_tqdm=True)
    Loads a training dataset.

    Parameters:
    dirname (str): the directory containing the data
        - dirname/y should contain training examples from class y
    stemming (bool): if True, use NLTK's stemmer to remove suffixes
    lower_case (bool): if True, convert letters to lowercase
    use_tqdm (bool, default:True): if True, use tqdm to show status bar

    Output:
    train (dict of list of lists):
        - train[y][i][k] = k'th token of i'th text of class y
This time the text files have not been lowercased for you in advance, so you probably want to lowercase them using the lower_case bool:
importlib.reload(reader)
train = reader.loadTrain('data/train', False, True)
100%|██████████| 2000/2000 [00:00<00:00, 10929.16it/s]
100%|██████████| 6000/6000 [00:00<00:00, 11735.77it/s]
for y in train.keys():
print("There were",len(train[y]),"texts loaded for class",y)
There were 2000 texts loaded for class neg
There were 6000 texts loaded for class pos
print("The first positive review is:",train['pos'][0])
The first positive review is: ['i', 'went', 'and', 'saw', 'this', 'movie', 'last', 'night', 'after', 'being', 'coaxed', 'to', 'by', 'a', 'few', 'friends', 'of', 'mine', 'i', 'll', 'admit', 'that', 'i', 'was', 'reluctant', 'to', 'see', 'it', 'because', 'from', 'what', 'i', 'knew', 'of', 'ashton', 'kutcher', 'he', 'was', 'only', 'able', 'to', 'do', 'comedy', 'i', 'was', 'wrong', 'kutcher', 'played', 'the', 'character', 'of', 'jake', 'fischer', 'very', 'well', 'and', 'kevin', 'costner', 'played', 'ben', 'randall', 'with', 'such', 'professionalism', 'the', 'sign', 'of', 'a', 'good', 'movie', 'is', 'that', 'it', 'can', 'toy', 'with', 'our', 'emotions', 'this', 'one', 'did', 'exactly', 'that', 'the', 'entire', 'theater', 'which', 'was', 'sold', 'out', 'was', 'overcome', 'by', 'laughter', 'during', 'the', 'first', 'half', 'of', 'the', 'movie', 'and', 'were', 'moved', 'to', 'tears', 'during', 'the', 'second', 'half', 'while', 'exiting', 'the', 'theater', 'i', 'not', 'only', 'saw', 'many', 'women', 'in', 'tears', 'but', 'many', 'full', 'grown', 'men', 'as', 'well', 'trying', 'desperately', 'not', 'to', 'let', 'anyone', 'see', 'them', 'crying', 'this', 'movie', 'was', 'great', 'and', 'i', 'suggest', 'that', 'you', 'go', 'see', 'it', 'before', 'you', 'judge']
In order to understand Naive Bayes, it is useful to know the difference between bigram types and bigram tokens. A bigram is a pair of adjacent words; a bigram token is each individual occurrence of such a pair in a text, while a bigram type is a distinct pair, counted once no matter how many times it occurs.
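For example, here is a quick illustration (a toy example, not part of the MP):

from collections import Counter

# A toy "text": a list of word tokens.
text = ['the', 'movie', 'was', 'the', 'movie']

# Bigram tokens: every occurrence of a pair of adjacent words.
bigram_tokens = [(text[k], text[k+1]) for k in range(len(text)-1)]
print(bigram_tokens)
# [('the', 'movie'), ('movie', 'was'), ('was', 'the'), ('the', 'movie')]

# This text contains 4 bigram tokens but only 3 bigram types,
# because the type ('the', 'movie') occurs twice.
print(len(bigram_tokens))           # 4
print(len(Counter(bigram_tokens)))  # 3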
A Naive Bayes model consists of two types of probability distributions: the prior, $P(\text{Class})$, and the likelihood, $P(\text{Bigram}|\text{Class})$.
The prior can be estimated from the training data. In your training data, $P(\text{Class}=\text{pos})=0.75$, since 6000 of the 8000 training texts are positive.
Often, though, the testing data will have a different class distribution than the training data. If you don't know the testing priors, then it's sometimes best to just assume a uniform distribution, i.e., $P(\text{Class}=\text{pos})=0.5$.
The likelihood is the informative part of a Naive Bayes model: it tells you which words are used more often in negative versus positive movie reviews.
There are many ways in which you can estimate the likelihood. The following formula is called the maximum likelihood estimate, because it maximizes the likelihood of the words in your training dataset:
$$P(\text{Bigram}=x|\text{Class}=y)=\frac{\text{\# tokens of bigram}~x~\text{in texts of class}~y}{\text{\# tokens of any bigram in texts of class}~y}$$

In this part of the MP, you will estimate what are called frequency tables. The frequency of $x$ given $y$ is the number of times that bigram $x$ occurred in texts of class $y$. The relevant method in submitted.py is the one called create_frequency_table:
import submitted, importlib
importlib.reload(submitted)
help(submitted.create_frequency_table)
Help on function create_frequency_table in module submitted: create_frequency_table(train) Parameters: train (dict of list of lists) - train[y][i][k] = k'th token of i'th text of class y Output: frequency (dict of Counters): - frequency[y][x] = number of occurrences of bigram x in texts of class y, where x is in the format 'word1*-*-*-*word2'
Edit create_frequency_table so that it does what its docstring says it should do.
Hint: your code will be shorter if you use the Python data structure called a Counter.
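If you get stuck, one possible approach looks something like this (a minimal sketch, assuming the bigram-key format described in the docstring; it is an illustration, not the official solution):

from collections import Counter

def create_frequency_table(train):
    # train[y][i][k] = k'th token of i'th text of class y
    frequency = {}
    for y, texts in train.items():
        frequency[y] = Counter()
        for text in texts:
            # count every occurrence of each adjacent word pair
            for k in range(len(text) - 1):
                frequency[y][text[k] + '*-*-*-*' + text[k+1]] += 1
    return frequency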
When your code works, you should get the following results:
importlib.reload(submitted)
frequency = submitted.create_frequency_table(train)
print("frequency['pos'][('this', 'film')]=",frequency['pos']['this*-*-*-*film'])
print("frequency['neg'][('this', 'film')]=",frequency['neg']['this*-*-*-*film'])
print("\n")
print("frequency['pos'][('the', 'movie')]=",frequency['pos']['the*-*-*-*movie'])
print("frequency['neg'][('the', 'movie')]=",frequency['neg'][('the*-*-*-*movie')])
print("\n")
print("frequency['pos'][('of', 'the')]=",frequency['pos']['of*-*-*-*the'])
print("frequency['neg'][('of', 'the')]=",frequency['neg']['of*-*-*-*the'])
print("\n")
print("frequency['pos'][('to', 'be')]=",frequency['pos']['to*-*-*-*be'])
print("frequency['neg'][('to', 'be')]=",frequency['neg']['to*-*-*-*be'])
print("\n")
print("frequency['pos'][('and', 'the')]=",frequency['pos']['and*-*-*-*the'])
print("frequency['neg'][('and', 'the')]=",frequency['neg']['and*-*-*-*the'])
print("\n")
print("--------------------------------------\n")
print("Total # tokens in pos texts is",sum(frequency['pos'].values()))
print("Total # tokens in neg texts is",sum(frequency['neg'].values()))
print("\n")
print("Total # types in pos texts is",len(frequency['pos'].keys()))
print("Total # types in neg texts is",len(frequency['neg'].keys()))
frequency['pos'][('this', 'film')]= 2656
frequency['neg'][('this', 'film')]= 879

frequency['pos'][('the', 'movie')]= 2507
frequency['neg'][('the', 'movie')]= 1098

frequency['pos'][('of', 'the')]= 10008
frequency['neg'][('of', 'the')]= 2779

frequency['pos'][('to', 'be')]= 2500
frequency['neg'][('to', 'be')]= 1057

frequency['pos'][('and', 'the')]= 3342
frequency['neg'][('and', 'the')]= 983

--------------------------------------

Total # tokens in pos texts is 1421513
Total # tokens in neg texts is 468194

Total # types in pos texts is 475651
Total # types in neg texts is 195021
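As a sanity check, the maximum likelihood estimate follows directly from these counts. For example:

# MLE of P(Bigram=('this','film') | Class=pos), using the totals printed above
p = frequency['pos']['this*-*-*-*film'] / sum(frequency['pos'].values())
print(p)  # 2656 / 1421513, approximately 0.001868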
There are many common word pairs (bigrams) that may seem to be unrelated to whether a movie review is positive or negative. Due to the nature of the training data, it's possible that some bigrams, consisting entirely of common words like "is", "of", "and", etc., are more frequent in one part of the training data than another. This can be problematic, as it means a test review might be wrongly classified as "positive" just because it contains many instances of innocuous bigrams like ("and", "the").
A "stopword list" is a list of words that should not be considered when you classify a test text. In the context of bigrams, we consider a bigram as a stopword if both of its constituent words are in the stopword list. There are many candidate stopword lists available on the internet. The stopword list that we've provided for you is based on this one: https://www.ranks.nl/stopwords.
To emphasize, we consider a bigram as a stopword if "both" of its constituent words are in the stopword list. Both!
Here is our stopword list:
importlib.reload(submitted)
print(sorted(submitted.stopwords))
["'d", "'ll", "'m", "'re", "'s", "'t", "'ve", 'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'cannot', 'could', 'couldn', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', 'has', 'hasn', 'have', 'haven', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', 'it', 'its', 'itself', 'let', 'll', 'me', 'more', 'most', 'mustn', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', 'shan', 'she', 'should', 'shouldn', 'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasn', 'we', 'were', 'weren', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'with', 'won', 'would', 'wouldn', 'you', 'your', 'yours', 'yourself', 'yourselves']
To effectively avoid counting bigrams that are considered stopwords, two steps are necessary: first, delete stopword bigrams from the training frequency table; second, skip stopword bigrams when classifying a test text (that happens later, in naive_bayes). In this part of the MP, you take the first step: set the frequencies of those bigram stopwords to zero. Use the del command (see Counters), so that these bigrams don't get counted among either the bigram types or the bigram tokens.
importlib.reload(submitted)
help(submitted.remove_stopwords)
Help on function remove_stopwords in module submitted:

remove_stopwords(frequency)
    Parameters:
    frequency (dict of Counters):
        - frequency[y][x] = number of occurrences of bigram x in texts of class y,
          where x is in the format 'word1*-*-*-*word2'
    stopwords (set of str):
        - Set of stopwords to be excluded

    Output:
    nonstop (dict of Counters):
        - nonstop[y][x] = frequency of bigram x in texts of class y, but only if
          x is not a stopword bigram (i.e., not both of its words are in the
          stopword list). x is in the format 'word1*-*-*-*word2'
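One possible sketch, assuming it lives inside submitted.py where the module-level stopwords set is in scope (an illustration, not the official solution):

from collections import Counter

def remove_stopwords(frequency):
    # copy the Counters so that the original frequency table is unchanged
    nonstop = {y: Counter(frequency[y]) for y in frequency}
    for y in nonstop:
        for x in list(nonstop[y].keys()):  # list() lets us delete while iterating
            word1, word2 = x.split('*-*-*-*')
            if word1 in stopwords and word2 in stopwords:
                del nonstop[y][x]  # a bigram is a stopword only if BOTH words are
    return nonstop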
importlib.reload(submitted)
nonstop = submitted.remove_stopwords(frequency)
print("frequency['pos'][('this', 'film')]=",frequency['pos']['this*-*-*-*film'])
print("frequency['pos'][('this', 'film')]=",nonstop['pos']['this*-*-*-*film'])
print("\n")
print("frequency['pos'][('the', 'movie')]=",frequency['pos']['the*-*-*-*movie'])
print("frequency['pos'][('the', 'movie')]=",nonstop['pos']['the*-*-*-*movie'])
print("\n")
print("frequency['pos'][('of', 'the')]=",frequency['pos']['of*-*-*-*the'])
print("frequency['pos'][('of', 'the')]=",nonstop['pos']['of*-*-*-*the'])
print("\n")
print("frequency['pos'][('to', 'be')]=",frequency['pos']['to*-*-*-*be'])
print("frequency['pos'][('to', 'be')]=",nonstop['pos']['to*-*-*-*be'])
print("\n")
print("frequency['pos'][('and', 'the')]=",frequency['pos']['and*-*-*-*the'])
print("frequency['pos'][('and', 'the')]=",nonstop['pos']['and*-*-*-*the'])
print("\n")
print("--------------------------------------\n")
print("Total pos frequency:",sum(frequency['pos'].values()))
print("Total pos non-stopwords",sum(nonstop['pos'].values()))
print("\n")
print("Total # types in pos texts is",len(frequency['pos'].keys()))
print("Total # non-stopwords in pos is",len(nonstop['pos'].keys()))
print("Length of the stopwords set is:",len(submitted.stopwords))
frequency['pos'][('this', 'film')]= 2656
nonstop['pos'][('this', 'film')]= 2656

frequency['pos'][('the', 'movie')]= 2507
nonstop['pos'][('the', 'movie')]= 2507

frequency['pos'][('of', 'the')]= 10008
nonstop['pos'][('of', 'the')]= 0

frequency['pos'][('to', 'be')]= 2500
nonstop['pos'][('to', 'be')]= 0

frequency['pos'][('and', 'the')]= 3342
nonstop['pos'][('and', 'the')]= 0

--------------------------------------

Total pos frequency: 1421513
Total pos non-stopwords: 1168682

Total # types in pos texts is 475651
Total # non-stopwords in pos is 468246
Length of the stopwords set is: 150
The maximum likelihood formula results in some bigrams having zero probability, just because they were not contained in your training data. A better formula is given by Laplace smoothing, according to which
$$P(\text{Bigram}=x|\text{Class}=y)=\frac{\left(\text{\# tokens of bigram}~x~\text{in texts of class}~y\right)+k}{\left(\text{\# tokens of any bigram in texts of class}~y\right)+k\times\left(\text{\# of bigram types}+1\right)}$$

...where $k$ is a hyperparameter that is usually chosen by trying several different values, and choosing the value that gives you the best accuracy on your development dataset.
The +1 in the denominator accounts for bigrams that were never seen in the training dataset for class $y$. All such bigrams are mapped to the single type OOV (out of vocabulary), which has the likelihood

$$P(\text{Bigram}=\text{OOV}|\text{Class}=y)=\frac{k}{\left(\text{\# tokens of any bigram in texts of class}~y\right)+k\times\left(\text{\# of bigram types}+1\right)}$$
In this part of the MP, the method you'll create in submitted.py is called laplace_smoothing.
importlib.reload(submitted)
help(submitted.laplace_smoothing)
Help on function laplace_smoothing in module submitted:

laplace_smoothing(nonstop, smoothness)
    Parameters:
    nonstop (dict of Counters)
        - nonstop[y][x] = frequency of bigram x in y, where x is in the format
          'word1*-*-*-*word2' and x is not a stopword bigram
    smoothness (float)
        - smoothness = Laplace smoothing hyperparameter

    Output:
    likelihood (dict of dicts)
        - likelihood[y][x] = Laplace-smoothed likelihood of bigram x given y,
          where x is in the format 'word1*-*-*-*word2'
        - likelihood[y]['OOV'] = likelihood of an out-of-vocabulary bigram given y

    Important: Be careful that your vocabulary only counts bigrams that occurred
    at least once in the training data for class y.
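A sketch of one way to implement the formula above (illustrative only; note how the OOV term uses the same denominator as the seen bigrams):

def laplace_smoothing(nonstop, smoothness):
    likelihood = {}
    for y in nonstop:
        total = sum(nonstop[y].values())  # number of bigram tokens in class y
        ntypes = len(nonstop[y])          # number of bigram types seen in class y
        denominator = total + smoothness * (ntypes + 1)
        likelihood[y] = {x: (count + smoothness) / denominator
                         for x, count in nonstop[y].items()}
        likelihood[y]['OOV'] = smoothness / denominator
    return likelihood

By construction, the seen-bigram terms plus the OOV term sum to 1 (up to floating-point rounding), which is what the check below verifies.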
importlib.reload(submitted)
likelihood = submitted.laplace_smoothing(frequency, 0.001)
print("likelihood['pos'][('this', 'film')]=",likelihood['pos']['this*-*-*-*film'])
print("likelihood['neg'][('this', 'film')]=",likelihood['neg']['this*-*-*-*film'])
print("\n")
print("likelihood['pos']['OOV']=",likelihood['pos']['OOV'])
print("likelihood['neg']['OOV']=",likelihood['neg']['OOV'])
print("\n")
print("(should be approx. 1): likelihood['pos'] sums to",sum(likelihood['pos'].values()))
print("(should be approx. 1): Likelihood['neg'] sums to",sum(likelihood['neg'].values()))
likelihood['pos'][('this', 'film')]= 0.0018678074513916727
likelihood['neg'][('this', 'film')]= 0.0018766473139073699

likelihood['pos']['OOV']= 7.032404925267997e-10
likelihood['neg']['OOV']= 2.134977450432218e-09

(should be approx. 1): likelihood['pos'] sums to 0.9999999999864526
(should be approx. 1): likelihood['neg'] sums to 1.0000000000037939
Suppose you are given a text, which is just a list of word tokens, $x=[x_1,\ldots,x_n]$. You want to decide whether this text is a positive movie review or a negative review. According to decision theory, the probability of error is minimized by the following rule:
$$\text{Estimated Class}=\left\{\begin{array}{ll} \text{pos}~\text{if}~P(\text{Class}=\text{pos}|\text{Text}=x) > P(\text{Class}=\text{neg}|\text{Text}=x)\\ \text{neg}~\text{if}~P(\text{Class}=\text{pos}|\text{Text}=x) < P(\text{Class}=\text{neg}|\text{Text}=x)\\ \text{undecided}~\text{if}~P(\text{Class}=\text{pos}|\text{Text}=x) = P(\text{Class}=\text{neg}|\text{Text}=x)\end{array}\right.$$

For a bigram model, the text $x$ is considered as a sequence of bigram tokens $[(x_1, x_2), (x_2, x_3), \ldots, (x_{n-1}, x_n)]$. The posterior probabilities $P(\text{Class}|\text{Text})$ can be estimated using the Naive Bayes model:
$$P(\text{Class}=y|\text{Text}=x)=\frac{P(\text{Class}=y)}{P(\text{Text}=x)}\prod_{\substack{i=1\\ i\not\in\text{stopword bigrams}}}^{n-1}P(\text{Bigram}=(x_i,x_{i+1})|\text{Class}=y)$$

Notice some details:

- The product skips stopword bigrams, i.e., positions $i$ at which both $x_i$ and $x_{i+1}$ are stopwords.
- If a bigram $(x_i,x_{i+1})$ does not appear in likelihood[y], then you should use likelihood[y]["OOV"].

For this part of the MP, finish the method called submitted.naive_bayes:
importlib.reload(submitted)
help(submitted.naive_bayes)
Help on function naive_bayes in module submitted:

naive_bayes(texts, likelihood, prior)
    Parameters:
    texts (list of lists)
        - texts[i][k] = k'th token of i'th text
    likelihood (dict of dicts)
        - likelihood[y][x] = Laplace-smoothed likelihood of bigram x given y,
          where x is in the format 'word1*-*-*-*word2'
    prior (float)
        - prior = the prior probability of the class called "pos"

    Output:
    hypotheses (list)
        - hypotheses[i] = class label for the i'th text
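Here is a rough sketch (assuming, as before, the stopwords set from submitted.py is in scope, and that 0 < prior < 1). Since $P(\text{Text}=x)$ is the same for both classes, it can be ignored when comparing; working with log-probabilities is one common way to avoid floating-point underflow, though the docstring doesn't require it:

import numpy as np

def naive_bayes(texts, likelihood, prior):
    hypotheses = []
    for text in texts:
        logpos = np.log(prior)      # accumulate log-probabilities to avoid underflow
        logneg = np.log(1 - prior)
        for k in range(len(text) - 1):
            if text[k] in stopwords and text[k+1] in stopwords:
                continue            # skip stopword bigrams
            x = text[k] + '*-*-*-*' + text[k+1]
            logpos += np.log(likelihood['pos'].get(x, likelihood['pos']['OOV']))
            logneg += np.log(likelihood['neg'].get(x, likelihood['neg']['OOV']))
        if logpos > logneg:
            hypotheses.append('pos')
        elif logpos < logneg:
            hypotheses.append('neg')
        else:
            hypotheses.append('undecided')
    return hypotheses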
Use reader.loadDev to load the dev set, then try classifying it with, say, a prior of 0.5:
importlib.reload(reader)
texts, labels = reader.loadDev('data/dev', False, True, True)
for y in ['neg','pos']:
print("There are",labels.count(y),'examples of class',y)
100%|██████████| 1000/1000 [00:00<00:00, 10638.69it/s]
100%|██████████| 4000/4000 [00:00<00:00, 11661.83it/s]
There are 1000 examples of class neg
There are 4000 examples of class pos
importlib.reload(submitted)
hypotheses = submitted.naive_bayes(texts, likelihood, 0.5)
for y in ['neg','pos', 'undecided']:
print("There are",hypotheses.count(y),'examples that were labeled with class',y)
There are 808 examples that were labeled with class neg
There are 4192 examples that were labeled with class pos
There are 0 examples that were labeled with class undecided
print(len(hypotheses))
print(len(labels))
5000
5000
print("The accuracy of the classifier on the dev set is:")
count_correct = 0
for (y,yhat) in zip(labels, hypotheses):
if y==yhat:
count_correct += 1
print(count_correct / len(labels))
The accuracy of the classifier on the dev set is:
0.884
The performance of the model is heavily influenced by two parameters that can't be measured from the training data: the prior, $P(\text{Class}=\text{pos})$, and the Laplace smoothing parameter, $k$.
Since these two parameters can't be (correctly) estimated from the training data, they are called hyperparameters. Hyperparameters are usually determined based on your knowledge about the problem, or by running a lot of experiments to see which values give the best result on the development test data.
The function you'll write in this part of the MP is called optimize_hyperparameters.
importlib.reload(submitted)
help(submitted.optimize_hyperparameters)
Help on function optimize_hyperparameters in module submitted:

optimize_hyperparameters(texts, labels, nonstop, priors, smoothnesses)
    Parameters:
    texts (list of lists)
        - dev set texts: texts[i][k] = k'th token of i'th text
    labels (list)
        - dev set labels: labels[i] = class label of i'th text
    nonstop (dict of Counters)
        - nonstop[y][x] = frequency of bigram x in class y, where x is not a stopword bigram
    priors (list)
        - a list of different possible values of the prior
    smoothnesses (list)
        - a list of different possible values of the smoothness

    Output:
    accuracies (numpy array, shape = len(priors) x len(smoothnesses))
        - accuracies[m,n] = dev set accuracy achieved using the m'th candidate prior
          and the n'th candidate smoothness
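A sketch of one straightforward implementation, reusing the laplace_smoothing and naive_bayes functions from the previous sections (again illustrative, not the official solution). Note that the likelihood only needs to be recomputed when the smoothness changes, not the prior:

import numpy as np

def optimize_hyperparameters(texts, labels, nonstop, priors, smoothnesses):
    accuracies = np.zeros((len(priors), len(smoothnesses)))
    for n, smoothness in enumerate(smoothnesses):
        likelihood = laplace_smoothing(nonstop, smoothness)  # depends only on smoothness
        for m, prior in enumerate(priors):
            hypotheses = naive_bayes(texts, likelihood, prior)
            correct = sum(1 for y, yhat in zip(labels, hypotheses) if y == yhat)
            accuracies[m, n] = correct / len(labels)
    return accuracies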
Let's use this function to test some different candidate values for the prior and the smoothness. The values we test are a little arbitrary, but let's try the following:
importlib.reload(submitted)
import numpy as np
priors = [0.5,0.65,0.75]
smoothnesses = [0.0001,0.001,0.01]
accuracies = submitted.optimize_hyperparameters(texts,labels,nonstop,priors,smoothnesses)
(m,n) = np.unravel_index(np.argmax(accuracies), accuracies.shape)
print("The best accuracy achieved was",accuracies[m,n])
print("It was achieved for a prior of", priors[m], "and a smoothness of", smoothnesses[n])
The best accuracy achieved was 0.885
It was achieved for a prior of 0.75 and a smoothness of 0.001
If you've reached this point, and all of the above sections work, then you're ready to try grading your homework! Before you submit it to Gradescope, try grading it on your own machine. This will run some visible test cases (which you can read in tests/test_visible.py), and compare the results to the solutions (which you can read in solution.json).
The exclamation point (!) tells Jupyter to run the following line as a shell command. Obviously you don't need to run the code this way -- this usage is here just to remind you that you can also, if you wish, run this command in a terminal window.
!python grade.py
..........
----------------------------------------------------------------------
Ran 10 tests in 31.553s

OK
If you got any 'E' marks, it means that your code generated some runtime errors, and you need to debug those.
If you got any 'F' marks, it means that your code ran without errors, but that it generated results that are different from the solutions in solution.json. Try debugging those differences.
If neither of those things happened, and your result was a series of dots, then your code works perfectly.
If you're not sure, you can try running grade.py with the -j option. This will produce a JSON results file, in which the best score you can get is 50.
Now you should try uploading submitted.py to Gradescope.
Gradescope will run the same visible tests that you just ran on your own machine, plus some additional hidden tests. It's possible that your code passes all the visible tests, but fails the hidden tests. If that happens, then it probably means that you hard-coded a number into your function definition, instead of using the input parameter that you were supposed to use. Debug by running your function with a variety of different input parameters, and see if you can get it to respond correctly in all cases.
Once your code works perfectly on Gradescope, with no errors, then you are done with the MP. Congratulations!