I'm still interested. Tell me more...
pip install numpy scipy scikit-learn
Dataset
* Text files of emails from "Machine Learning in Action", published by Manning
* Unzip email.zip before running the code below
* Small dataset of ham and spam
Build word vectors from the emails first, then use Naive Bayes to classify those word vectors
import re
import numpy as np
from glob import glob
# Split the text on runs of characters that are neither letters, digits nor
# underscore. The pattern has to be `\W+`, not `\W*`: a pattern that can match
# the empty string makes re.split() on Python 3.7+ cut between every single
# character, while older versions silently ignored the empty matches.
regEx = re.compile(r'\W+')
# Read one ham email; the context manager closes the file handle promptly.
with open('email/ham/1.txt') as email_file:
    email_text = email_file.read()
# Each token is roughly one word (or number) of the email.
list_of_tokens = regEx.split(email_text)
list_of_tokens
['Hi', 'Peter', 'With', 'Jose', 'out', 'of', 'town', 'do', 'you', 'want', 'to', 'meet', 'once', 'in', 'a', 'while', 'to', 'keep', 'things', 'going', 'and', 'do', 'some', 'interesting', 'stuff', 'Let', 'me', 'know', 'Eugene']
# Read one spam email; the context manager closes the file handle promptly
# instead of leaving it open until garbage collection.
with open('email/spam/1.txt') as email_file:
    email_text = email_file.read()
# Each token is roughly one word (or number) of the email. Leading or trailing
# punctuation in the text produces empty-string tokens at the ends of the list.
list_of_tokens = regEx.split(email_text)
list_of_tokens
['', 'Codeine', '15mg', '30', 'for', '203', '70', 'VISA', 'Only', 'Codeine', 'Methylmorphine', 'is', 'a', 'narcotic', 'opioid', 'pain', 'reliever', 'We', 'have', '15mg', '30mg', 'pills', '30', '15mg', 'for', '203', '70', '60', '15mg', 'for', '385', '80', '90', '15mg', 'for', '562', '50', 'VISA', 'Only', '']
def parse_text(email_filename):
    """Read an email file and return its lowercase word tokens.

    The text is split on runs of non-word characters. Tokens of two
    characters or fewer are dropped; the remaining tokens are lowercased.

    Args:
        email_filename: path of the text file to read.

    Returns:
        A list of lowercase tokens in file order, each longer than two
        characters.
    """
    # Context manager so the file handle is closed as soon as it is read.
    with open(email_filename) as email_file:
        email_text = email_file.read()
    # `\W+` rather than `\W*`: a pattern that can match the empty string makes
    # re.split() on Python 3.7+ split between every character, which would
    # leave only one-character tokens and therefore an empty result.
    tokens = re.split(r'\W+', email_text)
    return [token.lower() for token in tokens if len(token) > 2]
def get_all_text(email_type):
    """Parse every ``.txt`` email stored under ``email/<email_type>/``.

    Args:
        email_type: name of the sub-directory of ``email/`` to read,
            e.g. ``'ham'`` or ``'spam'``.

    Returns:
        A list with one token list (see ``parse_text``) per file, ordered by
        file name. The list is empty when no file matches.
    """
    # glob() returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps the document order (and anything derived from it, such as a seeded
    # train/test split) reproducible across machines.
    filenames = sorted(glob('email/' + email_type + '/*.txt'))
    return [parse_text(filename) for filename in filenames]
def create_vocab_list(data_set):
    """Build the vocabulary: every distinct token across all documents.

    Args:
        data_set: iterable of documents, each an iterable of hashable,
            mutually comparable tokens (strings in this file).

    Returns:
        A sorted list of the unique tokens. Sorting makes the order, and so
        the meaning of each feature column built from it, identical on every
        run; iterating a set of strings directly can give a different order
        from one interpreter run to the next.
    """
    vocab_set = set()
    for document in data_set:
        vocab_set.update(document)  # in-place union with this document's tokens
    return sorted(vocab_set)
def bag_of_words(vocab_list, input_words):
    """Count how often each vocabulary word occurs in a document.

    Args:
        vocab_list: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocab_list[i]``. Words must be hashable.
        input_words: iterable of tokens making up one document.

    Returns:
        A list of ints with the same length as ``vocab_list``. Tokens that are
        not in the vocabulary are ignored. If a word appears more than once in
        ``vocab_list``, its count goes to the first occurrence.
    """
    # Build the word -> position lookup once, so each token costs one dict
    # lookup instead of two linear scans of the vocabulary list.
    index_of = {}
    for position, word in enumerate(vocab_list):
        index_of.setdefault(word, position)  # keep the first position, like list.index
    counts = [0] * len(vocab_list)
    for word in input_words:
        position = index_of.get(word)
        if position is not None:
            counts[position] += 1
    return counts
email_types = ['ham', 'spam']
ham = get_all_text('ham')
spam = get_all_text('spam')
all_documents = ham + spam
# One label per document, in the same order as all_documents. The counts are
# taken from the data rather than hard-coded, so labels and features stay
# aligned even if the number of files in either folder changes.
all_labels = ['ham'] * len(ham) + ['spam'] * len(spam)
vocab_list = create_vocab_list(all_documents)  # create vocabulary
# Convert the documents into word vectors
features = [bag_of_words(vocab_list, document) for document in all_documents]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(np.array(features).shape)
print(np.array(all_labels).shape)
(50, 692) (50,)
# Hold out 30% of the emails as a test set; random_state fixes the shuffle.
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the older sklearn.cross_validation module is kept only as a fallback for
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    np.array(features), np.array(all_labels), test_size=0.3, random_state=0)
# %-formatting keeps the output identical on Python 2 and Python 3.
print("Training set: %s %s" % (X_train.shape, y_train.shape))
print("Test set: %s %s" % (X_test.shape, y_test.shape))
Training set: (35, 692) (35,) Test set: (15, 692) (15,)
import numpy as np
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes model on the training word vectors, then predict
# labels for the held-out emails.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# Element-wise comparison gives a boolean array; summing it counts the misses.
# print(...) with a single formatted string works on Python 2 and Python 3.
print("Number of mislabeled points : %d" % (y_test != y_pred).sum())
print("Score: %s" % classifier.score(X_test, y_test))
Number of mislabeled points : 1 Score: 0.933333333333
from sklearn.metrics import classification_report
# Print the per-class summary comparing the true test labels with the
# predicted ones.
print(classification_report(y_true=y_test, y_pred=y_pred))
precision recall f1-score support ham 0.86 1.00 0.92 6 spam 1.00 0.89 0.94 9 avg / total 0.94 0.93 0.93 15
Iris Flower Data Set
- The data set consists of 50 samples from each of three species of Iris:
  - Iris setosa
  - Iris virginica
  - Iris versicolor
- Four features were measured from each sample:
  - the length of the sepals
  - the width of the sepals
  - the length of the petals
  - the width of the petals
import matplotlib.pyplot as plt
%matplotlib inline
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
Y = iris.target
# Plot the points
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
<matplotlib.text.Text at 0x113578e50>
from sklearn import cluster, datasets
iris = datasets.load_iris()
X_iris, y_iris = iris.data, iris.target  # features, labels
k_means = cluster.KMeans(n_clusters=3)
k_means.fit(X_iris)
# Does the k-means clustering recover the real species?
# Print every 10th cluster assignment next to every 10th true label.
# NOTE(review): cluster ids are presumably arbitrary numbers, so a different
# id does not by itself mean a wrong grouping; compare how the samples are
# grouped rather than whether the numbers match.
print(k_means.labels_[::10])
print(y_iris[::10])
[1 1 1 1 1 0 0 0 0 0 2 2 2 2 2] [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]