Friday, December 19, 2014

Image Classification: Dogs Vs Cats

I wanted to learn how machine learning is used to classify images (image recognition). While browsing Kaggle's past competitions I found the Dogs Vs Cats image classification competition, where the task is to classify whether an image contains a dog or a cat. A Google search helped me get started. Here are some references I found quite useful: Yhat's Image Classification in Python and the scikit-image tutorial. The data is available here. I am using the first 501 dog images and the first 501 cat images from the train data folder. For testing, I selected the first 100 images from the test data folder and manually labeled each one for verification.

##########################################
# View files in the directory
ls

Out:
Image_Classification.ipynb    data/

# View files in the data directory
ls data

Out:
data/    test/    train/

# Import necessary libraries
import pandas as pd
import numpy as np

from skimage import io
from matplotlib import pyplot as plt


# Define location of data
import os
train_directory = "./data/train/"
test_directory = "./data/test/"

# Define a function to return a list containing the names of the files in a directory given by path
def images(image_directory):
    return [image_directory+image for image in os.listdir(image_directory)]

images(train_directory)

Out:

In the training directory, the image filename indicates the label (cat or dog), so the labels need to be extracted from the filenames.

## Extracting training image labels
train_image_names = images(train_directory)

# Function to extract labels
def extract_labels(file_names):
    '''Create labels from file names: Cat = 0 and Dog = 1'''
    
    # Create empty vector of length = no. of files, filled with zeros 
    n = len(file_names)
    y = np.zeros(n, dtype = np.int32)
    
    # Enumerate gives index
    for i, filename in enumerate(file_names):
        
        # If 'cat' string is in file name assign '0'
        if 'cat' in str(filename):
            y[i] = 0
        else:
            y[i] = 1
    return y

extract_labels(train_image_names)

Out:
array([0, 0, 0, ..., 1, 1, 1], dtype=int32)

# Save labels
y = extract_labels(train_image_names)

# Save labels: np.save(file or string, array)
np.save('y', y)
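The labels can be reloaded in a later session with np.load (np.save appends the .npy extension to the file name):

# Reload the saved labels
y = np.load('y.npy')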

# Images in test directory
images(test_directory)

Out:


## View image: Dog
# from skimage import io # (imported earlier)
temp = io.imread('./data/train/dog.20.jpg') 
plt.imshow(temp)

Out:
## View image: Cat
# from skimage import io # (imported earlier)
temp = io.imread('./data/train/cat.4.jpg') 
plt.imshow(temp)

Out:

Sorting the folder by size showed that the images come in different sizes (largest: cat.835.jpg, smallest: cat.4821.jpg), so a standard size is needed for analysis.

# Get size of images (Ref: stackoverflow)
from PIL import Image

image_size = [ ]
for i in train_image_names:
    im = Image.open(i)
    image_size.append(im.size) # A list of tuples: [(width, height), ...]

# Get mean of image size (Ref: stackoverflow)
# Note: the loop variable is named 'dim', not 'y', so the labels vector 'y' is not overwritten
[sum(dim) / len(dim) for dim in zip(*image_size)]

Out: [403, 358]

Transforming the image: Standard size = (400, 350)

## Transforming the image
# Set up a standard image size based on approximate mean size

STANDARD_SIZE = (400, 350)

The code below is copied from Yhat's Image Classification in Python:

# Function to read an image, resize it to STANDARD_SIZE, and convert it to a numpy array
def img_to_matrix(filename, verbose=False):
    '''
    takes a filename and turns it into a numpy array of RGB pixels
    '''
    img = Image.open(filename)
    if verbose:
        print "Changing size from %s to %s" % (str(img.size), str(STANDARD_SIZE))
    img = img.resize(STANDARD_SIZE)
    img = list(img.getdata())  # list of (R, G, B) tuples, one per pixel
    img = map(list, img)       # convert tuples to lists
    img = np.array(img)        # shape: (width * height, 3)
    return img

# Function to flatten numpy array
def flatten_image(img):
    
    '''
    takes in an (m, n) numpy array and flattens it 
    into an array of shape (1, m * n)
    '''
    s = img.shape[0] * img.shape[1]
    img_wide = img.reshape(1, s)
    return img_wide[0]
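As a quick sanity check (a sketch using one of the training images viewed earlier), a single resized image should flatten to 400 * 350 * 3 = 420,000 values:

# One image: 400 * 350 pixels, 3 RGB channels each
sample = img_to_matrix('./data/train/dog.20.jpg')
flatten_image(sample).shape

Out: (420000,)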

## Prepare training data
data = []
for i in images(train_directory):
    img = img_to_matrix(i)
    img = flatten_image(img)
    data.append(img)
    
data = np.array(data)
data.shape

Out: (1002, 420000)

data[1].shape

Out: (420000, )

That is 420,000 features per image, which is too many for most algorithms to handle directly, so the dimensionality should be reduced. For this we can use an unsupervised learning technique, Principal Component Analysis (PCA), which identifies patterns in the data and reduces its dimensionality with minimal loss of information.

# Import PCA
from sklearn.decomposition import PCA

# PCA on training data
pca = PCA(n_components = 2)
X = pca.fit_transform(data)
X.size

Out: 2004

X[:, 0].size

Out: 1002

X[:, 1].size

Out: 1002

# Create a dataframe
df = pd.DataFrame({"x-1": X[:, 0], "x-2": X[:, 1], "label" : np.where(y == 1, "Dog", "Cat")})
df

Out: 



# Proportion of variance explained by the two components
np.sum(pca.explained_variance_ratio_)

Out: 0.6461222455062432

Here, 2-dimensional PCA captures 64.6% of the variance.
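To see how much additional components would help, one could fit a larger PCA and inspect the cumulative explained-variance ratio (a sketch, not part of the original analysis; 50 components is an arbitrary choice):

# Cumulative variance explained by the first 50 principal components
pca_50 = PCA(n_components = 50)
pca_50.fit(data)
np.cumsum(pca_50.explained_variance_ratio_)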

## Prepare testing data: PCA
test_images = images(test_directory)

test = [ ]
for i in test_images:
    img = img_to_matrix(i)
    img = flatten_image(img)
    test.append(img)

test = np.array(test)
test.shape

Out: (100, 420000)

# Transforming test data (using the PCA already fitted on the training data,
# not refitting it, so train and test share the same components)
testX = pca.transform(test)
testX.shape[1]

Out: 2

## Logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression( )
logreg = clf.fit(X, y)

# Predict using Logistic Regression
y_predict_logreg = logreg.predict(testX)
y_predict_logreg
Out:


## Logistic Regression: Accuracy

# Load 'Actual' labels for test data
actual = pd.read_csv('ActualLabels.csv')
actual['Labels'].head( ) 

Out:

logreg_accuracy = np.where(y_predict_logreg == actual['Labels'], 1, 0).sum()/float(len(actual))

logreg_accuracy 

Out: 0.54

54% of the images were correctly classified using logistic regression (2D PCA)
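With only 100 hand-labeled test images, this accuracy estimate is noisy. Cross-validation on the training data gives a steadier estimate (a sketch; in scikit-learn versions from that era the import path is sklearn.cross_validation, in current versions it is sklearn.model_selection):

# 5-fold cross-validation accuracy on the 1002 training images
from sklearn.cross_validation import cross_val_score # sklearn.model_selection in newer versions
scores = cross_val_score(LogisticRegression(), X, y, cv = 5)
print scores.mean()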


## KNN classifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X, y)

Out: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')


# Predict using KNN classifier
y_predict_knn = knn.predict(testX)
y_predict_knn
Out:

## KNN: Accuracy
knn_accuracy = np.where(y_predict_knn == actual['Labels'], 1, 0).sum()/float(len(actual))
knn_accuracy

Out: 0.52

52% of the images were correctly classified using KNN (2D PCA)

More sophisticated approaches, such as Support Vector Machines or Neural Networks, would likely classify the images with higher accuracy.
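For example, a support vector machine can be dropped into the same workflow (a minimal sketch with default, untuned hyperparameters, assuming the same X, y, testX and actual as above):

from sklearn.svm import SVC

svm = SVC()
svm.fit(X, y)
y_predict_svm = svm.predict(testX)
svm_accuracy = np.where(y_predict_svm == actual['Labels'], 1, 0).sum()/float(len(actual))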

Friday, December 5, 2014

Sentiment Analysis on Rotten Tomatoes Movie Reviews

For the past couple of weeks I have been reading and learning Natural Language Processing (NLP) basics from Dr. Christopher Potts' (Stanford University, Department of Linguistics) online tutorial. Kaggle's knowledge-based competition, Sentiment Analysis on Movie Reviews, motivated me to learn the basics of NLP (a pretty interesting area of research).

I will be using Python (IPython notebook) to analyze the data and scikit-learn (a machine learning library for Python) to predict sentiment labels. The analysis and prediction done here are based on scikit-learn's Working with Text Data tutorial. The movie reviews are from the Rotten Tomatoes dataset. The sentiment labels are as follows:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive

##########################################
# View files in the directory
ls

Out:
RottenTomatoes.ipynb    train.tsv    test.tsv

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading training and testing data (*.tsv: Tab-Separated Values)
train = pd.read_csv('train.tsv', sep = '\t')
test = pd.read_csv('test.tsv', sep = '\t')

# View training data (top 5 instances)
train.head()

Out:


# View testing data (top 5 instances)
test.head()

Out:


# Unique sentiment labels
train['Sentiment'].unique()

Out:
array([1, 2, 3, 4, 0])

# Type of data frame
type(train)

Out:
pandas.core.frame.DataFrame

# Summary of data (Works only for numerical data)
train.describe()

Out:

Extracting features from the text, i.e. converting the text content into numerical feature vectors.

Working on explanations of Bag of Words, Tokenizing text, Term Frequency, Term Frequency times Inverse Document Frequency, Naive Bayes Classifier...
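As a tiny illustration of the bag-of-words idea (a made-up two-phrase corpus, not the competition data), each row of the count matrix is one phrase and each column counts one vocabulary word:

# Toy bag-of-words example on two short phrases
from sklearn.feature_extraction.text import CountVectorizer
toy_vector = CountVectorizer()
toy_counts = toy_vector.fit_transform(['a good movie', 'not a good movie, a bad movie'])
print toy_vector.get_feature_names()
print toy_counts.toarray()

Here the vocabulary is ['bad', 'good', 'movie', 'not'] (the default tokenizer drops single-letter words like 'a'), and the second phrase's row counts 'movie' twice.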

# Tokenizing text with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
x_train_counts = count_vector.fit_transform(train['Phrase'])

# Dimensions of the training data count vector
x_train_counts.shape

Out:
(156060, 15240)


# Get index of some common words/n-grams/consecutive characters
# For example: 'movie'
count_vector.vocabulary_.get(u'movie')

Out:
8791

# Get feature names
count_vector.get_feature_names()

Out:

# Converting occurrences to frequencies
from sklearn.feature_extraction.text import TfidfTransformer

## Term Frequencies (tf)
# Use fit() method to fit estimator to the data
tf_transformer = TfidfTransformer(use_idf = False).fit(x_train_counts)
# Use transform() method to transform count-matrix to 'tf' representation
x_train_tf = tf_transformer.transform(x_train_counts)

## Term Frequency times Inverse Document Frequency (tf-idf)
tfidf_transformer = TfidfTransformer()
# Use fit_transform() method to fit to the data, then transform count-matrix to 'tf-idf' representation
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)


## Training a classifier to predict sentiment label of a phrase
# Naive Bayes Classifier (Multinomial)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(x_train_tfidf, train['Sentiment'])

## Prediction on test data
# Tokenizing test phrase
x_test_counts = count_vector.transform(test['Phrase'])
# Use transform() method to transform test count-matrix to 'tf-idf' representation
x_test_tfidf = tfidf_transformer.transform(x_test_counts)

# Prediction
predicted = clf.predict(x_test_tfidf)

# View predictions
for i, j in zip(test['PhraseId'], predicted):
    print(i, j)

Out:

# Writing *csv file for Kaggle submission
import csv
with open('Rotten_Sentiment.csv', 'w') as csvfile:
    csvfile.write('PhraseId,Sentiment\n')
    for i, j in zip(test['PhraseId'], predicted):
        csvfile.write('{},{}\n'.format(i, j))
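The vectorize, transform, and classify steps above can also be chained with scikit-learn's Pipeline, which keeps training and test preprocessing consistent (a sketch of the same workflow, not how the submission above was produced):

# Same workflow as above, expressed as a single scikit-learn Pipeline
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf.fit(train['Phrase'], train['Sentiment'])
predicted = text_clf.predict(test['Phrase'])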


Finally, I submitted the test data sentiment label predictions on the competition's submission page and got a score of 0.58289. Below is a screenshot of the leaderboard standings.





Thursday, November 13, 2014

"Titanic: Machine Learning from Disaster" Data Analysis using Python


After reading Why is Python a language of choice for data scientists?, Is Python Becoming the King of the Data Science Forest?, and other related blogs, I decided to brush up and improve my Python programming skills (after a couple of years of hiatus). At UCI I had used Matlab intensively, and I had learned Python from YouTube videos for some of my research calculations.

Last week I studied and practiced Python programming with Codecademy's online Python course. It is a really nice, easy-to-follow, interactive course. The estimated course time is 13 hours, but it took me nearly 26 hours to finish. After finishing the course, I decided to analyze data using Python to familiarize myself with Python's data analysis library pandas, the scientific computing libraries NumPy and SciPy, the plotting library matplotlib (IMO, R's ggplot2 package produces much better-looking plots than matplotlib), and scikit-learn for machine learning in Python.

For analyzing data I am using the Titanic: Machine Learning from Disaster data from Kaggle's knowledge-based competition. A major reason to use this data is that many online Python tutorials and blogs use it, which makes learning and understanding easier.


Note: This is not a tutorial. The data analysis done here is based on various online Titanic Data related Python tutorials/blogs. 

#############################################################
### Kaggle Competition: Titanic Machine Learning from Disaster
# Import important libraries and modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as p
import sklearn

# Reading Titanic (training) data
train = pd.read_csv("/Users/Ankoor/Desktop/Python/Kaggle/Titanic/train.csv")

# View dataframe
train

# View first 'n' rows [similar to R's head()]
train.head(5)

Out:

# View last 'n' rows [similar to R's tail()]
train.tail(3)

# Get column names (features / attributes) in data frame [Similar to R's names()]
list(train)

Out:



train.columns  # another command to get feature names

# What kind of data array is 'train'?
type(train)


Out: pandas.core.frame.DataFrame

# Data types in 'train'
train.dtypes

Out:

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Information about the data, i.e. how to find missing values
train.info()

Out:

There are 891 observations. The features 'Age' (714 non-missing), 'Cabin' (204 non-missing) and 'Embarked' (889 non-missing) have missing data.


# Checking missing values in the data: Age and Cabin
sum(train['Age'].isnull()) 

Out: 177

177 'Age' observations missing

sum(train['Cabin'].isnull())

Out: 687

687 'Cabin' observations missing

Note: .isnull() flags NaN/None values; it will not flag empty or placeholder strings.

# Describe data: Count, Mean, STD, Min, Max [Similar to R's summary()]
train.describe()

Out:

# Referencing in Pandas
train['Age'][0:10]

Out:

# Referencing: Other method (Kind of similar to R when you replace '.' with '$')
train.Age[0:10]

# Type of referenced data?
type(train['Age'])
type(train.Age)  # another command to get type of referenced data

## Some basic statistics like mean and median
# Mean Age (Ignoring missing values)
train.Age.mean()
train['Age'].mean()

Out: 29.69911764705882

# Median Fare (Ignoring missing values)
train.Fare.median()
train['Fare'].median()

Out: 14.4542

# Unique values
train.Sex.unique()


Out: array(['male', 'female'], dtype=object)

train['Embarked'].unique()

Out: array(['S', 'C', 'Q', nan], dtype=object)

train['Pclass'].unique()

Out: array([3, 1, 2])


3 Passenger classes

# Getting subsets of dataframe
train[['Sex', 'Pclass', 'Age']]

Out: 

# Filtering data with 'Age' > 60 years
train[train['Age'] > 60]

Out:

# Filtering and sub-setting data with 'Age' > 60 years
train[train['Age'] > 60][['Sex', 'Pclass', 'Age', 'Survived']]

Out:

# Filtering and sub-setting data with missing values
train[train['Age'].isnull()][['Sex', 'Pclass', 'Age', 'Survived']]

# Counting # of males in each passenger class
for i in range(1, 4):
    print i, len(train[(train['Sex'] == 'male') & (train['Pclass'] == i)])

Out: 

1 122
2 108
3 347


# Counting # of females in each passenger class
for i in range(1, 4):
    print i, len(train[(train['Sex'] == 'female') & (train['Pclass'] == i)])

Out: 

1 94
2 76
3 144

Passenger class 3 has more male and more female passengers than classes 1 and 2.

# Simple Histogram of Age
train['Age'].hist()
p.show()

Out:
# Histogram of Age (after dropping missing values); alpha controls transparency
train['Age'].dropna().hist(bins = 16, range = (0, 80), alpha = 0.5)

Out:


## Cleaning data: Transforming 'String values'
# 1. Adding a new column and filling it with a number
train['Gender'] = 4 

# 2. Populating the new column 'Gender' with M or F
train['Gender'] = train['Sex'].map(lambda x: x[0].upper())

# 3. Populating the new column with binary integers
train['Gender'] = train['Sex'].map({'female': 0, 'male': 1}).astype(int)

## Feature Engineering: Name attribute has honorific titles like Mr., Mrs., etc
# Extracting salutation from Name (Format: Last Name, Title, First Name)
# Name example: Dahlberg, Miss. Gerda Ulrika

def title(name):
    temp_1 = name.split(',') # Split by (,)
    temp_2 = temp_1[1].split('.')[0] # Split by (.)
    temp_3 = temp_2.strip() # Remove white space
    return temp_3
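As a quick check, the helper returns the expected title for the example name above:

title('Dahlberg, Miss. Gerda Ulrika')

Out: 'Miss'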
    
train['Title'] = train['Name'].apply(title) # Apply function 'title' to 'Name'

train[['PassengerId', 'Survived', 'Sex', 'Pclass', 'Age', 'Gender', 'Title']]


Out:

## How to count passenger by Title 
# Grouping by Title
temp_4 = train.groupby('Title')

# Counting passengers by Title
temp_5 = temp_4.PassengerId.count()
print temp_5  

Out:

# Barplot: Passenger count by title
temp_5.plot(kind = 'bar')

Out:


The majority of passengers had one of four honorific titles: Mr, Mrs, Miss, and Master. I will rename (1) titles like Capt, Don, Dr, Jonkheer, Major, Rev and Sir to Mr; (2) Lady, Mme, Ms, and the Countess to Mrs; and (3) Mlle to Miss.

# How many males and females are Doctors?
train[train['Title'] == 'Dr']

Out:


6 male doctors and 1 female doctor (Dr. Alice (Farnham) Leader)

## Create a 'Temp' column in train and fill it with concatenated 'Sex' and 'Title' string values
train['Temp'] = train['Sex'] + train['Title']

## Replace concatenated value 'femaleDr' value with 'Mrs'
train.loc[train['Temp'] == 'femaleDr', 'Title'] = 'Mrs'

# Drop 'Temp' column
train = train.drop(['Temp'], axis = 1) 


## There are 4 main titles: Mr, Mrs, Master and Miss, and some other titles
# Taking care of other titles 
def new_title(title):
    if title in ['Mr', 'Capt', 'Don', 'Dr', 'Jonkheer', 'Major', 'Rev', 'Sir', 'Col']:
        return 'Mr'
    elif title in ['Mrs', 'Lady', 'Mme', 'Ms', 'the Countess']:
        return 'Mrs'
    elif title in ['Miss', 'Mlle']:
        return 'Miss'
    else:
        return 'Master'
    
train['NewTitle'] = train['Title'].apply(new_title)

# Drop 'Title' attribute
train = train.drop(['Title'], axis = 1)

# Grouping by Title
temp_6 = train.groupby('NewTitle')

# Counting passengers by Title
temp_7 = temp_6.PassengerId.count()
print temp_7 
temp_7.plot(kind = 'bar')


Out:



Now all the passenger honorific titles have been updated.

## Now descriptive statistics plots to understand data and survival chance

train.boxplot(column = 'Age', by = 'NewTitle')

Out:

Outliers: a Miss with Age around 60? Using train[(train['Age'] > 30) & (train['NewTitle'] == 'Miss')] I found that some females older than 30 have 'Miss' in their title (maybe they were unmarried, or for some other reason).

train.boxplot(column = 'Fare', by = 'Pclass')

Out:
Outliers: some passengers in First Class paid more than $200 for tickets; maybe they paid for their whole family.

# Passenger distribution by Passenger Class and Survival Chance
group_1 = train.groupby('Pclass').PassengerId.count()
group_1.plot(kind = 'bar')

Out:

Almost half of the passengers were 3rd class passengers

group_2 = train.groupby('Pclass').Survived.sum()
Pclass_Survival_Prob = group_2/group_1
Pclass_Survival_Prob.plot(kind = 'bar', color = 'pink', alpha = 0.65)

Out:

However, proportionally more First and Second Class passengers survived compared to Third Class passengers (maybe better access to lifeboats/life jackets, or easier access to the upper decks?).


# Passenger distribution by Passenger Class, Gender and Survival Chance
# Barplot using Cross-tabulation
group_3 = pd.crosstab([train.Pclass, train.Sex], train.Survived)
group_3.plot(kind = 'bar', stacked = True, color = ['black', 'yellow'])

Out:




Compared to males, more females survived the disaster.

# Some other related plots
group_4 = pd.crosstab([train.Pclass, train.Sex, train.Embarked], train.Survived)
group_4.plot(kind = 'bar', stacked = True, color = ['black', 'yellow'], alpha = 0.5)

Out:

group_5 = pd.crosstab([train.Pclass, train.NewTitle], train.Survived)
group_5.plot(kind = 'bar', stacked = True, color = ['black', 'yellow'])

Out:

group_6 = pd.crosstab([train.Embarked, train.NewTitle], train.Survived)
group_6.plot(kind = 'bar', stacked = True, color = ['black', 'yellow'])

Out:

# Feature Engineering: Family size
train['Family'] = train['SibSp'] + train['Parch']
group_7 = pd.crosstab([train.Pclass, train.Family], train.Survived)
group_7.plot(kind = 'bar', stacked = True, color = ['black', 'yellow'], alpha = 0.25)

## Imputing missing values in attribute 'Age'. I found the code used below at this blog.
# View dataframe: 'Age' = NaN

train[train['Age'].isnull()].head()

table = train.pivot_table(values = 'Age', index = ['NewTitle'], columns = ['Pclass', 'Sex'], aggfunc = np.mean)

def ageFunc(x):
    return table[x['Pclass']][x['Sex']][x['NewTitle']]
    
train['Age'].fillna(train[train['Age'].isnull()].apply(ageFunc, axis = 1), inplace = True)
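A quick check (a sketch) that the imputation filled everything; this should print 0 as long as every (Pclass, Sex, NewTitle) group had at least one known age (the .astype(int) below would fail on any remaining NaN):

# Count remaining missing 'Age' values after imputation
print sum(train['Age'].isnull())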

train['Age'] = train['Age'].astype(int)




# Some more plots
# Specifying plot parameters
# figsize = (x inches, y inches), dpi = n dots per inch
fig = plt.figure(figsize = (11, 8), dpi = 1600)


# Plot: 1
ax1 = fig.add_subplot(221) # .add_subplot(rcp): r = row, c = col, p = position
female_hiclass = train['Survived'][train['Sex'] == 'female'][train['Pclass'] != 3].value_counts()
female_hiclass.plot(kind = 'bar', label = 'Female High Class', color = 'deeppink', alpha = 0.25)
ax1.set_xticklabels(['Survived', 'Dead'], rotation = 0)
ax1.set_xlim(-1, len(female_hiclass))
ax1.set_ylim(0, 400)
plt.legend(loc = 'best')

# Plot: 2
ax2 = fig.add_subplot(222) # .add_subplot(rcp): r = row, c = col, p = position
female_loclass = train['Survived'][train['Sex'] == 'female'][train['Pclass'] == 3].value_counts()
female_loclass.plot(kind = 'bar', label = 'Female Low Class', color = 'pink', alpha = 0.25)
ax2.set_xticklabels(['Survived', 'Dead'], rotation = 0)
ax2.set_xlim([-1, len(female_loclass)])
ax2.set_ylim(0, 400)
plt.legend(loc = 'best')

# Plot: 3
ax3 = fig.add_subplot(223) # .add_subplot(rcp): r = row, c = col, p = position
male_hiclass = train['Survived'][train['Sex'] == 'male'][train['Pclass'] != 3].value_counts()
male_hiclass.plot(kind = 'bar', label = 'Male High Class', color = 'teal', alpha = 0.25)
ax3.set_xticklabels(['Dead', 'Survived'], rotation = 0)
ax3.set_xlim(-1, len(male_hiclass))
ax3.set_ylim(0, 400)
plt.legend(loc = 'best')

# Plot: 4
ax4 = fig.add_subplot(224) # .add_subplot(rcp): r = row, c = col, p = position
male_loclass = train['Survived'][train['Sex'] == 'male'][train['Pclass'] == 3].value_counts()
male_loclass.plot(kind = 'bar', label = 'Male Low Class', color = 'green', alpha = 0.25)
ax4.set_xticklabels(['Dead', 'Survived'], rotation = 0)
ax4.set_xlim(-1, len(male_loclass))
ax4.set_ylim(0, 400)
plt.legend(loc = 'best')

Out:

Females in the higher classes had a better chance of survival than females in third class. Irrespective of class, more male passengers perished than female passengers.
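The claim can be quantified directly (a quick sketch using the same train dataframe): the mean of the binary 'Survived' column within each group is that group's survival rate.

# Survival rate by passenger class and sex
print train.groupby(['Pclass', 'Sex']).Survived.mean()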