Short Term Course on Machine Learning for Practitioners

Ankita Mandal and Aparajita Khan

Indian Statistical Institute

E-mail: amandal@isical.ac.in, aparajitak_r@isical.ac.in

November 19, 2019

Principal Component Analysis¶

PCA- A unsupervised feature extraction algorithm that aims to maximize the variance along the extracted features

Why maximize variance?

each=200
Ca<-cbind(rnorm(each,mean=5,sd=1.15),rnorm(200,mean=4,sd=1.05))
Cb<-cbind(rnorm(each,mean=14,sd=1.15),rnorm(200,mean=7,sd=1.05))
X<-rbind(Ca,Cb)
cat(" Glimpse of Dataset X: \n")
print(X[1:5,])
cat("\n Dimension of Dataset: \t Samples:",dim(X)[1],"\t Features:",dim(X)[2])
true=c(rep(1,each),rep(2,each))
colvec = c("coral3","darkseagreen3")[true]
pchs= c(22,24)[true]
plot(X,col="black",bg=colvec,pch=pchs,xlab="Feature1",ylab="Feature2",main="Scatter plot of Data")

 Glimpse of Dataset X: 
         [,1]     [,2]
[1,] 6.889960 2.695371
[2,] 5.772627 4.514670
[3,] 3.712034 4.269017
[4,] 3.249126 3.928945
[5,] 3.244437 1.634670

 Dimension of Dataset: 	 Samples: 400 	 Features: 2

Mean centering doesnot change the structure of dataset¶

X<-scale(X,center=TRUE,scale=FALSE)
plot(X,col="black",bg=colvec,pch=pchs,xlab="Feature1",ylab="Feature2",main="Scatter plot of Mean-Centered Data")

xm=min(X[,1])
ym=min(X[,2])
ProjF1=cbind(X[,1],rep(ym,nrow(X)))
plot(X,col="black",bg=colvec,pch=pchs,xlab="Feature1",ylab="Feature2",main="Projection of Data along Feature1")
points(ProjF1,col="black",bg=colvec,pch=pchs)
text(0,ym+1,"Well-separated",font=2)

ProjF2=cbind(rep(xm,nrow(X)),X[,2])
plot(X,col="black",bg=colvec,pch=pchs,xlab="Feature1",ylab="Feature2",main="Projection of Data along Feature2")
points(ProjF2,col="black",bg=colvec,pch=pchs)
text(xm+1,2,"poor separation",font=2,srt=90)

Variance gives a notion of the structure of the data ¶

PCA Objective Find direction $v_1$ a linear combination of Feature 1 and Feature 2 that maximizes the variance of the projected data

Projected data:
\begin{equation} y = Xv_1 \tag{1} \end{equation}

Say $ v_1 = \begin{bmatrix} v_{11} \\ v_{12} \end{bmatrix} $, the projected data $y=Xv$ implies

\begin{equation} y = v_{11} \times \text{Feature 1} + v_{11} \times \text{Feature 2} \tag{2} \end{equation}

$y$ is a linear combination of Feature 1 and Feature 2.

\begin{equation} \mathrm{maximize} \hspace{3mm} var(y)= \frac{1}{(n-1)} y^T y = v_1^T X^T X v_1 \tag{3} \end{equation}

For mean-centered data $X$:

$\frac{1}{(n-1)} X^T X $ is the Covariance matrix of $X$

\begin{equation} \mathrm{maximize} \hspace{3mm} v_1^T C v_1 \hspace{10mm} \mathrm{subject} \space \mathrm{to} \hspace{3mm} v_1^T v_1 = 1, \tag{4} \end{equation}

where $C= \frac{1}{(n-1)} X^T X$ and $v_1$ is unit vector as only the direction of maximum variance matters.

Solution for $v_1$ in $(4)$ given by the eigenvector of $C$ corresponding largest eigenvalue.

Principal components of $X$ : Project $X$ on the eigenvectors of its covaraince matrix $C$¶

PCA on IRIS dataset

cat("IRIS dataset\n")
head(iris)
X<-iris[,-5]
class=as.numeric(iris$Species)
cat("\n Samples: ",dim(X)[1],"\t Features: ",dim(X)[2],"\t Classes: ",levels(iris$Species))
colvec = c("coral3","darkseagreen3","darkgoldenrod2")[class]
pchs= c(22,23,24)[class]
pairs(X, col=colvec, pch=pchs)

IRIS dataset

 Samples:  150 	 Features:  4 	 Classes:  setosa versicolor virginica

X<-scale(X,center=TRUE,scale=FALSE)
n=nrow(X)
C=(t(X)%*%X)/(n-1)
cat("\n Covariance matrix of mean-centered X:\n\n")
print(C)
cat("\n Dimension of C: ",dim(C)[1],"x", dim(C)[2])

 Covariance matrix of mean-centered X:

             Sepal.Length Sepal.Width Petal.Length Petal.Width
Sepal.Length    0.6856935  -0.0424340    1.2743154   0.5162707
Sepal.Width    -0.0424340   0.1899794   -0.3296564  -0.1216394
Petal.Length    1.2743154  -0.3296564    3.1162779   1.2956094
Petal.Width     0.5162707  -0.1216394    1.2956094   0.5810063

 Dimension of C:  4 x 4

eigC=eigen(C)
cat(" Eigenvalues of C: ",eigC$values)
cat("\n Eigenvectors of C: \n\n")
print(eigC$vectors)
v1=eigC$vectors[,1,drop=FALSE]
cat("\n Eigenvector v1= \n")
cat(" ",v1,sep="\n")

 Eigenvalues of C:  4.228242 0.2426707 0.0782095 0.02383509
 Eigenvectors of C: 

            [,1]        [,2]        [,3]       [,4]
[1,]  0.36138659  0.65658877 -0.58202985  0.3154872
[2,] -0.08452251  0.73016143  0.59791083 -0.3197231
[3,]  0.85667061 -0.17337266  0.07623608 -0.4798390
[4,]  0.35828920 -0.07548102  0.54583143  0.7536574

 Eigenvector v1= 
 
0.3613866
-0.08452251
0.8566706
0.3582892

First principal component \begin{equation} y_1 = Xv_1 \tag{5} \end{equation}

\begin{equation} y1= 0.3613 \times \text{Sepal.Length} -0.0845 \times \text{Sepal.Width} + 0.8566 \times \text{Petal.Length} + 0.35828 \times \text{Petal.Width} \end{equation}

y1=X%*%v1
cat("\n First principal component y1= \n ")
cat(" ",y1[1:8],sep="\n")
cat("\n Largest eigenvalue= ",eigC$values[1])
pc1Mat=cbind(y1,rep(0,n))
plot(pc1Mat,col="black",bg=colvec,pch=pchs,xlab="Principal Component 1",ylab="",ylim=c(-1,1),xlim=c(-4, 4),main="First principal component")
text(0,-0.2,paste("variance= ",var(y1)),font=2)

 First principal component y1= 
  
-2.684126
-2.714142
-2.888991
-2.745343
-2.728717
-2.28086
-2.820538
-2.626145

 Largest eigenvalue=  4.228242

v2=eigC$vectors[,2,drop=FALSE]
y2=X%*%v2
cat("\n Second principal component y2= \n ")
cat(" ",y2[1:8],sep="\n")
cat("\n Second Largest eigenvalue= ",eigC$values[2])

 Second principal component y2= 
  
0.3193972
-0.1770012
-0.1449494
-0.318299
0.3267545
0.7413304
-0.08946138
0.163385

 Second Largest eigenvalue=  0.2426707

pc1Mat=cbind(rep(0,n),y2)
plot(pc1Mat,col="black",bg=colvec,pch=pchs,xlab="",ylab="Principal Component 2",ylim=c(-1,1.5),xlim=c(-1, 1),main="Second principal component")
text(0.2,0.2,paste("variance= ",var(y2)),font=2,srt=90)

PC=cbind(y1,y2)
plot(PC,col="black",bg=colvec,pch=pchs,xlab="Principal Component 1",ylab="Principal Component 2",ylim=c(-1.2,1.3),xlim=c(-3.5,4.5),main="Top Two Principal Components")

Classes are well-separated when the 4 dimensional IRIS data is projected on its principal components!¶

PCA using prcomp() ¶

pc=prcomp(X, center=TRUE, scale=FALSE, retx=TRUE)
cat("\n Directions/Eigenvectors: \n")
print(pc$rotation)
cat("\n Principal Components: \n")
print(pc$x[1:5,])
cat("\n Variance along principal components: ")
cat(pc$sdev^2)

 Directions/Eigenvectors: 
                     PC1         PC2         PC3        PC4
Sepal.Length  0.36138659 -0.65658877  0.58202985  0.3154872
Sepal.Width  -0.08452251 -0.73016143 -0.59791083 -0.3197231
Petal.Length  0.85667061  0.17337266 -0.07623608 -0.4798390
Petal.Width   0.35828920  0.07548102 -0.54583143  0.7536574

 Principal Components: 
           PC1        PC2         PC3          PC4
[1,] -2.684126 -0.3193972  0.02791483  0.002262437
[2,] -2.714142  0.1770012  0.21046427  0.099026550
[3,] -2.888991  0.1449494 -0.01790026  0.019968390
[4,] -2.745343  0.3182990 -0.03155937 -0.075575817
[5,] -2.728717 -0.3267545 -0.09007924 -0.061258593

 Variance along principal components: 4.228242 0.2426707 0.0782095 0.02383509

PCA on Pima Indians Diabetes Database ¶

library(mlbench)

data(PimaIndiansDiabetes)
Dataset<-PimaIndiansDiabetes
cat("\n Predict the onset of diabetes in female Pima Indians from medical record data.")
cat("\n Dimension of dataset: ",dim(Dataset))
cat("\n Classes: ",levels(Dataset$diabetes))
head(Dataset)
class=as.numeric(Dataset$diabetes)

 Predict the onset of diabetes in female Pima Indians from medical record data.
 Dimension of dataset:  768 9
 Classes:  neg pos

X<-Dataset[,-ncol(Dataset)]
X<-as.matrix(as.data.frame(lapply(X, as.numeric)))
colvec = c("cyan3","plum3")[class]
pchs= c(22,24)[class]
pairs(X[,1:4], col=colvec, pch=pchs)

pc=prcomp(X, center=TRUE, scale=FALSE, retx=TRUE)
show=4
cat("\n Directions/Eigenvectors: \n")
print(pc$rotation[1:5,1:show])
cat("\n Principal Components: \n")
print(pc$x[1:5,1:show])
cat("\n Variance along principal components: ")
cat(pc$sdev^2)

 Directions/Eigenvectors: 
                  PC1         PC2        PC3         PC4
pregnant -0.002021766  0.02264889 -0.0224649  0.04904596
glucose   0.097811577  0.97221004  0.1434287 -0.11983002
pressure  0.016093050  0.14190933 -0.9224672  0.26274279
triceps   0.060756686 -0.05786147 -0.3070131 -0.88436938
insulin   0.993110844 -0.09462669  0.0209773  0.06555036

 Principal Components: 
           PC1        PC2         PC3        PC4
[1,] -75.71465  35.950783 -7.26078895 -15.669269
[2,] -82.35827 -28.908213 -5.49667139  -9.004554
[3,] -74.63064  67.906496 19.46180812   5.653056
[4,]  11.07742 -34.898486 -0.05301779  -1.314873
[5,]  89.74379   2.746937 25.21285861 -18.994237

 Variance along principal components: 13456.57 932.7601 390.5778 198.1827 112.6891 45.82944 7.760709 0.102871

PC=pc$x[,1:2]
plot(PC,col="black",bg=colvec,pch=pchs,xlab="Principal Component 1",ylab="Principal Component 2",,main="Top Two Principal Components")

Some machine learning datasets in "mlbench" package

https://machinelearningmastery.com/machine-learning-datasets-in-r/

Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
<dbl>	<dbl>	<dbl>	<dbl>	<fct>
5.1	3.5	1.4	0.2	setosa
4.9	3.0	1.4	0.2	setosa
4.7	3.2	1.3	0.2	setosa
4.6	3.1	1.5	0.2	setosa
5.0	3.6	1.4	0.2	setosa
5.4	3.9	1.7	0.4	setosa