PCA - An unsupervised feature extraction algorithm that aims to maximize the variance of the data along each extracted feature (principal component)
Why maximize variance?
# Simulate two Gaussian clusters in 2-D with `each` samples per cluster.
# Cluster A is centred at (5, 4) and cluster B at (14, 7), so the cluster
# means are much further apart along Feature 1 than along Feature 2.
each <- 200

# Both columns draw `each` values. A hard-coded sample count in one column
# would make cbind() silently recycle the shorter column if `each` changed.
Ca <- cbind(rnorm(each, mean = 5, sd = 1.15), rnorm(each, mean = 4, sd = 1.05))
Cb <- cbind(rnorm(each, mean = 14, sd = 1.15), rnorm(each, mean = 7, sd = 1.05))
X <- rbind(Ca, Cb)

cat(" Glimpse of Dataset X: \n")
print(X[1:5, ])
cat("\n Dimension of Dataset: \t Samples:", dim(X)[1], "\t Features:", dim(X)[2])

# Cluster label of every row (1 = cluster A, 2 = cluster B), used to pick a
# fill colour and a plotting symbol per point.
true <- c(rep(1, each), rep(2, each))
colvec <- c("coral3", "darkseagreen3")[true]
pchs <- c(22, 24)[true]

plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Scatter plot of Data"
)
# Shift every feature to zero mean (no rescaling) and redraw the scatter plot.
X <- scale(X, center = TRUE, scale = FALSE)
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Scatter plot of Mean-Centered Data"
)

# Smallest value of each feature: the projected points are drawn along the
# bottom / left edge of the point cloud.
xm <- min(X[, 1])
ym <- min(X[, 2])

# Projection onto Feature 1: keep the first coordinate, pin the second to ym.
ProjF1 <- cbind(X[, 1], rep(ym, times = nrow(X)))
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Projection of Data along Feature1"
)
points(ProjF1, col = "black", bg = colvec, pch = pchs)
text(x = 0, y = ym + 1, labels = "Well-separated", font = 2)

# Projection onto Feature 2: pin the first coordinate to xm, keep the second.
ProjF2 <- cbind(rep(xm, times = nrow(X)), X[, 2])
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2",
  main = "Projection of Data along Feature2"
)
points(ProjF2, col = "black", bg = colvec, pch = pchs)
text(x = xm + 1, y = 2, labels = "poor separation", font = 2, srt = 90)
PCA Objective: Find a direction $v_1$, defining a linear combination of Feature 1 and Feature 2, that maximizes the variance of the projected data
Projected data:
\begin{equation}
y = Xv_1
\tag{1}
\end{equation}
Say $v_1 = \begin{bmatrix} v_{11} \\ v_{12} \end{bmatrix}$; then the projected data $y = Xv_1$ implies
\begin{equation} y = v_{11} \times \text{Feature 1} + v_{12} \times \text{Feature 2} \tag{2} \end{equation}
That is, $y$ is a linear combination of Feature 1 and Feature 2.
\begin{equation} \mathrm{maximize} \hspace{3mm} var(y) = \frac{1}{n-1} y^T y = \frac{1}{n-1} v_1^T X^T X v_1 \tag{3} \end{equation}
For mean-centered data $X$:
$\frac{1}{n-1} X^T X$ is the covariance matrix of $X$.
\begin{equation} \mathrm{maximize} \hspace{3mm} v_1^T C v_1 \hspace{10mm} \text{subject to} \hspace{3mm} v_1^T v_1 = 1, \tag{4} \end{equation}
where $C = \frac{1}{n-1} X^T X$ and $v_1$ is constrained to be a unit vector, since only the direction of maximum variance matters.
The solution for $v_1$ in $(4)$ is given by the eigenvector of $C$ corresponding to its largest eigenvalue; the variance attained along $v_1$ equals that eigenvalue.
# ---- Iris: covariance matrix and its eigen-decomposition --------------------
cat("IRIS dataset\n")
head(iris)

# Keep the four numeric measurements; column 5 (Species) is the class label.
X <- iris[, -5]
class <- as.numeric(iris$Species)
cat(
  "\n Samples: ", nrow(X),
  "\t Features: ", ncol(X),
  "\t Classes: ", levels(iris$Species)
)

# One fill colour and one plotting symbol per species.
colvec <- c("coral3", "darkseagreen3", "darkgoldenrod2")[class]
pchs <- c(22, 23, 24)[class]
pairs(X, col = colvec, pch = pchs)

# Mean-centre the features so that t(X) %*% X / (n - 1) is the covariance.
X <- scale(X, center = TRUE, scale = FALSE)
n <- nrow(X)
C <- (t(X) %*% X) / (n - 1)
cat("\n Covariance matrix of mean-centered X:\n\n")
print(C)
cat("\n Dimension of C: ", nrow(C), "x", ncol(C))

# eigen() returns the eigenvalues in decreasing order, so column 1 of
# $vectors is the direction of maximum variance.
eigC <- eigen(C)
cat(" Eigenvalues of C: ", eigC$values)
cat("\n Eigenvectors of C: \n\n")
print(eigC$vectors)

# drop = FALSE keeps v1 as a 4 x 1 column matrix for the later X %*% v1.
v1 <- eigC$vectors[, 1, drop = FALSE]
cat("\n Eigenvector v1= \n")
cat(" ", v1, sep = "\n")
First principal component: \begin{equation} y_1 = Xv_1 \tag{5} \end{equation}
\begin{equation} y_1 = 0.3613 \times \text{Sepal.Length} - 0.0845 \times \text{Sepal.Width} + 0.8566 \times \text{Petal.Length} + 0.3583 \times \text{Petal.Width} \end{equation}
y1=X%*%v1
# ---- First principal component ----------------------------------------------
# y1 holds the scores of the mean-centred data along v1 (y1 = X %*% v1).
cat("\n First principal component y1= \n ")
cat(" ", y1[1:8], sep = "\n")
cat("\n Largest eigenvalue= ", eigC$values[1])

# Draw the scores on a horizontal line: second coordinate fixed at 0.
pc1Mat <- cbind(y1, rep(0, n))
# Axis limits along a component are derived from the scores themselves. The
# sign of an eigenvector is arbitrary, so fixed asymmetric limits can clip
# points when the sign (or the data) changes.
plot(
  pc1Mat,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Principal Component 1", ylab = "",
  ylim = c(-1, 1), xlim = extendrange(y1),
  main = "First principal component"
)
# The sample variance of the scores should match the largest eigenvalue.
text(0, -0.2, paste("variance= ", var(y1)), font = 2)

# ---- Second principal component ---------------------------------------------
v2 <- eigC$vectors[, 2, drop = FALSE]
y2 <- X %*% v2
cat("\n Second principal component y2= \n ")
cat(" ", y2[1:8], sep = "\n")
cat("\n Second Largest eigenvalue= ", eigC$values[2])

# Draw the scores on a vertical line: first coordinate fixed at 0.
pc2Mat <- cbind(rep(0, n), y2)
plot(
  pc2Mat,
  col = "black", bg = colvec, pch = pchs,
  xlab = "", ylab = "Principal Component 2",
  ylim = extendrange(y2), xlim = c(-1, 1),
  main = "Second principal component"
)
text(0.2, 0.2, paste("variance= ", var(y2)), font = 2, srt = 90)

# ---- Both components together -----------------------------------------------
PC <- cbind(y1, y2)
plot(
  PC,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Principal Component 1", ylab = "Principal Component 2",
  ylim = extendrange(y2), xlim = extendrange(y1),
  main = "Top Two Principal Components"
)
# Same analysis with the built-in prcomp(), for comparison with the manual
# eigen-decomposition. The argument is spelled `scale.` in full: `scale = ...`
# only works through partial argument matching.
pc <- prcomp(X, center = TRUE, scale. = FALSE, retx = TRUE)

# $rotation holds the directions (one column per component).
cat("\n Directions/Eigenvectors: \n")
print(pc$rotation)

# $x holds the projected data (scores); show the first five samples.
cat("\n Principal Components: \n")
print(pc$x[1:5, ])

# $sdev are standard deviations along the components; square for variances.
cat("\n Variance along principal components: ")
cat(pc$sdev^2)
# ---- PCA on the Pima Indians Diabetes data (package "mlbench") --------------
library(mlbench)
data(PimaIndiansDiabetes)
Dataset <- PimaIndiansDiabetes
cat("\n Predict the onset of diabetes in female Pima Indians from medical record data.")
cat("\n Dimension of dataset: ", dim(Dataset))
cat("\n Classes: ", levels(Dataset$diabetes))
head(Dataset)

# The last column (diabetes) is the class label; the rest are the features.
class <- as.numeric(Dataset$diabetes)
X <- Dataset[, -ncol(Dataset)]
# Coerce every feature column to numeric and assemble a numeric matrix.
X <- as.matrix(as.data.frame(lapply(X, as.numeric)))

colvec <- c("cyan3", "plum3")[class]
pchs <- c(22, 24)[class]
# Pairwise scatter plots of the first four features only.
pairs(X[, 1:4], col = colvec, pch = pchs)

# `scale.` is spelled in full: `scale = ...` only works through partial
# argument matching.
# NOTE(review): the features are centred but not scaled; if their units differ
# widely, the leading components will be dominated by the highest-variance
# features. Confirm that this is intended.
pc <- prcomp(X, center = TRUE, scale. = FALSE, retx = TRUE)

# Number of leading components to print.
show <- 4
cat("\n Directions/Eigenvectors: \n")
print(pc$rotation[1:5, 1:show])
cat("\n Principal Components: \n")
print(pc$x[1:5, 1:show])
cat("\n Variance along principal components: ")
cat(pc$sdev^2)

# Scatter plot of the data in the plane of the first two components.
# (A stray empty argument ",," in this call has been removed.)
PC <- pc$x[, 1:2]
plot(
  PC,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Principal Component 1", ylab = "Principal Component 2",
  main = "Top Two Principal Components"
)
Some machine learning datasets in "mlbench" package
https://machinelearningmastery.com/machine-learning-datasets-in-r/