Does variance maximization always work?
N=250
Ca<-cbind(rnorm(N,mean=5,sd=6.85),rnorm(N,mean=8,sd=0.3))
Cb<-cbind(rnorm(N,mean=11,sd=7.5),rnorm(N,mean=2,sd=0.6))
X<-rbind(Ca,Cb)
Theta=0.26
R=matrix(c(cos(Theta),-sin(Theta),sin(Theta),cos(Theta)),2,2)
X=X%*%R
cat(" Glimpse of Dataset X: \n")
print(X[1:5,])
cat("\n Dimension of Dataset: \t Samples:",dim(X)[1],"\t Features:",dim(X)[2])
class=c(rep(1,N),rep(2,N))
colvec = c("darkseagreen3","coral3")[class]
pchs= c(24,22)[class]
plot(X,col="black",bg=colvec,pch=pchs,xlab="Feature1",ylab="Feature2",main="Scatter plot of Data")
pc=prcomp(X, center=TRUE, scale=FALSE, retx=TRUE)
cat("\n Directions/Eigenvectors: \n")
print(pc$rotation)
cat("\n Principal Components: \n")
print(pc$x[1:5,])
cat("\n Variance along principal components: ")
cat(pc$sdev^2)
xm=min(X[,1])
ym=min(X[,2])
ProjF1=cbind(pc$x[,1],rep(ym,nrow(X)))
plot(X,col="black",bg=colvec,pch=pchs,xlab="Feature1",ylab="Feature2",main="Projection of Data along Feature1")
points(ProjF1,col="black",bg=colvec,pch=pchs,xlab="",ylab="",)
Project on a lower dimension which is a linear combination of original features as well as contains class information.
LDA Objective
Find direction $v$ a linear combination of original $d$ features that maximizes class separability of the projected data
Projected of sample $ x \in \mathbb{R}^d$:
\begin{equation}
y = v^T x
\tag{1}
\end{equation}
Let \begin{align} \tilde{\mu_1} \mbox{ and } \tilde{\sigma_1}^2 \mbox{ be mean and variance of projected data for the points in class } \boldsymbol{\omega_1},\\ \tilde{\mu_2} \mbox{ and } \tilde{\sigma_2}^2 \mbox{ be mean and variance of projected data for the points in class } \boldsymbol{\omega_2}. \end{align}
Maximize Fisher Discriminant Ratio (FDR) on $y$ \begin{equation} \mathrm{maximize} \hspace{3mm} FDR = \frac{ (\tilde{\mu_1} -\tilde{\mu_2})^2}{\tilde{\sigma_1}^2+\tilde{\sigma_2}^2} \tag{2} \end{equation}
$\tilde{\mu_1} = v^T \mu_1$
$\tilde{\mu_2} = v^T \mu_2$
$\tilde{\sigma_1}^2 = \sum_{y \in \boldsymbol{\omega_1}} (y -\tilde{\mu_1})^2 =\sum_{X \in \boldsymbol{\omega_1}} (v^T x - v^T \mu_1)^2 =\sum_{X \in \boldsymbol{\omega_1}} v^T(x - \mu_1)(x - \mu_1)^T v = v^T \Sigma_1 v$
$\tilde{\sigma_2}^2 = v^T \Sigma_2 v$
\begin{equation} (\tilde{\mu_1} - \tilde{\mu_2})^2= (v^T \mu_1 - v^T \mu_2)^2 = v^T(\mu_1 - \mu_2)(\mu_1 - \mu_2)^T v= v^T S_b v \end{equation}\begin{equation} \tilde{\sigma_1}^2+ \tilde{\sigma_2}^2= v^T \Sigma_1 v + v^T \Sigma_2 v = v^T S_w v \end{equation}LDA Objective \begin{equation} \mathrm{maximize} \hspace{3mm} \frac{ (\tilde{\mu_1} -\tilde{\mu_2})^2}{\tilde{\sigma_1}^2+\tilde{\sigma_2}^2} = \frac{v^T S_b v}{v^T S_w v} \simeq v^T S_w^{-1} S_b v \tag{3} \end{equation}
Solution for $v$ in $(3)$ given by the eigenvector of $J= S_w^{-1} S_b$ corresponding largest eigenvalue.
N=nrow(X)
nclass=length(unique(class))
mu0<-colMeans(X)
S<-list()
Mu<-list()
Sw=0
for(i in 1:nclass)
{
wi=X[which(class==i),]
Ni=nrow(wi)
Mu[[i]]<-colMeans(wi)
S[[i]]<-cov(wi)
cat("\n\n Class ",i,": ")
cat("\n Mean Mu: ",Mu[[i]])
cat("\n Covariance Matrix: \n")
print(S[[i]])
Sw=Sw+S[[i]]
}
MuDiff=Mu[[1]]-Mu[[2]]
Sb=outer(MuDiff,MuDiff)
cat("\n Within cluster scatter Sw: \n")
print(Sw)
cat("\n Between cluster scatter Sb: \n")
print(Sb)
J=solve(Sw)%*%Sb
eigJ=eigen(J)
cat(" Eigenvalues of J: ",eigJ$values)
cat("\n Eigenvectors of J: \n\n")
print(eigJ$vectors)
v=eigJ$vectors[,1,drop=FALSE]
cat("\n LDA direction v= \n")
print(v)
Eigenvalues of $J$ \begin{equation} \mathrm{maximize} \hspace{3mm} v^T J v \tag{3} \end{equation}
$ J = S_w^{-1} S_b \space $ where $ \space S_b = (\mu_1 - \mu_2)(\mu_1 - \mu_2)^T $
Rank of $S_b$ is 1 (outer product of two vectors).
Rank of $J$ is also atmost 1, so atmost 1 eigenvalue of $J$ will be non-zero.
$\lambda_2(J)=0$
Between-class scatter matrix \begin{equation} S_b=\sum \limits_{i=1}^M P(\boldsymbol{\omega_i}) (\mu_i-\mu_0)(\mu_i-\mu_0)^T \tag{4} \end{equation} $S_b $ is the sum of $M$ matrices of rank $\leq 1$.
Additional constraint on mean $\mu_i$: \begin{equation} \mu_0= \frac{1}{M} \sum_{i=1}^M \mu_i \end{equation}
Rank of $S_b $ is atmost $(M-1)$, atmost $(M-1)$ eigenvalues of $J$ will be non-zero.
First LDA component \begin{equation} y = Xv \tag{4} \end{equation}
y=X%*%v
cat("\n LDA projection y= \n ")
cat(y[1:8],sep="\n")
pc1Mat=cbind(y,rep(0,nrow(X)))
plot(pc1Mat,col="black",bg=colvec,pch=pchs,ylab="",xlab="",main="LDA projection")
cat("IRIS dataset\n")
head(iris)
X<-iris[,-5]
class=as.numeric(iris$Species)
cat("\n Samples: ",dim(X)[1],"\t Features: ",dim(X)[2],"\t Classes: ",levels(iris$Species))
colvec = c("coral3","darkseagreen3","darkgoldenrod2")[class]
pchs= c(22,23,24)[class]
pairs(X, col=colvec, pch=pchs)
N=nrow(X)
nclass=length(unique(class))
mu0<-colMeans(X)
cat("\n Global mean: ", mu0)
S<-list()
Mu<-list()
P<-list()
for(i in 1:nclass)
{
wi=X[which(class==i),]
Ni=nrow(wi)
P[[i]]<-Ni/N
Mu[[i]]<-colMeans(wi)
S[[i]]<-cov(wi)
cat("\n\n Class ",i,": ")
cat("\n Prior P: ",P[[i]])
cat("\n Mean Mu: ",Mu[[i]])
cat("\n Covariance Matrix: \n")
print(S[[i]])
}
Sw=0
Sb=0
for(i in 1:nclass)
{
mui0=Mu[[i]]-mu0
Sw=Sw+ P[[i]]*S[[i]]
Sb=Sb+ P[[i]]*outer(mui0,mui0)
}
J=solve(Sw)%*%Sb
eigJ=eigen(J)
cat(" Eigenvalues of J: ",eigJ$values)
cat("\n Eigenvectors of J: \n\n")
print(eigJ$vectors)
v=eigJ$vectors[,1:2,drop=FALSE]
cat("\n dim X= ",dim(X)," dim v= ",dim(v))
y=as.matrix(X)%*%v
plot(y,col="black",bg=colvec,pch=pchs,xlab="LDA component 1",ylab="LDA component 2",main="LDA projection")
library(mlbench)
data(Ionosphere)
Dataset<-Ionosphere
cat("\n Predict the onset of diabetes in female Pima Indians from medical record data.")
cat("\n Dimension of dataset: ",dim(Dataset))
cat("\n Classes: ",levels(Dataset$Class))
head(Dataset)
class=as.numeric(Dataset$Class)
X<-Dataset[,-c(1,2,ncol(Dataset))]
X<-as.matrix(as.data.frame(lapply(X, as.numeric)))
colvec = c("cyan3","plum3")[class]
pchs= c(22,24)[class]
pairs(X[,1:4], col=colvec, pch=pchs)
show=5
N=nrow(X)
nclass=length(unique(class))
mu0<-colMeans(X)
cat("\n Global mean: ", mu0[1:show])
S<-list()
Mu<-list()
P<-list()
for(i in 1:nclass)
{
wi=X[which(class==i),]
Ni=nrow(wi)
P[[i]]<-Ni/N
Mu[[i]]<-colMeans(wi)
S[[i]]<-cov(wi)
cat("\n\n Class ",i,": ")
cat("\n Prior P: ",P[[i]])
cat("\n Mean Mu: ",Mu[[i]][1:show])
cat("\n Covariance Matrix: \n")
print(S[[i]][1:show,1:show])
}
Sw=0
Sb=0
for(i in 1:nclass)
{
mui0=Mu[[i]]-mu0
Sw=Sw+ P[[i]]*S[[i]]
Sb=Sb+ P[[i]]*outer(mui0,mui0)
}
J=solve(Sw)%*%Sb
eigJ=eigen(J)
cat(" Eigenvalues of J: \n",eigJ$values[1:show])
cat("\n Eigenvectors of J: \n\n")
print(eigJ$vectors[1:show,1:show])
v=eigJ$vectors[,1:2,drop=FALSE]
cat("\n dim X= ",dim(X)," dim v= ",dim(v))
y=as.matrix(X)%*%v
plot(y,col="black",bg=colvec,pch=pchs,xlab="LDA component 1",ylab="LDA component 2",main="LDA projection")
J=Sb
eigJ=eigen(J)
cat(" Eigenvalues of J: \n",eigJ$values[1:show])
cat("\n Eigenvectors of J: \n\n")
print(eigJ$vectors[1:show,1:show])
v=eigJ$vectors[,1:2,drop=FALSE]
cat("\n dim X= ",dim(X)," dim v= ",dim(v))
y=as.matrix(X)%*%v
plot(y,col="black",bg=colvec,pch=pchs,xlab="LDA component 1",ylab="LDA component 2",main="LDA projection")