Two classes:
\begin{align} \text{Class} \space \space \boldsymbol{\omega_1} \space \space \text{with mean} \space \space \mu_1 \space \space \text{variance} \space \space \sigma_1^2 \\ \text{Class} \space \space \boldsymbol{\omega_2} \space \space \text{with mean} \space \space \mu_2 \space \space \text{variance} \space \space \sigma_2^2 \end{align}Assumption: $ \sigma_1^2 = \sigma_2^2 = \sigma^2 $
Hypothesis design for feature selection in classification problem:
\begin{align} & H_1 \mbox{ (Difference between class means is non-zero): }& \mu_1 - \mu_2 \neq 0 \\ & H_0 \mbox{ (Difference between class means is zero): }& \mu_1 - \mu_2 = 0 \\ \end{align}Feature with higher difference between class means has greater class separability, hence better feature.
# Synthetic two-class data set: N samples per class, two Gaussian features.
# Class 1 rows are stacked above class 2 rows, so rows 1..N belong to class 1.
N <- 200
w1 <- cbind(
  rnorm(N, mean = 5, sd = 1),
  rnorm(N, mean = 4, sd = 1)
)
w2 <- cbind(
  rnorm(N, mean = 15, sd = 1),
  rnorm(N, mean = 6, sd = 1)
)
X <- rbind(w1, w2)

cat(" Glimpse of Dataset X: \n")
print(X[seq_len(5), ])
cat("\n Dimension of Dataset: \t Samples:", nrow(X), "\t Features:", ncol(X))

# Ground-truth class label of every row; used to pick fill colour and marker.
true <- rep(c(1, 2), each = N)
colvec <- c("coral3", "darkseagreen3")[true]
pchs <- c(22, 24)[true]
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2", main = "Scatter plot of Data"
)
Test statistic: \begin{equation} q= \frac{(\bar{x} - \bar{y}) - (\mu_1 - \mu_2)}{s \sqrt{\frac{2}{N}}} \end{equation} where \begin{align} & \bar{x}= \frac{1}{N} \sum\limits_{i=1}^{N} x_i, \hspace{1cm} x_i \mbox{ are the sample values of the feature in class } \boldsymbol{\omega_1} \\ & \bar{y}= \frac{1}{N} \sum\limits_{i=1}^{N} y_i, \hspace{1cm} y_i \mbox{ are the sample values of the feature in class } \boldsymbol{\omega_2} \\ & s^2= \frac{1}{2N-2} \left( \sum\limits_{i=1}^{N} (x_i - \bar{x})^2 + \sum\limits_{i=1}^{N} (y_i - \bar{y})^2\right) \hspace{1cm} \mbox{pooled (class-wise) sample variance.} \end{align}
According to null-hypothesis $H_0$, $\mu_1 - \mu_2=0$
$q$ follows a $t$-distribution with $2N-2$ degrees of freedom.
# Two-sample t statistic for Feature 1 (column 1 of X) under the equal-variance
# assumption. Rows 1..N of X are class 1 and rows N+1..2N are class 2.
f <- X[, 1]
fw1 <- f[seq_len(N)]
fw2 <- f[(N + 1):(2 * N)]
xbar <- mean(fw1)
ybar <- mean(fw2)
# Pooled standard deviation: BOTH sums of squares are divided by 2N-2.
# (Without the outer parentheses only the second sum was divided, which
# inflated s and shrank the test statistic.)
s <- sqrt((sum((fw1 - xbar)^2) + sum((fw2 - ybar)^2)) / (2 * N - 2))
# Under H0 the true mean difference is zero, so it drops out of the numerator.
q <- (xbar - ybar) / (s * sqrt(2 / N))
cat("\n xbar=", xbar, " ybar=", ybar)
cat("\n\n Test statistic q= ", q)
cat("\n q follows t- distribution with ", 2 * N - 2, " degrees of freedom")
Compute $p$ value
If p-value < 0.05: Feature has a significant difference between the class means
If p-value $\geq$ 0.05: Difference between class means along the feature is not significant
# Two-sided p-value for the test statistic q with 2N-2 degrees of freedom.
# The alternative hypothesis is mu1 - mu2 != 0, so both tails count:
# pt(q, df) alone is only the lower-tail probability and would report p ~ 1
# for a large positive q.
p.val <- 2 * pt(-abs(q), df = 2 * N - 2)
cat("\n p-value for hypothesis test on Feature 1= ", p.val)
# Significance level used for the relevance decision.
alpha <- 0.05
if (p.val < alpha) {
  cat("\n Feature is relevant")
} else {
  cat("\n Feature is not relevant, lacks class separability")
}
# Two-sample t-test for Feature 2 (column 2 of X) under the equal-variance
# assumption. Rows 1..N of X are class 1 and rows N+1..2N are class 2.
f <- X[, 2]
fw1 <- f[seq_len(N)]
fw2 <- f[(N + 1):(2 * N)]
xbar <- mean(fw1)
ybar <- mean(fw2)
# Pooled standard deviation: the SUM of both sums of squares is divided by
# 2N-2 (the outer parentheses were missing, so only the second sum was scaled).
s <- sqrt((sum((fw1 - xbar)^2) + sum((fw2 - ybar)^2)) / (2 * N - 2))
q <- (xbar - ybar) / (s * sqrt(2 / N))
cat("\n xbar=", xbar, " ybar=", ybar)
cat("\n\n Test statistic q= ", q)
# Two-sided p-value, because the alternative hypothesis is mu1 - mu2 != 0;
# a lower-tail-only probability would miss a large positive q.
p.val <- 2 * pt(-abs(q), df = 2 * N - 2)
cat("\n p-value for hypothesis test on Feature 2= ", p.val)
# alpha is the significance level set earlier in the script.
if (p.val < alpha) {
  cat("\n Feature is relevant")
} else {
  cat("\n Feature is not relevant, lacks class separability")
}
# Synthetic two-class data set with four Gaussian features, N samples per class.
# Class 1 rows are stacked above class 2 rows.
N <- 200
w1 <- cbind(
  rnorm(N, mean = 4, sd = 1),
  rnorm(N, mean = 7, sd = 1),
  rnorm(N, mean = 10, sd = 1),
  rnorm(N, mean = -3.5, sd = 1)
)
w2 <- cbind(
  rnorm(N, mean = 7, sd = 1),
  rnorm(N, mean = 16, sd = 1),
  rnorm(N, mean = 12, sd = 1),
  rnorm(N, mean = 2, sd = 1)
)
X <- rbind(w1, w2)
colnames(X) <- paste0("Feature", 1:4)

cat(" Glimpse of Dataset X: \n")
print(X[seq_len(5), ])
cat("\n Dimension of Dataset: \t Samples:", nrow(X), "\t Features:", ncol(X))

# Ground-truth class label of every row; drives colour and marker choice.
true <- rep(c(1, 2), each = N)
colvec <- c("deepskyblue3", "orange2")[true]
pchs <- c(22, 24)[true]
pairs(X, col = colvec, pch = pchs)
# Rank every feature (column of X) by the magnitude of its two-sample
# t statistic. Rows 1..N of X are class 1 and rows N+1..2N are class 2.
d <- ncol(X)
q.val <- rep(0, d)
for (i in seq_len(d)) {
  f <- X[, i]
  fw1 <- f[seq_len(N)]
  fw2 <- f[(N + 1):(2 * N)]
  xbar <- mean(fw1)
  ybar <- mean(fw2)
  # Pooled standard deviation: the SUM of both sums of squares is divided by
  # 2N-2 (the outer parentheses were missing, so only the second sum was
  # scaled and s came out far too large).
  s <- sqrt((sum((fw1 - xbar)^2) + sum((fw2 - ybar)^2)) / (2 * N - 2))
  q <- (xbar - ybar) / (s * sqrt(2 / N))
  # Two-sided p-value: the alternative hypothesis is mu1 - mu2 != 0, so the
  # sign of q must not matter.
  p.val <- 2 * pt(-abs(q), df = 2 * N - 2)
  cat("\n\n Test statistic q for Feature ", i, " = ", q, "p-value=", p.val)
  q.val[i] <- q
}
# Largest |q| first: a bigger standardized mean difference ranks higher.
ord <- sort(abs(q.val), decreasing = TRUE, index.return = TRUE)$ix
cat("\n\n Ordering of features based on t-Test=", ord)
# Load the built-in iris data: the four numeric measurements become X and the
# species factor is turned into numeric class codes.
cat("IRIS dataset\n")
print(iris[seq_len(5), ])
X <- iris[, -5]
class <- as.numeric(iris$Species)
cat(
  "\n Samples: ", nrow(X),
  "\t Features: ", ncol(X),
  "\t Classes: 3:- ", toString(levels(iris$Species))
)
# One colour and one marker shape per class code.
colvec <- c("coral3", "darkseagreen3", "darkgoldenrod2")[class]
pchs <- c(22, 23, 24)[class]
pairs(X, col = colvec, pch = pchs)
# Fisher's discriminant ratio (FDR) per feature: for every ordered pair of
# distinct classes (i, j), add the squared difference of the class means
# divided by the sum of the class variances. Features are then ranked by FDR.
nclass <- length(unique(class))
FDR <- rep(0, ncol(X))
for (d in 1:ncol(X)) {
  f <- X[, d]
  FDRd <- 0
  for (i in 1:nclass) {
    fi <- f[which(class == i)]
    for (j in 1:nclass) {
      if (i == j) {
        next
      }
      fj <- f[which(class == j)]
      mean_gap <- mean(fi) - mean(fj)
      FDRd <- FDRd + (mean_gap^2) / (var(fi) + var(fj))
    }
  }
  cat("\n FDR for Feature ", d, " = ", FDRd)
  FDR[d] <- FDRd
}
# Highest FDR first.
ranked <- sort(abs(FDR), decreasing = TRUE, index.return = TRUE)
ord <- ranked$ix
cat(
  "\n\n Ordering of features based on FDR: \n",
  paste0(colnames(X)[ord], collapse = ", ")
)
# Synthetic three-class data set: N samples per class, two Gaussian features
# with class-specific means and standard deviations. Classes are stacked in
# order, so rows 1..N are class 1, the next N class 2, the last N class 3.
N <- 250
w1 <- cbind(
  rnorm(N, mean = 5, sd = 1.5),
  rnorm(N, mean = 4, sd = 1.2)
)
w2 <- cbind(
  rnorm(N, mean = 9, sd = 1.3),
  rnorm(N, mean = 8, sd = 1.8)
)
w3 <- cbind(
  rnorm(N, mean = 15, sd = 1),
  rnorm(N, mean = 1, sd = 1)
)
X <- rbind(w1, w2, w3)

cat(" Glimpse of Dataset X: \n")
print(X[seq_len(5), ])
cat("\n Dimension of Dataset: \t Samples:", nrow(X), "\t Features:", ncol(X))

# Ground-truth class label of every row; drives fill colour and marker.
true <- rep(c(1, 2, 3), each = N)
colvec <- c("lightpink2", "turquoise2", "darkolivegreen")[true]
pchs <- c(22, 24, 21)[true]
plot(
  X,
  col = "black", bg = colvec, pch = pchs,
  xlab = "Feature1", ylab = "Feature2", main = "Scatter plot of Data"
)
$M$ classes: $\space \space \boldsymbol{\omega_1}, \space \boldsymbol{\omega_2}, \space ...\space, \space \boldsymbol{\omega_M}$
Class-wise prior probability: \begin{equation} P(\boldsymbol{\omega_i}) \simeq \frac{N_i}{N} \end{equation} $\space N_i$ number of samples in class $\boldsymbol{\omega_i}$ out of a total of $N$ samples.
Class specific covariance matrix: \begin{equation} \Sigma_i=E[(x-\mu_i)(x-\mu_i)^T] \end{equation}
Global mean vector: \begin{equation} \mu_0= \sum \limits_{i=1}^M P(\boldsymbol{\omega_i}) \mu_i \end{equation}
# Per-class prior, mean vector and covariance matrix of the three-class
# synthetic data set X, plus the global mean vector mu0.
#
# The class labels are taken from `true`, the label vector built together with
# X. The previously used `class` vector still held the 150 iris labels, so it
# selected only rows 1..150 of X (all from the first class) and produced priors
# of 50/750 that did not sum to one.
labels <- true
N <- nrow(X)
nclass <- length(unique(labels))
mu0 <- colMeans(X)
S <- list()
Mu <- list()
P <- list()
for (i in seq_len(nclass)) {
  # drop = FALSE keeps wi a matrix even if a class has a single sample.
  wi <- X[labels == i, , drop = FALSE]
  Ni <- nrow(wi)
  P[[i]] <- Ni / N
  Mu[[i]] <- colMeans(wi)
  S[[i]] <- cov(wi)
  cat("\n\n Class ", i, ": ")
  cat("\n Prior P: ", P[[i]])
  cat("\n Mean Mu: ", Mu[[i]])
  cat("\n Covariance Matrix: \n")
  print(S[[i]])
}
# Scatter matrices built from the per-class priors P, means Mu and covariances
# S:
#   Sw - within-class scatter:  prior-weighted sum of class covariances
#   Sb - between-class scatter: prior-weighted sum of outer products of the
#        class-mean offsets from the global mean mu0
#   Sm - mixture scatter:       Sw + Sb
Sb <- 0
Sw <- 0
for (i in 1:nclass) {
  prior_i <- P[[i]]
  offset_i <- Mu[[i]] - mu0
  Sb <- Sb + prior_i * (offset_i %o% offset_i)
  Sw <- Sw + prior_i * S[[i]]
}
Sm <- Sw + Sb

cat("\n\n Within-class scatter matrix: \n")
print(Sw)
cat("\n\n Between-class scatter matrix: \n")
print(Sb)
cat("\n\n Mixture scatter matrix: \n")
print(Sm)
$\lvert A \rvert$ denotes determinant of matrix $A$.
# Class-separability criteria computed from the scatter matrices:
#   J1 = trace(Sm) / trace(Sw)
#   J2 = determinant of Sw^{-1} Sm
#   J3 = trace of Sw^{-1} Sm
trace_of <- function(m) sum(diag(m))
SwInvSm <- solve(Sw) %*% Sm
J1 <- trace_of(Sm) / trace_of(Sw)
J2 <- det(SwInvSm)
J3 <- trace_of(SwInvSm)
cat("\n J1= ", J1, "\n J2= ", J2, "\n J3= ", J3)