|
DATA MINING
Desktop Survival Guide by Graham Williams |
|
|||
Alternative Clustering |
For model-based clustering see the BIC algorithm in the mclust package. This estimates density with a mixture of Gaussians.
For density-based clustering the following implementation of DBSCAN
may be useful. It follows the notation of the original KDD-96 DBSCAN
paper. For large datasets, it may be slow.
#Christian Hennig
distvector <- function(x,data)
{
ddata <- t(data)-x
dv <- apply(ddata^2,2,sum)
}
# data may be nxp or distance matrix
# eps is the dbscan distance cutoff parameter
# MinPts is the minimum size of a cluster
# scale: Should the data be scaled?
# distances: has to be TRUE if data is a distance matrix
# showplot: Should the computation process be visualized?
# countmode: dbscan gives messages when processing point no. (countmode)
dbscan <- function(data,eps,MinPts=5, scale=FALSE, distances=FALSE,
showplot=FALSE,
countmode=c(1,2,3,5,10,100,1000,5000,10000,50000)){
data <- as.matrix(data)
n <- nrow(data)
if (scale) data <- scale(data)
unregpoints <- rep(0,n)
e2 <- eps^2
cv <- rep(0,n)
cn <- 0
i <- 1
for (i in 1:n){
if (i %in% countmode) cat("Processing point ", i," of ",n, ".\n")
unclass <- cv<1
if (cv[i]==0){
if (distances) seeds <- data[i,]<=eps
else{
seeds <- rep(FALSE,n)
seeds[unclass] <- distvector(data[i,],data[unclass,])<=e2
}
if (sum(seeds)+unregpoints[i]<MinPts) cv[i] <- (-1)
else{
cn <- cn+1
cv[i] <- cn
seeds[i] <- unclass[i] <- FALSE
unregpoints[seeds] <- unregpoints[seeds]+1
while (sum(seeds)>0){
if (showplot) plot(data,col=1+cv)
unclass[seeds] <- FALSE
cv[seeds] <- cn
ap <- (1:n)[seeds]
# print(ap)
seeds <- rep(FALSE,n)
for (j in ap){
# if (showplot) plot(data,col=1+cv)
jseeds <- rep(FALSE,n)
if (distances) jseeds[unclass] <- data[j,unclass]<=eps
else{
jseeds[unclass] <- distvector(data[j,],data[unclass,])<=e2
}
unregpoints[jseeds] <- unregpoints[jseeds]+1
# if (cn==1)
# cat(j," sum seeds=",sum(seeds)," unreg=",unregpoints[j],
# " newseeds=",sum(cv[jseeds]==0),"\n")
if (sum(jseeds)+unregpoints[j]>=MinPts){
seeds[jseeds] <- cv[jseeds]==0
cv[jseeds & cv<0] <- cn
}
} # for j
} # while sum seeds>0
} # else (sum seeds + ... >= MinPts)
} # if cv==0
} # for i
if (sum(cv==(-1))>0){
noisenumber <- cn+1
cv[cv==(-1)] <- noisenumber
}
else
noisenumber <- FALSE
out <- list(classification=cv, noisenumber=noisenumber,
eps=eps, MinPts=MinPts, unregpoints=unregpoints)
out
} # dbscan
# classification: classification vector
# noisenumber: number in the classification vector indicating noise points
# unregpoints: ignore...
|