Togaware DATA MINING
Desktop Survival Guide
by Graham Williams
Google

Alternative Clustering

For model-based clustering see the BIC algorithm in the mclust package. This estimates density with a mixture of Gaussians.

For density-based clustering the following implementation of DBSCAN may be useful. It follows the notation of the original KDD-96 DBSCAN paper. For large datasets, it may be slow.

#Christian Hennig
distvector <- function(x,data)
{
  ddata <- t(data)-x
  dv <- apply(ddata^2,2,sum)
}
# data may be nxp or distance matrix
# eps is the dbscan distance cutoff parameter
# MinPts is the minimum size of a cluster
# scale: Should the data be scaled? 
# distances: has to be TRUE if data is a distance matrix
# showplot: Should the computation process be visualized? 
# countmode: dbscan gives messages when processing point no. (countmode)
dbscan <- function(data,eps,MinPts=5, scale=FALSE, distances=FALSE,
                   showplot=FALSE,
                   countmode=c(1,2,3,5,10,100,1000,5000,10000,50000)){
  data <- as.matrix(data)
  n <- nrow(data)
  if (scale) data <- scale(data)
  unregpoints <- rep(0,n)
  e2 <- eps^2
  cv <- rep(0,n)
  cn <- 0
  i <- 1
  for (i in 1:n){
    if (i %in% countmode) cat("Processing point ", i," of ",n, ".\n")
    unclass <- cv<1
    if (cv[i]==0){
      if (distances) seeds <- data[i,]<=eps
      else{
        seeds <- rep(FALSE,n)
        seeds[unclass] <- distvector(data[i,],data[unclass,])<=e2
      }
      if (sum(seeds)+unregpoints[i]<MinPts) cv[i] <- (-1)
      else{
        cn <- cn+1
        cv[i] <- cn
        seeds[i] <- unclass[i] <- FALSE
        unregpoints[seeds] <- unregpoints[seeds]+1
        while (sum(seeds)>0){
          if (showplot) plot(data,col=1+cv)
          unclass[seeds] <- FALSE
          cv[seeds] <- cn
          ap <- (1:n)[seeds]
#          print(ap)
          seeds <- rep(FALSE,n)
          for (j in ap){
#            if (showplot) plot(data,col=1+cv)
            jseeds <- rep(FALSE,n)
            if (distances) jseeds[unclass] <- data[j,unclass]<=eps
            else{
              jseeds[unclass] <- distvector(data[j,],data[unclass,])<=e2
            }
            unregpoints[jseeds] <- unregpoints[jseeds]+1
#            if (cn==1)
#              cat(j," sum seeds=",sum(seeds)," unreg=",unregpoints[j],
#                  " newseeds=",sum(cv[jseeds]==0),"\n")
            if (sum(jseeds)+unregpoints[j]>=MinPts){
              seeds[jseeds] <- cv[jseeds]==0
              cv[jseeds & cv<0] <- cn
            }
          } # for j
        } # while sum seeds>0
      } # else (sum seeds + ... >= MinPts)
    } # if cv==0
  } # for i
  if (sum(cv==(-1))>0){
    noisenumber <- cn+1
    cv[cv==(-1)] <- noisenumber
  }
  else
    noisenumber <- FALSE
  out <- list(classification=cv, noisenumber=noisenumber,
              eps=eps, MinPts=MinPts, unregpoints=unregpoints)
  out
} # dbscan
# classification: classification vector
# noisenumber: number in the classification vector indicating noise points
# unregpoints: ignore...



Copyright © Togaware Pty Ltd
Support further development through the purchase of the PDF version of the book.
The PDF version is a formatted comprehensive draft book (with over 800 pages).
Brought to you by Togaware. This page generated: Sunday, 22 August 2010