# # R procedures for COSA # # Friedman & Meulman (2004): Clustering Objects on Subsets of Attributes # # cosaversion=function() { describe } describe=function() { print(" R/COSA 1.0 03/18/04 Copyright (2004) by Jerome H. Friedman and Jacqueline J. Meulman R/COSA 2.0/COSA_NOVA-1 10/01/09 Copyright (2009) by Jerome H. Friedman and Jacqueline J. Meulman R/COSA 2.0/COSA_NOVA-1 **************************************************************************************************** In this new version of COSA, new output features are added, and two additional parameters are added to the analysis options. 1. The COSA-weights can now be read into the R workspace by using the function 'readweights'. Similarly, the COSA-distances can be re-read (function 'readdist'), as well as the scaled data that are used to derive COSA-distances (function 'readsdata'; see below for scaling options). ww<-readweights(nrow, ncol); dd<-readdist(nrow); sdata<-readsdata(nrow, ncol), where nrow and ncol refer to the numbers of rows and columns in the data, respectively. 2. The user can choose between robust (stand=1) or standard scaling of the data (stand=2). When stand=0, the user can beforehand perform any scaling of the data (preprocessing) before using cosadist, and COSA will leave that user-defined scaling unchanged. The default value is set to 1, the fixed value in the previous value of COSA. A separate function autoscale with standard scaling (robust=F) and robust scaling (robust=T) is also provided. The call for autoscale xx<-autoscale(x, xmi=NA, xmo=NA, robust=F) For example, to autoscale first the columns and then the rows in the data: xx<-autoscale(x); xx<-t(autoscale(t(xx))); d<-cosadist(xx,stand=0) 3. The user can determine the number of inner (niter) and outer iterations (noit) to use in the homotopy strategy (with relax > 0) to transfer from the inverse exponential distance (with relax=0) to the Euclidean distance, obtained when a large enough number of outer iterations is chosen. Starting with the initial value of the homotopy parameter (equal to lambda, with default 0.20) and using increments determined by relax (with default relax = 0.10), 100 outer iterations will give the value 2.2 for the homotopy parameter, where the COSA distances will have become close to the Euclidean distances. The number of inner iterations determines the number of steps taken for the same value of the homotopy parameter, before the value of the homotopy is increased for the next outer iteration. In the previous version of COSA, the number of outer iterations was kept at 1 for relax = 0, while the number of inner iterations was kept at 1 for relax > 0. The new default call for the function cosadist: d<-cosadist(x, lx=rep(1,ncol(x)), targs=NULL, dtargs=NULL, knear=sqrt(nrow(x)), xmiss=9.9e30, lambda=0.2, qntls=c(0.05,0.95), wtcomb='each', relax=0.10, conv=1.0e-5, niter=5, noit=100, stand=1) The old default call was: d<-cosadist(x, lx=rep(1,ncol(x)), targs=NULL, dtargs=NULL, knear=sqrt(nrow(x)), xmiss=9.9e30, lambda=0.2, qntls=c(0.05,0.95), wtcomb='each', relax=0, niter=20) Former COSA results can be re-obtained by the new COSA version by using d<-cosadist(x, relax=0, niter=niter, noit=1) or d<-cosadist(x, relax=relax, niter=1, noit=noit) **************************************************************************************************** ") } cosadist=function (x, lx=rep(1,ncol(x)), targs=NULL, dtargs=NULL, knear=sqrt(nrow(x)), xmiss=9.9e30, lambda=0.2, qntls=c(0.05,0.95), wtcomb='each', relax=0.10, conv=1.0e-5, niter=5, noit=100, stand=1) { nrx=nrow(x) if (missing(lx) && !is.null(attr(x,'lx'))) { lxt=attr(x,'lx')} else {lxt=lx} if (missing(xmiss) && !is.null(attr(x,'xmiss'))) { xmisst=attr(x,'xmiss')} else {xmisst=xmiss} if (missing(targs)) { ktarg=1} else if (is.numeric(targs)) { ktarg=0} else if (targs=='high') { ktarg=1} else if (targs=='high/low') { ktarg=1} else if (targs=='low') { ktarg=2} else {warning(paste(as.character(targs)," invalid value for targs."))} if (missing(dtargs)) { ltarg=0} else { ltarg=1} if (wtcomb=='each') { wtcom=1} else if (wtcomb=='whole') { wtcom=2} else { wtcom=1 warning(paste(as.character(wtcomb), " - invalid value for wtcomb: wtcomb='each' used.")) } ic=c(nrx,ncol(x),trunc(knear),ktarg,niter,wtcom,ltarg,noit,stand) zz=file(paste(cosadir,'/iparms.cos',sep=''),'wb') writeBin(as.integer(ic),zz,size=4) close(zz) zz=file(paste(cosadir,'/lx.cos',sep=''),'wb') writeBin(as.integer(lxt),zz,size=4) close(zz) zz=file(paste(cosadir,'/rparms.cos',sep=''),'wb') writeBin(as.real(c(xmisst,lambda,qntls,relax,conv)),zz,size=4) close(zz) zz=file(paste(cosadir,'/data.cos',sep=''),'wb') writeBin(as.real(x),zz,size=4) close(zz) if (ktarg==0) { zz=file(paste(cosadir,'/targs.cos',sep=''),'wb') writeBin(as.real(targs),zz,size=4) close(zz) } if (ltarg==1) { zz=file(paste(cosadir,'/dtargs.cos',sep=''),'wb') writeBin(as.real(dtargs),zz,size=4) close(zz) } wd=getwd(); setwd(cosadir) if (platform=='windows') { system('cmd.exe /c COSA_2.exe',minimized = F);} else {system('/usr/X11R6/bin/xterm -sb -sl 400 -T COSA -e ./COSA_2.exe')} setwd(wd) stat=scan(paste(cosadir,'/status.cos',sep=''),what='',quiet=T) if (stat!='OK') { readcosa('status.cos'); dis=NULL; invisible(dis)} else { zz=file(paste(cosadir,'/dist.cos',sep=''),'rb') dis=readBin(zz,numeric(),size=4,n=nrx*(nrx-1)/2) close(zz) attr(dis,"Size")=nrx invisible(dis) } } hierclust=function(dist,method='average',labels=F, hang=0.02, sub='',title='', xlab='COSA distance clustering', denplot=T) { z=hclust(dist,method) if (denplot) { plot(z,labels=labels,hang=hang,sub=sub,main='',xlab=xlab) if (!missing(title)) title(title) } invisible(z) } attimp=function (x,group,range=1:min(20,ncol(x)),times=0,lx=rep(1,ncol(x)),donames=T, fast=F,trans=1,maximp=20,xmiss=9.9e30,horiz=F,main='Cluster', att.names=1:ncol(x),names.size=1,ylim=c(0,wtave[o[1]]),ntick=20) { if (maximp<=0) stop(' maximp must have a positive value.') if (missing(lx) && !is.null(attr(x,'lx'))) { lxt=attr(x,'lx')} else {lxt=lx} if (missing(xmiss) && !is.null(attr(x,'xmiss'))) { xmisst=attr(x,'xmiss')} else {xmisst=xmiss} eps=1/maximp; disp=rep(0,ncol(x)); wtave=rep(0,ncol(x)) for (k in 1:ncol(x)) { if (lxt[k]<=0) { wtave[k]=0; next} z=x[x[,k]=xmisst) lu=lu-1 bars=rep(0,lu) for (l in 1:lu) { bars[l]=length(group[x[group,vj]==u[l]])} barplot(bars,names.arg=as.character(u[1:lu]), main=paste('Attribute',as.character(att.names[vj])), xlab=group.name) } } for (j in 1:lv) { vj=atts[j]; othr=1:nrow(x); othr=othr[-group] if (lxt[vj]<4) { othr=othr[x[othr,vj]=xmisst) lu=lu-1 bars=rep(0,lu) for (l in 1:lu) { bars[l]=length(othr[x[othr,vj]==u[l]])} barplot(bars,names.arg=as.character(u[1:lu]), main='',xlab='Others') } } if (all) { for (j in 1:lv) { vj=atts[j] if (lxt[vj]<4) { z=x[x[,vj]=xmisst) lu=lu-1 bars=rep(0,lu) for (l in 1:lu) { bars[l]=length(x[x[,vj]==u[l],vj])} barplot(bars,names.arg=as.character(u[1:lu]),xlab='All',main='') } } } invisible() } readcosa=function(file='cosa.hlp') { wd=getwd(); setwd(cosadir) if (platform=='windows') { system('readfile.exe',input=file, show.output.on.console=T,minimized=T)} else { system(paste('./readfile.exe <',file),intern=T)} setwd(wd) } getclust=function (HCOBJ, N = 20,rec.col='blue',old.col='blue') { retval <- list() oldk <- NULL oldx <- NULL for (n in 1:N) { x <- locator(1) if (is.null(x)) break k <- min(which(rev(HCOBJ$height) < x$y)) k <- max(k, 2) if (!is.null(oldx)) { rect.hclust(HCOBJ, k = oldk, x = oldx, border = old.col) } retval[[n]] <- unlist(rect.hclust(HCOBJ, k = k, x = x$x, border = rec.col)) oldx <- x$x oldk <- k } invisible(retval) } plotimp=function(attimp,atts=attimp$att[1:min(40,length(attimp$att))], imporder=F,donames=T,trans=1, horiz=F,main='Cluster', att.names=1:length(attimp$att),names.size=1, plot=T, ylim=c(0,max(imps)),ntick=20) { if (is.character(atts)) { if (missing(att.names)) stop(' addressing attributes by name requires input "att.names"') attnums=match(atts,att.names) } else { attnums=atts} imps=attimp$imp[match(attnums,attimp$att)] o=1:length(atts); if (imporder) o=order(-imps) if (plot) { if (horiz) { xl='Importance'; yl=''} else { xl=''; yl='Importance'} if (donames) { bplot(imps[o],names=as.character(att.names[attnums[o]]), horiz=horiz,xlab=xl,ylab=yl,zlim=ylim,trans=trans,ntk=ntick, names.size=names.size) title(main) } else { bplot(imps[o],ylab=yl,xlab=xl,zlim=ylim, horiz=horiz,trans=trans,ntk=ntick) title(main) } } if (plot) {invisible(imps)} else {imps} } cosahelp <- function(fun="cosahelp") { if (!is.character(fun)) stop("argument must be of type character.") if (fun!="cosadist" && fun!="hierclust" && fun!="attimp" && fun!="getclust" && fun!="cosahelp" && fun!="attvalues" && fun!="cosaversion" && fun!="plotimp") { stop(paste(fun,"is not an COSA procedure.")) } wd=getwd(); setwd(cosadir) if (platform=='windows'){system(paste('cmd.exe /k type ',fun,'.hlp',sep=''), wait=F)} else { system(paste('/usr/X11R6/bin/xterm -sb -sl 500 -hold -T ',fun, ' -rightbar -e cat ',fun,'.hlp&',sep=''))} setwd(wd) invisible() } weights<-readweights<-function(nrow,ncol){ zz=file(paste(cosadir,'/weights.cos',sep=''),'rb') weights=readBin(zz,numeric(),size=4,n=nrow*ncol) close(zz) weights<-matrix(weights,ncol=ncol,nrow=nrow) weights<-weights*(ncol) } sdata<-readsdata<-function(nrow,ncol){ zz=file(paste(cosadir,'/sdata.cos',sep=''),'rb') sdata=readBin(zz,numeric(),size=4,n=nrow*ncol) close(zz) sdata<-matrix(sdata,ncol=ncol,nrow=nrow) } distances<-readdist<-function(nrx){ zz=file(paste(cosadir,'/dist.cos',sep=''),'rb') dis=readBin(zz,numeric(),size=4,n=nrx*(nrx-1)/2) close(zz) attr(dis,"Size")=nrx invisible(dis) } weightimp=function(w,group,rang,times) { plot(rang,(sort(colsum(w[group,]),decreasing=T))[rang], ylab='Importance',xlab='Attribute',type='l') for (it in 1:times) { mr=trunc(nrow(w)*runif(length(group))+1) lines(rang,(sort(colsum(w[mr,]),decreasing=T))[rang],col=2) } } attimp=function (x,group,range=1:min(20,ncol(x)),times=0,lx=rep(1,ncol(x)),donames=T, fast=F,trans=1,maximp=20,xmiss=9.9e30,horiz=F,main='Cluster', att.names=1:ncol(x),names.size=1,ylim=c(0,wtave[o[1]]),ntick=20) { if (maximp<=0) stop(' maximp must have a positive value.') if (missing(lx) && !is.null(attr(x,'lx'))) { lxt=attr(x,'lx')} else {lxt=lx} if (missing(xmiss) && !is.null(attr(x,'xmiss'))) { xmisst=attr(x,'xmiss')} else {xmisst=xmiss} eps=1/maximp; disp=rep(0,ncol(x)); wtave=rep(0,ncol(x)) for (k in 1:ncol(x)) { if (lxt[k]<=0) { wtave[k]=0; next} z=x[x[,k] 0) {out=list(imp=impp,att=att,impo=impo,randave=wttave/times)} invisible(out) } autoscale=function(x, xmi=NA, xmo=NA, robust=F) { y<-matrix(nrow=nrow(x),ncol=ncol(x)) if (!is.na(xmi)) x[x==xmi]<-NA for (i in 1:ncol(x)) { u<-x[!is.na(x[,i]),i] if (robust) { scl<-IQR(u)/1.349; if (scl > 0) u<-(u-median(u))/scl if (scl == 0) u<-(u-mean(u))/sqrt(var(u)) } else { scl<-var(u); if (scl > 0) { scl<-sqrt(scl); u<-(u-mean(u))/scl} } y[!is.na(x[,i]),i]<-u; y[is.na(x[,i]),i]<-xmo } invisible(y) }