source("packages.R")

## Load the neuroblastoma data set and the pre-computed optimal
## Gaussian segmentation models.
data(neuroblastoma, package="neuroblastoma")
load("Segmentor.models.RData")

## Compute model selection functions, label error, and target
## intervals, one per (profile.id, chromosome) segmentation problem.
selection <- Segmentor.models$loss[, modelSelection(
  .SD, complexity="n.segments"
), by=.(profile.id, chromosome)]
## Changepoints are the starts of all segments except the first one.
changes <- Segmentor.models$segs[1 < start, ]
errors <- labelError(
  selection, neuroblastoma$annotations, changes,
  change.var="chromStart",
  label.vars=c("min", "max"),
  model.vars="n.segments",
  problem.vars=c("profile.id", "chromosome"))
target.dt <- targetIntervals(
  errors$model.errors,
  c("profile.id", "chromosome"))
## Key model.errors by a combined problem ID so later ROChange calls
## can look problems up quickly.
errors$model.errors[, pid.chr := paste0(profile.id, ".", chromosome)]
setkey(errors$model.errors, pid.chr)

## Sanity check: target.dt must agree with the pre-computed target
## matrix in the neuroblastomaProcessed data set.
data(neuroblastomaProcessed, package="penaltyLearning")
all.pid.chr <- target.dt[, paste0(profile.id, ".", chromosome)]
processed.targets <- neuroblastomaProcessed$target.mat[all.pid.chr, ]
computed.targets <- target.dt[, cbind(min.log.lambda, max.log.lambda)]
dimnames(computed.targets) <- dimnames(processed.targets)
stopifnot(all.equal(computed.targets, processed.targets))
processed.features <- neuroblastomaProcessed$feature.mat[all.pid.chr, ]

## Each of these functions should return a list which contains at
## least one element, named predict, which is a function that takes a
## N x P feature matrix and returns an N-vector of log(penalty)
## values.
## Each entry is a zero-argument constructor returning a fitted model
## as a list with (at least) a $predict element: a function mapping an
## N x P feature matrix to an N-vector of predicted log(penalty).
## NOTE(review): these constructors read globals assigned inside the
## cross-validation loop below (is.train, train.feature.mat,
## train.target.mat, test.fold.vec) -- call them only from that loop.
model.fun.list <- list(
  ## BIC heuristic: no training at all, the prediction is just the
  ## log2.n feature column.
  BIC=function(){
    list(predict=function(X.pred){
      X.pred[, "log2.n"]
    })
  },
  ## cghseg.k: predict log.n plus a constant threshold, learned by
  ## minimizing the train label error curve from ROChange.
  cghseg.k=function(){
    pred.dt <- data.table(
      target.dt[is.train, ],
      pred.log.lambda=train.feature.mat[, "log.n"])
    roc <- ROChange(
      errors$model.errors, pred.dt,
      c("profile.id", "chromosome"))
    min.train.error <- roc$thresholds[threshold=="min.error", ]
    ## Midpoint of the min-error threshold interval.
    learned.thresh <- min.train.error[, (min.thresh+max.thresh)/2]
    list(predict=function(X.pred){
      X.pred[, "log.n"] + learned.thresh
    })
  },
  ## Un-regularized interval regression on two features.
  log.s.log.d=function(){
    IntervalRegressionUnregularized(
      train.feature.mat[, c("log.hall", "log.n")],
      train.target.mat)
  },
  ## Un-regularized interval regression on the single log.n feature.
  log.n=function(){
    IntervalRegressionUnregularized(
      train.feature.mat[, c("log.n"), drop=FALSE],
      train.target.mat)
  })

## Comment the following for loop if you don't want to fit the
## multivariate models, which take a long time because of their
## internal cross-validation loop. substitute() bakes the current
## reg.type value into each stored function, so the three closures do
## not all share the final loop value.
for(reg.type in c("1sd", "min(mean)", "mean(min)")){
  model.fun.list[[paste0("L1.reg.", reg.type)]] <- eval(substitute(function(){
    IntervalRegressionCV(
      train.feature.mat, train.target.mat,
      fold.vec=as.integer(test.fold.vec[is.train]),
      incorrect.labels.db=errors$model.errors,
      reg.type=REGTYPE)
  }, list(REGTYPE=reg.type)))
}

set.seed(1)
test.error.10fold.list <- list()

## 10-fold CV by simple random sampling.
## Assign each segmentation problem to one of 10 folds at random.
## FIX: rep(1:10, l=...) used partial argument matching (l ->
## length.out), an anti-pattern that lintr flags; spell it out.
## paste() converts the fold IDs to character for the != comparison.
test.fold.vec <- paste(sample(rep(1:10, length.out=nrow(target.dt))))
for(test.fold in unique(test.fold.vec)){
  ## Train on all problems outside the held-out fold.
  is.train <- test.fold.vec != test.fold
  train.target.mat <- processed.targets[is.train, ]
  train.feature.mat <- processed.features[is.train, ]
  test.feature.mat <- processed.features[!is.train, ]
  for(model.name in names(model.fun.list)){
    ## Fit the model; the constructor reads the train.* globals above.
    model.fun <- model.fun.list[[model.name]]
    model <- model.fun()
    ## Predict log(penalty) for the held-out problems, then compute
    ## test AUC and label error at the predicted threshold.
    pred.dt <- data.table(
      target.dt[!is.train, ],
      pred.log.lambda=as.numeric(model$predict(test.feature.mat)))
    test.roc <- ROChange(
      errors$model.errors, pred.dt,
      c("profile.id", "chromosome"))
    model.test.error <- with(test.roc, {
      data.table(
        test.fold, model.name, auc,
        error.percent=thresholds[threshold=="predicted", error.percent])
    })
    print(model.test.error)
    test.error.10fold.list[[paste(test.fold, model.name)]] <- model.test.error
  }
}
test.error.10fold <- do.call(rbind, test.error.10fold.list)
save(test.error.10fold, file="test.error.10fold.RData")