source("packages.R")

## Load the neuroblastoma data set and the pre-computed optimal
## Gaussian segmentation models.
data(neuroblastoma, package="neuroblastoma")
load("Segmentor.models.RData")

## Compute model selection functions, label error, and target
## intervals, one per (profile.id, chromosome) segmentation problem.
selection <- Segmentor.models$loss[, modelSelection(
  .SD, complexity="n.segments"
), by=.(profile.id, chromosome)]
## Changepoints are the starts of all segments except the first one.
changes <- Segmentor.models$segs[1 < start, ]
errors <- labelError(
  selection, neuroblastoma$annotations, changes,
  change.var="chromStart",
  label.vars=c("min", "max"),
  model.vars="n.segments",
  problem.vars=c("profile.id", "chromosome"))
target.dt <- targetIntervals(
  errors$model.errors,
  c("profile.id", "chromosome"))
## Key model.errors by a combined problem ID so later ROChange calls
## can look problems up quickly.
errors$model.errors[, pid.chr := paste0(profile.id, ".", chromosome)]
setkey(errors$model.errors, pid.chr)

## Sanity check: target.dt must agree with the pre-computed target
## matrix in the neuroblastomaProcessed data set.
data(neuroblastomaProcessed, package="penaltyLearning")
all.pid.chr <- target.dt[, paste0(profile.id, ".", chromosome)]
processed.targets <- neuroblastomaProcessed$target.mat[all.pid.chr, ]
computed.targets <- target.dt[, cbind(min.log.lambda, max.log.lambda)]
dimnames(computed.targets) <- dimnames(processed.targets)
stopifnot(all.equal(computed.targets, processed.targets))
processed.features <- neuroblastomaProcessed$feature.mat[all.pid.chr, ]

## Each of these functions should return a list which contains at
## least one element, named predict, which is a function that takes a
## N x P feature matrix and returns an N-vector of log(penalty)
## values.
## Each entry is a zero-argument constructor returning a fitted model
## as a list with (at least) a $predict element: a function mapping an
## N x P feature matrix to an N-vector of predicted log(penalty).
## NOTE(review): these constructors read globals assigned inside the
## cross-validation loop below (is.train, train.feature.mat,
## train.target.mat, test.fold.vec) -- call them only from that loop.
model.fun.list <- list(
  ## BIC heuristic: no training at all, the prediction is just the
  ## log2.n feature column.
  BIC=function(){
    list(predict=function(X.pred){
      X.pred[, "log2.n"]
    })
  },
  ## cghseg.k: predict log.n plus a constant threshold, learned by
  ## minimizing the train label error curve from ROChange.
  cghseg.k=function(){
    pred.dt <- data.table(
      target.dt[is.train, ],
      pred.log.lambda=train.feature.mat[, "log.n"])
    roc <- ROChange(
      errors$model.errors, pred.dt,
      c("profile.id", "chromosome"))
    min.train.error <- roc$thresholds[threshold=="min.error", ]
    ## Midpoint of the min-error threshold interval.
    learned.thresh <- min.train.error[, (min.thresh+max.thresh)/2]
    list(predict=function(X.pred){
      X.pred[, "log.n"] + learned.thresh
    })
  },
  ## Un-regularized interval regression on two features.
  log.s.log.d=function(){
    IntervalRegressionUnregularized(
      train.feature.mat[, c("log.hall", "log.n")],
      train.target.mat)
  },
  ## Un-regularized interval regression on the single log.n feature.
  log.n=function(){
    IntervalRegressionUnregularized(
      train.feature.mat[, c("log.n"), drop=FALSE],
      train.target.mat)
  })

## Comment the following for loop if you don't want to fit the
## multivariate models, which take a long time because of their
## internal cross-validation loop. substitute() bakes the current
## reg.type value into each stored function, so the three closures do
## not all share the final loop value.
for(reg.type in c("1sd", "min(mean)", "mean(min)")){
  model.fun.list[[paste0("L1.reg.", reg.type)]] <- eval(substitute(function(){
    IntervalRegressionCV(
      train.feature.mat, train.target.mat,
      fold.vec=as.integer(test.fold.vec[is.train]),
      incorrect.labels.db=errors$model.errors,
      reg.type=REGTYPE)
  }, list(REGTYPE=reg.type)))
}

set.seed(1)
test.error.10fold.list <- list()

## 10-fold CV by simple random sampling.
## Assign each segmentation problem to one of 10 folds at random.
## FIX: rep(1:10, l=...) used partial argument matching (l ->
## length.out), an anti-pattern that lintr flags; spell it out.
## paste() converts the fold IDs to character for the != comparison.
test.fold.vec <- paste(sample(rep(1:10, length.out=nrow(target.dt))))
for(test.fold in unique(test.fold.vec)){
  ## Train on all problems outside the held-out fold.
  is.train <- test.fold.vec != test.fold
  train.target.mat <- processed.targets[is.train, ]
  train.feature.mat <- processed.features[is.train, ]
  test.feature.mat <- processed.features[!is.train, ]
  for(model.name in names(model.fun.list)){
    ## Fit the model; the constructor reads the train.* globals above.
    model.fun <- model.fun.list[[model.name]]
    model <- model.fun()
    ## Predict log(penalty) for the held-out problems, then compute
    ## test AUC and label error at the predicted threshold.
    pred.dt <- data.table(
      target.dt[!is.train, ],
      pred.log.lambda=as.numeric(model$predict(test.feature.mat)))
    test.roc <- ROChange(
      errors$model.errors, pred.dt,
      c("profile.id", "chromosome"))
    model.test.error <- with(test.roc, {
      data.table(
        test.fold, model.name, auc,
        error.percent=thresholds[threshold=="predicted", error.percent])
    })
    print(model.test.error)
    test.error.10fold.list[[paste(test.fold, model.name)]] <- model.test.error
  }
}
test.error.10fold <- do.call(rbind, test.error.10fold.list)
save(test.error.10fold, file="test.error.10fold.RData")