source("packages.R")
source("animint.R")
data(neuroblastoma, package="neuroblastoma")
load("Segmentor.models.RData")
selection <-
  Segmentor.models$loss[, modelSelection(
    .SD, complexity="n.segments"
  ), by=.(profile.id, chromosome)]
changes <- Segmentor.models$segs[1 < start, ]
errors <- labelError(
  selection, neuroblastoma$annotations, changes,
  change.var="chromStart",
  label.vars=c("min", "max"),
  model.vars="n.segments",
  problem.vars=c("profile.id", "chromosome"))
target.dt <- targetIntervals(
  errors$model.errors, c("profile.id", "chromosome"))
some.profiles <- data.table(
  neuroblastoma$profiles)[target.dt, on=.(profile.id, chromosome)]
feature.dt <-  some.profiles[, list(
  log2.n=log(log(.N)),
  log.mad=log(median(abs(diff(logratio))))
), by=.(profile.id, chromosome)]
train.dt <- data.table(
  feature=feature.dt$log2.n,
  target.dt)

## The log2.n feature is log(log(n = number of data points to segment)).

## The model selection criterion is argmin_K totalSquareLoss_K +
## lambda * K where K is the number of segments in the piecewise
## constant model, and lambda is a non-negative penalty constant.

## The target.mat contains the interval of log(lambda) values for
## which the number of incorrect labels is minimized.

## The BIC model selection criterion is lambda = log(n), where n is
## the number of data points to segment. This implies log(lambda) =
## log(log(n)) = the log2.n feature in all.features.mat.

## We can thus visualize the BIC penalty as a line with slope 1 and
## intercept 0 in a plot of log(lambda) versus log(log(n)).
BIC.df <- data.frame(slope=1, intercept=0, model.name="BIC")
train.dt[, pred.log.lambda := feature ] #for the BIC
train.dt$model.name <- "BIC"
train.dt[, residual := targetIntervalResidual(
  cbind(min.log.lambda, max.log.lambda), pred.log.lambda)]

possible <- train.dt[, list(
  negative=sum(-Inf < min.log.lambda),
  positive=sum(max.log.lambda < Inf),
  total=.N
  )]

fit <- survreg(
  Surv(min.log.lambda, max.log.lambda, type="interval2") ~ feature,
  train.dt, dist="gaussian")
pred.dt <- data.table(train.dt)
pred.dt[, pred.log.lambda := predict(fit)]
pred.dt[, residual := targetIntervalResidual(
  cbind(min.log.lambda, max.log.lambda), pred.log.lambda)]
pred.dt$model.name <- "learned"

model.list <- list(
  BIC=train.dt,
  learned=pred.dt)
roc.list <- list()
auc.list <- list()
pred.dot.list <- list()
auc.polygon.list <- list()
for(model.name in names(model.list)){
  model.dt <- model.list[[model.name]]
  feature.result <- ROChange(errors$model.errors, model.dt, c("profile.id", "chromosome"))
  pred.dot.list[[model.name]] <- data.table(
    model.name, feature.result$thresholds[threshold=="predicted",])
  roc.list[[model.name]] <- data.table(model.name, feature.result$roc)
  auc.list[[model.name]] <- data.table(model.name, auc=feature.result$auc)
  auc.polygon.list[[model.name]] <- data.table(model.name, feature.result$auc.polygon)
}
roc <- do.call(rbind, roc.list)
auc <- do.call(rbind, auc.list)
pred.dot <- do.call(rbind, pred.dot.list)
auc.polygon <- do.call(rbind, auc.polygon.list)

## Zoomed ROC curves.
auc$TPR <- c(0.85, 0.9)
auc$FPR <- 0.2
ggplot()+
  theme_bw()+
  geom_text(aes(FPR, TPR, color=model.name, label=sprintf("AUC = %.4f", auc)),
            vjust=0,
            data=auc)+
  geom_path(aes(FPR, TPR, color=model.name, group=model.name),
            data=roc)+
  scale_fill_manual(values=c(default="grey", min.error="black"))+
  geom_point(aes(FPR, TPR, color=model.name, fill=threshold),
             data=pred.dot,
             size=3,
             shape=21)+
  coord_equal(xlim=c(0, 0.25), ylim=c(0.75, 1))

roc[, mid.thresh := (min.thresh+max.thresh)/2]
pred.dot[, mid.thresh := (min.thresh+max.thresh)/2]
roc[, threshold := "other"]
## We would like to sample some thresholds evenly in the ROC space, so
## here we compute dist.from.zero, which is the cumulative euclidean
## distance from (0,0) in the ROC space.
roc[, dist.from.zero := cumsum(sqrt({
  diff(c(0, FPR))^2 + diff(c(0, TPR))^2
})), by=model.name]
grid.thresh <- roc[is.finite(mid.thresh), {
  .SD[as.integer(seq(1, .N, l=50)[c(1,5,10,15,20, 25, 30:50)]), ]
}, by=model.name]
grid.thresh <- roc[is.finite(mid.thresh), {
  r <- range(dist.from.zero)
  d.vec <- seq(r[1], r[2], l=50)
  i.vec <- sapply(d.vec, function(d)which.min(abs(dist.from.zero-d)))
  .SD[i.vec,]
}, by=model.name]
some.thresh <- unique(rbind(
  grid.thresh[, names(pred.dot), with=FALSE],
  pred.dot))
setkey(some.thresh, model.name, mid.thresh)
some.thresh[, prev.thresh := {
  d <- diff(mid.thresh)/2
  c(mid.thresh[1]-d[1], mid.thresh[-.N]+d)
  }, by=model.name]
some.thresh[, next.thresh := {
  d <- diff(mid.thresh)/2
  c(mid.thresh[-.N]+d, mid.thresh[.N]+d[.N-1])
}, by=model.name]
some.thresh[, stopifnot(
  identical(prev.thresh[-1], next.thresh[-.N])
  ), by=model.name]
some.thresh[, slope := ifelse(model.name=="BIC", 1, coef(fit)[["feature"]])]
some.thresh[, intercept := {
  ifelse(model.name=="BIC", 0, coef(fit)[["(Intercept)"]])+mid.thresh
}]
ggplot()+
  theme_bw()+
  geom_text(aes(FPR, TPR, label=sprintf("AUC = %.4f", auc)),
            vjust=0,
            data=auc)+
  geom_path(aes(FPR, TPR, 
                color=dist.from.zero, group=model.name),
            data=roc)+
  scale_fill_manual(values=c(default="grey", min.error="black"))+
  geom_point(aes(FPR, TPR, fill=threshold),
             data=pred.dot,
             size=3,
             shape=21)+
  scale_color_gradient(low="black", high="red")

both.pred <- rbind(train.dt, pred.dt)
setkey(both.pred, model.name)
setkey(some.thresh, model.name)
some.thresh.pred <- both.pred[some.thresh, allow.cartesian=TRUE]
some.thresh.pred[, pred.log.lambda := slope * feature + intercept]
some.thresh.pred[, residual := targetIntervalResidual(
  cbind(min.log.lambda, max.log.lambda), pred.log.lambda)]
total.thresh.pred <- unique(some.thresh.pred)[, list(
  total.residual=sum(residual),
  intervals=.N
), by=.(model.name, sign.residual=sign(residual), mid.thresh)]
total.thresh.pred[, top := ifelse(model.name=="BIC", 6.5, -4)]
total.thresh.pred[, left := ifelse(model.name=="BIC", 1.45, 1.8)]
total.thresh.pred[, log.penalty := top-(1-sign.residual)*0.7]
residual.thresh.pred <- some.thresh.pred[residual!=0,]

thresh.colors <- c(predicted="green", min.error="black", other="white")
viz <- list(
  title="BIC versus learned penalty in neuroblastoma data",
  thresholds=ggplot()+
    theme_bw()+
    guides(fill="none", color="none")+
    geom_line(aes(mid.thresh, errors, group=model.name, color=model.name,
                  showSelected=model.name), data=some.thresh)+
    geom_tallrect(aes(xmin=prev.thresh, xmax=next.thresh,
                      clickSelects.variable=paste0(model.name, ".thresh"),
                      clickSelects.value=mid.thresh,
                      color=model.name,
                      showSelected=model.name),
                  size=3,
                  alpha=0.5,
                  data=some.thresh)+
    ylab("total incorrect labels")+
    xlab("constant/threshold added to log(penalty)")+
    scale_fill_manual(values=thresh.colors)+
    scale_color_manual(values=c(BIC="red", learned="deepskyblue"))+
    geom_point(aes((max.thresh+min.thresh)/2, errors,
                   clickSelects.variable=paste0(model.name, ".thresh"),
                   clickSelects.value=mid.thresh,
                   color=model.name,
                   showSelected=model.name,
                   showSelected2=threshold,
                   fill=threshold),
               data=pred.dot,
               size=4,
               shape=21),
  roc=ggplot()+
    theme_bw()+
    geom_text(aes(FPR, TPR, color=model.name, label=sprintf("AUC = %.4f", auc)),
              hjust=0,
              data=auc)+
    geom_path(aes(FPR, TPR, color=model.name, group=model.name),
              size=2,
              data=roc)+
    scale_fill_manual(values=thresh.colors, breaks=names(thresh.colors))+
    scale_color_manual(values=c(BIC="red", learned="deepskyblue"))+
    geom_point(aes(FPR, TPR,
                   showSelected.variable=paste0(model.name, ".thresh"),
                   showSelected.value=mid.thresh,
                   showSelected2=threshold,
                   showSelected=model.name),
               data=some.thresh,
               color="grey",
               size=6,
               shape=21)+
    geom_point(aes(FPR, TPR,
                   clickSelects.variable=paste0(model.name, ".thresh"),
                   clickSelects.value=mid.thresh,
                   color=model.name, fill=threshold),
               data=some.thresh,
               size=4,
               alpha=0.8,
               shape=21)+
    coord_equal(),
  regression=ggplot()+
    theme_bw()+
    theme_animint(width=800)+
    geom_text(aes(
      left, log.penalty, 
      color=model.name,
      showSelected.variable=paste0(model.name, ".thresh"),
      showSelected.value=mid.thresh,
      key=paste(model.name, sign.residual),
      label=ifelse(sign.residual==0, sprintf(
        "%d intervals correctly predicted", intervals), sprintf(
        "total too %s = %.1f (%d/%d intervals)",
          ifelse(sign.residual==1, "high", "low"),
          total.residual, intervals,
          possible[, ifelse(sign.residual==1, positive, negative)]))),
              data=total.thresh.pred,
              hjust=0)+
    geom_point(aes(feature, min.log.lambda, fill=limit),
               data=data.table(
                 limit="min", train.dt[is.finite(min.log.lambda),]),
               shape=21)+
    geom_point(aes(feature, max.log.lambda, fill=limit),
               data=data.table(
                 limit="max", train.dt[is.finite(max.log.lambda),]),
               shape=21)+
    geom_abline(aes(slope=slope,
                    showSelected.variable=paste0(model.name, ".thresh"),
                    showSelected.value=mid.thresh,
                    intercept=intercept, color=model.name),
                data=some.thresh)+
    geom_segment(aes(
      feature, pred.log.lambda, xend=feature, color=model.name,
      showSelected.variable=paste0(model.name, ".thresh"),
      showSelected.value=mid.thresh,
      yend=pred.log.lambda-residual),
                 size=1,
                 data=residual.thresh.pred)+
    scale_fill_manual(values=c(min="black", max="white"))+
    scale_color_manual(values=c(BIC="red", learned="deepskyblue"))+
    scale_x_continuous("feature = log(log(n = number of data points to segment))")+
    scale_y_continuous("<-- more changes     log(penalty)     less changes -->"),
  first=list())
pred.thresh.only <- some.thresh[threshold=="predicted",]
for(row.i in 1:nrow(pred.thresh.only)){
  r <- pred.thresh.only[row.i, ]
  viz$first[[paste0(r$model.name, ".thresh")]] <- r$mid.thresh
}
animint2dir(viz, "figure-regression-interactive")