Chapter 14, Named clickSelects/showSelected

This chapter explains how to use named clickSelects/showSelected variables for creating data-driven selector names. This feature makes it easier to write animint code, and makes it faster to compile.

Chapter outline:

We begin by downloading the PSJ data set and computing the data to plot.
We show one method of defining an animint with many selectors, using for loops. This method is technically correct, but computationally inefficient.
We then explain the preferred method for defining an animint with many selectors, using named clickSelects/showSelected. This method is more computationally efficient, and easier to code.

Download data set

The example data come from the PeakSegJoint package. The code below downloads the data set.

if(!file.exists("PSJ.RData")){
  u <- paste0(
    "http://github.com/tdhock/animint-examples",
    "/blob/master/data/PSJ.RData?raw=true"
    )
  library(httr)
  request <- GET(u)
  stop_for_status(request)
  writeBin(content(request), "PSJ.RData")
}
load("PSJ.RData")

Compute data to plot

The section below computes some common data that we will use in two data visualizations below.

res.error <- PSJ$error.total.chunk
ann.colors <- c(
  noPeaks="#f6f4bf",
  peakStart="#ffafaf",
  peakEnd="#ff4c4c",
  peaks="#a445ee")
## prob.regions are the black segments that show which regions are
## mapped to which segmentation problems.
library(data.table)
all.regions <- data.table(do.call(rbind, PSJ$regions.by.problem))
prob.regions.names <- c(
  "bases.per.problem", "problem.i", "problem.name",
  "chromStart", "chromEnd")
prob.regions <- unique(data.frame(all.regions)[, prob.regions.names])
prob.regions$sample.id <- "problems"
all.modelSelection <- data.table(do.call(
  rbind, PSJ$modelSelection.by.problem))
modelSelection.errors <- all.modelSelection[!is.na(errors)]
penalty.range <- all.modelSelection[, c(
  min(max.log.lambda), max(min.log.lambda))]
penalty.mid <- mean(penalty.range)
coverage.counts <- table(PSJ$coverage$sample.id)
facet.rows <- length(coverage.counts)+1
dvec <- diff(log(res.error$bases.per.problem))
dval <- exp(mean(dvec))
dval2 <- (dval-1)/2 + 1
res.error$min.bases.per.problem <- res.error$bases.per.problem/dval2
res.error$max.bases.per.problem <- res.error$bases.per.problem*dval2
modelSelection.labels <- unique(all.modelSelection[, data.table(
  problem.name=problem.name,
  bases.per.problem=bases.per.problem,
  problemStart=problemStart,
  problemEnd=problemEnd,
  min.log.lambda=penalty.mid,
  peaks=max(peaks)+0.5)])

Define data viz using for loops

The R code below constructs a data viz using for loops.

library(animint2)
print(timing.for.construct <- system.time({
  viz.for <- list(
    coverage=ggplot()+
      geom_segment(aes(
        chromStart/1e3, problem.i,
        xend=chromEnd/1e3, yend=problem.i),
        showSelected="bases.per.problem",
        clickSelects="problem.name",
        data=prob.regions)+
      ggtitle("select problem")+
      geom_text(aes(
        chromStart/1e3, problem.i,
        label=sprintf("%d problems mean size %.1f kb",
                      problems, mean.bases/1e3)),
        showSelected="bases.per.problem",
        data=PSJ$problem.labels,
        hjust=0)+
      geom_segment(aes(
        problemStart/1e3, problem.i,
        xend=problemEnd/1e3, yend=problem.i),
        showSelected="bases.per.problem",
        clickSelects="problem.name",
        size=5,
        data=PSJ$problems)+
      scale_y_continuous(
        "aligned read coverage",
        breaks=function(limits){
          floor(limits[2])
        })+
      scale_linetype_manual(
        "error type",
        limits=c(
          "correct", 
          "false negative",
          "false positive"),
        values=c(
          correct=0,
          "false negative"=3,
          "false positive"=1))+
      scale_x_continuous(paste(
        "position on chr11",
        "(kilo bases = kb)"))+
      coord_cartesian(xlim=c(118167.406, 118238.833))+
      geom_tallrect(aes(
        xmin=chromStart/1e3, xmax=chromEnd/1e3,
        fill=annotation),
        alpha=0.5,
        color="grey",
        data=PSJ$filled.regions)+
      scale_fill_manual(values=ann.colors)+
      theme_bw()+
      theme_animint(width=1500, height=facet.rows*100)+
      theme(panel.margin=grid::unit(0, "cm"))+
      facet_grid(sample.id ~ ., labeller=function(df){
        df$sample.id <- sub("McGill0", "", sub(" ", "\n", df$sample.id))
        df
      }, scales="free")+
      geom_line(aes(
        base/1e3, count),
        data=PSJ$coverage,
        color="grey50"),
    resError=ggplot()+
      ggtitle("select problem size")+
      ylab("minimum percent incorrect regions")+
      geom_tallrect(aes(
        xmin=min.bases.per.problem,
        xmax=max.bases.per.problem),
        clickSelects="bases.per.problem",
        alpha=0.5,
        data=res.error)+
      scale_x_log10()+
      geom_line(aes(
        bases.per.problem, errors/regions*100,
        color=chunks, size=chunks),
        data=data.frame(res.error, chunks="this"))+
      geom_line(aes(
        bases.per.problem, errors/regions*100,
        color=chunks, size=chunks),
        data=data.frame(PSJ$error.total.all, chunks="all")),
    modelSelection=ggplot()+
      geom_segment(aes(
        min.log.lambda, peaks,
        xend=max.log.lambda, yend=peaks),
        showSelected=c("bases.per.problem", "problem.name"),
        data=data.frame(all.modelSelection, what="peaks"),
        size=5)+
      geom_text(aes(
        min.log.lambda, peaks,
        label=sprintf(
          "%.1f kb in problem %s",
          (problemEnd-problemStart)/1e3, problem.name)),
        showSelected=c("problem.name", "bases.per.problem"),
        data=data.frame(modelSelection.labels, what="peaks"))+
      geom_segment(aes(
        min.log.lambda, as.integer(errors),
        xend=max.log.lambda, yend=as.integer(errors)),
        showSelected=c("bases.per.problem", "problem.name"),
        data=data.frame(modelSelection.errors, what="errors"),
        size=5)+
      ggtitle("select number of samples with 1 peak")+
      ylab("")+
      facet_grid(what ~ ., scales="free"),
    title="Animint compiler with for loops",
    first=PSJ$first)
  ## For every problem there is a selector (called problem.dot) for the
  ## number of peaks in that problem. So in this for loop we add a few
  ## layers with aes_string(clickSelects=problem.dot) or
  ## aes_string(showSelected=problem.dot) to the coverage and
  ## modelSelection plots.
  for(problem.dot in names(PSJ$modelSelection.by.problem)){
    regions.dt <- PSJ$regions.by.problem[[problem.dot]]
    regions.dt[[problem.dot]] <- regions.dt$peaks
    if(!is.null(regions.dt)){
      viz.for$coverage <- viz.for$coverage+
        geom_tallrect(aes(
          xmin=chromStart/1e3,
          xmax=chromEnd/1e3,
          linetype=status),
          showSelected=c(problem.dot, "bases.per.problem"),
          data=data.frame(regions.dt),
          fill=NA,
          color="black")
    }
    if(problem.dot %in% names(PSJ$peaks.by.problem)){
      peaks <- PSJ$peaks.by.problem[[problem.dot]]
      peaks[[problem.dot]] <- peaks$peaks
      prob.peaks.names <- c(
        "bases.per.problem", "problem.i", "problem.name",
        "chromStart", "chromEnd", problem.dot)
      prob.peaks <- unique(data.frame(peaks)[, prob.peaks.names])
      prob.peaks$sample.id <- "problems"
      viz.for$coverage <- viz.for$coverage +
        geom_segment(aes(
          chromStart/1e3, 0,
          xend=chromEnd/1e3, yend=0),
          clickSelects="problem.name",
          showSelected=c(problem.dot, "bases.per.problem"),
          data=peaks, size=7, color="deepskyblue")+
        geom_segment(aes(
          chromStart/1e3, problem.i,
          xend=chromEnd/1e3, yend=problem.i),
          clickSelects="problem.name",
          showSelected=c(problem.dot, "bases.per.problem"),
          data=prob.peaks, size=7, color="deepskyblue")
    }
    modelSelection.dt <- PSJ$modelSelection.by.problem[[problem.dot]]
    modelSelection.dt[[problem.dot]] <- modelSelection.dt$peaks
    viz.for$modelSelection <- viz.for$modelSelection+
      geom_tallrect(aes(
        xmin=min.log.lambda, 
        xmax=max.log.lambda), 
        clickSelects=problem.dot,
        showSelected=c("problem.name", "bases.per.problem"),
        data=modelSelection.dt, alpha=0.5)
  }
}))

## utilisateur     système      écoulé 
##      19.935       0.056      20.028

Note the timing of the code above. It takes a long time just to evaluate the R code that defines this data viz, since it has so many geoms. Next, we compile the data visualization.

print(timing.for.compile <- system.time({
  animint2dir(viz.for, "Ch14-for")
}))

## Warning: Using size for a discrete variable is not advised.
## Warning: Using size for a discrete variable is not advised.

## Warning in checkSingleShowSelectedValue(meta$selectors): showSelected variables
## with only 1 level: chr11.118184422.118184700peaks,
## chr11.118192951.118193582peaks, chr11.118203893.118204314peaks

## utilisateur     système      écoulé 
##    1995.705      11.266    2068.209

Note that the compilation also takes a long time, since there are so many geoms. The data viz can be viewed on Ch14-for/index.html. In the next section we will create the same data viz, but more efficiently.

Define data viz using named clickSelects/showSelected

In this section we use named clickSelects/showSelected to create a more efficient version of the previous data visualization. In general, any data visualization defined using for loops in R code can be made more efficient by instead using this method.

sample.peaks <- data.table(do.call(rbind, PSJ$peaks.by.problem))
prob.peaks.names <- c(
  "bases.per.problem", "problem.i", "problem.name", "peaks",
  "chromStart", "chromEnd")
problem.peaks <- unique(sample.peaks[, ..prob.peaks.names])
problem.peaks$sample.id <- "problems"
peakvar <- function(position){
  paste0(gsub("[-:]", ".", position), "peaks")
}
all.regions[, selector := peakvar(problem.name)]
sample.peaks[, selector := peakvar(problem.name)]
problem.peaks[, selector := peakvar(problem.name)]
all.modelSelection[, selector := peakvar(problem.name)]
print(timing.named.construct <- system.time({
  viz.named <- list(
    coverage=ggplot()+
      ggtitle("select problem")+
      geom_segment(aes(
        chromStart/1e3, problem.i,
        xend=chromEnd/1e3, yend=problem.i),
        showSelected="bases.per.problem",
        clickSelects="problem.name",
        data=prob.regions)+
      geom_text(aes(
        chromStart/1e3, problem.i,
        label=sprintf(
          "%d problems mean size %.1f kb",
          problems, mean.bases/1e3)),
        showSelected="bases.per.problem",
        data=PSJ$problem.labels,
        hjust=0)+
      geom_segment(aes(
        problemStart/1e3, problem.i,
        xend=problemEnd/1e3, yend=problem.i),
        showSelected="bases.per.problem",
        clickSelects="problem.name",
        size=5,
        data=PSJ$problems)+
      scale_y_continuous(
        "aligned read coverage",
        breaks=function(limits){
          floor(limits[2])
        })+
      scale_linetype_manual(
        "error type",
        limits=c(
          "correct", 
          "false negative",
          "false positive"),
        values=c(
          correct=0,
          "false negative"=3,
          "false positive"=1))+
      scale_x_continuous(paste(
        "position on chr11",
        "(kilo bases = kb)"))+
      coord_cartesian(xlim=c(118167.406, 118238.833))+
      geom_tallrect(aes(
        xmin=chromStart/1e3, xmax=chromEnd/1e3,
        fill=annotation),
        alpha=0.5,
        color="grey",
        data=PSJ$filled.regions)+
      scale_fill_manual(values=ann.colors)+
      theme_bw()+
      theme_animint(width=1500, height=facet.rows*100)+
      theme(panel.margin=grid::unit(0, "cm"))+
      facet_grid(sample.id ~ ., labeller=function(df){
        df$sample.id <- sub("McGill0", "", sub(" ", "\n", df$sample.id))
        df
      }, scales="free")+
      geom_line(aes(
        base/1e3, count),
        data=PSJ$coverage,
        color="grey50")+
      geom_tallrect(aes(
        xmin=chromStart/1e3,
        xmax=chromEnd/1e3,
        linetype=status),
        showSelected=c("selector"="peaks", "bases.per.problem"),
        data=all.regions,
        fill=NA,
        color="black")+
      geom_segment(aes(
        chromStart/1e3, 0,
        xend=chromEnd/1e3, yend=0),
        clickSelects="problem.name",
        showSelected=c("selector"="peaks", "bases.per.problem"),
        data=sample.peaks, size=7, color="deepskyblue")+
      geom_segment(aes(
        chromStart/1e3, problem.i,
        xend=chromEnd/1e3, yend=problem.i),
        clickSelects="problem.name",
        showSelected=c("selector"="peaks", "bases.per.problem"),
        data=problem.peaks, size=7, color="deepskyblue"),
    resError=ggplot()+
      ggtitle("select problem size")+
      ylab("minimum percent incorrect regions")+
      geom_tallrect(aes(
        xmin=min.bases.per.problem,
        xmax=max.bases.per.problem),
        clickSelects="bases.per.problem",
        alpha=0.5,
        data=res.error)+
      scale_x_log10()+
      geom_line(aes(
        bases.per.problem, errors/regions*100,
        color=chunks, size=chunks),
        data=data.frame(res.error, chunks="this"))+
      geom_line(aes(
        bases.per.problem, errors/regions*100,
        color=chunks, size=chunks),
        data=data.frame(PSJ$error.total.all, chunks="all")),
    modelSelection=ggplot()+
      geom_segment(aes(
        min.log.lambda, peaks,
        xend=max.log.lambda, yend=peaks),
        showSelected=c("problem.name", "bases.per.problem"),
        data=data.frame(all.modelSelection, what="peaks"),
        size=5)+
      geom_text(aes(
        min.log.lambda, peaks,
        label=sprintf(
          "%.1f kb in problem %s",
          (problemEnd-problemStart)/1e3, problem.name)),
        showSelected=c("problem.name", "bases.per.problem"),
        data=data.frame(modelSelection.labels, what="peaks"))+
      geom_segment(aes(
        min.log.lambda, as.integer(errors),
        xend=max.log.lambda, yend=as.integer(errors)),
        showSelected=c("problem.name", "bases.per.problem"),
        data=data.frame(modelSelection.errors, what="errors"),
        size=5)+
      ggtitle("select number of samples with 1 peak")+
      ylab("")+
      geom_tallrect(aes(
        xmin=min.log.lambda, 
        xmax=max.log.lambda), 
        clickSelects=c("selector"="peaks"),
        showSelected=c("problem.name", "bases.per.problem"),
        data=all.modelSelection, alpha=0.5)+
      facet_grid(what ~ ., scales="free"),
    title="Animint compiler with named clickSelects/showSelected",
    first=PSJ$first)
### For every problem there is a selector (called problem.name) for
### the number of peaks in that problem. The animint2dir compiler
### creates a selection variable for every unique value of
### clickSelects/showSelected names (and it uses corresponding values
### to set/update the selected value/geoms).
}))

## utilisateur     système      écoulé 
##       0.142       0.000       0.142

It is clear that it takes much less time to evaluate the R code above which uses the named clickSelects/showSelected. We compile it below.

print(timing.named.compile <- system.time({
  animint2dir(viz.named, "Ch14-named")
}))

## Warning: Using size for a discrete variable is not advised.
## Warning: Using size for a discrete variable is not advised.

## utilisateur     système      écoulé 
##       5.644       0.020       5.748

The animint produced above can be viewed on Ch14-named/index.html. Note that it should appear to be the same as the other data viz above. The timings above show that named clickSelects/showSelected are much faster than for loops, in both the definition and compilation steps.

Disk usage comparison

In this section we compute the disk usage of both methods.

viz.dirs.vec <- c("Ch14-for", "Ch14-named")
viz.dirs.text <- paste(viz.dirs.vec, collapse=" ")
(cmd <- paste("du -ks", viz.dirs.text))

## [1] "du -ks Ch14-for Ch14-named"

kb.dt <- fread(cmd)

## Taking input= as a system command because it contains a space ('du -ks Ch14-for Ch14-named'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.

setnames(kb.dt, c("kilobytes", "path"))
kb.dt

##    kilobytes       path
##        <int>     <char>
## 1:      4308   Ch14-for
## 2:      1572 Ch14-named

The table above shows that the data viz defined using for loops takes about twice as much disk space as the data viz that used named clickSelects/showSelected.

Chapter summary and exercises

The table below summarizes the disk usage and timings presented in this chapter. It is clear that named clickSelects/showSelected are more efficient in both respects, and should be used instead of for loops.

data.frame(
  kilobytes=kb.dt$kilobytes,
  construct.seconds=c(
    timing.for.construct[["elapsed"]],
    timing.named.construct[["elapsed"]]),
  compile.seconds=c(
    timing.for.compile[["elapsed"]],
    timing.named.compile[["elapsed"]]),
  row.names=c("for", "named"))

##       kilobytes construct.seconds compile.seconds
## for        4308            20.028        2068.209
## named      1572             0.142           5.748

Exercises:

Use named clickSelects/showSelected to create a visualization of some data from your domain of expertise.

Next, Chapter 15 explains how to visualize root-finding algorithms.

The animint2 Manual by Toby Dylan Hocking

Chapter contents