Chapter 14, Named clickSelects/showSelected
This chapter explains how to use named clickSelects/showSelected variables for creating data-driven selector names. This feature makes it easier to write animint code, and makes it faster to compile.
Chapter outline:
- We begin by downloading the PSJ data set and computing the data to plot.
- We show one method of defining an animint with many selectors, using for loops. This method is technically correct, but computationally inefficient.
- We then explain the preferred method for defining an animint with many selectors, using named clickSelects/showSelected. This method is more computationally efficient, and easier to code.
Download data set
The example data come from the PeakSegJoint package. The code below downloads the data set.
if(!file.exists("PSJ.RData")){
u <- paste0(
"http://github.com/tdhock/animint-examples",
"/blob/master/data/PSJ.RData?raw=true"
)
library(httr)
request <- GET(u)
stop_for_status(request)
writeBin(content(request), "PSJ.RData")
}
load("PSJ.RData")
Compute data to plot
The section below computes some common data that we will use in two data visualizations below.
res.error <- PSJ$error.total.chunk
ann.colors <- c(
noPeaks="#f6f4bf",
peakStart="#ffafaf",
peakEnd="#ff4c4c",
peaks="#a445ee")
## prob.regions are the black segments that show which regions are
## mapped to which segmentation problems.
library(data.table)
all.regions <- data.table(do.call(rbind, PSJ$regions.by.problem))
prob.regions.names <- c(
"bases.per.problem", "problem.i", "problem.name",
"chromStart", "chromEnd")
prob.regions <- unique(data.frame(all.regions)[, prob.regions.names])
prob.regions$sample.id <- "problems"
all.modelSelection <- data.table(do.call(
rbind, PSJ$modelSelection.by.problem))
modelSelection.errors <- all.modelSelection[!is.na(errors)]
penalty.range <- all.modelSelection[, c(
min(max.log.lambda), max(min.log.lambda))]
penalty.mid <- mean(penalty.range)
coverage.counts <- table(PSJ$coverage$sample.id)
facet.rows <- length(coverage.counts)+1
dvec <- diff(log(res.error$bases.per.problem))
dval <- exp(mean(dvec))
dval2 <- (dval-1)/2 + 1
res.error$min.bases.per.problem <- res.error$bases.per.problem/dval2
res.error$max.bases.per.problem <- res.error$bases.per.problem*dval2
modelSelection.labels <- unique(all.modelSelection[, data.table(
problem.name=problem.name,
bases.per.problem=bases.per.problem,
problemStart=problemStart,
problemEnd=problemEnd,
min.log.lambda=penalty.mid,
peaks=max(peaks)+0.5)])
Define data viz using for loops
The R code below constructs a data viz using for loops.
library(animint2)
print(timing.for.construct <- system.time({
viz.for <- list(
coverage=ggplot()+
geom_segment(aes(
chromStart/1e3, problem.i,
xend=chromEnd/1e3, yend=problem.i),
showSelected="bases.per.problem",
clickSelects="problem.name",
data=prob.regions)+
ggtitle("select problem")+
geom_text(aes(
chromStart/1e3, problem.i,
label=sprintf("%d problems mean size %.1f kb",
problems, mean.bases/1e3)),
showSelected="bases.per.problem",
data=PSJ$problem.labels,
hjust=0)+
geom_segment(aes(
problemStart/1e3, problem.i,
xend=problemEnd/1e3, yend=problem.i),
showSelected="bases.per.problem",
clickSelects="problem.name",
size=5,
data=PSJ$problems)+
scale_y_continuous(
"aligned read coverage",
breaks=function(limits){
floor(limits[2])
})+
scale_linetype_manual(
"error type",
limits=c(
"correct",
"false negative",
"false positive"),
values=c(
correct=0,
"false negative"=3,
"false positive"=1))+
scale_x_continuous(paste(
"position on chr11",
"(kilo bases = kb)"))+
coord_cartesian(xlim=c(118167.406, 118238.833))+
geom_tallrect(aes(
xmin=chromStart/1e3, xmax=chromEnd/1e3,
fill=annotation),
alpha=0.5,
color="grey",
data=PSJ$filled.regions)+
scale_fill_manual(values=ann.colors)+
theme_bw()+
theme_animint(width=1500, height=facet.rows*100)+
theme(panel.margin=grid::unit(0, "cm"))+
facet_grid(sample.id ~ ., labeller=function(df){
df$sample.id <- sub("McGill0", "", sub(" ", "\n", df$sample.id))
df
}, scales="free")+
geom_line(aes(
base/1e3, count),
data=PSJ$coverage,
color="grey50"),
resError=ggplot()+
ggtitle("select problem size")+
ylab("minimum percent incorrect regions")+
geom_tallrect(aes(
xmin=min.bases.per.problem,
xmax=max.bases.per.problem),
clickSelects="bases.per.problem",
alpha=0.5,
data=res.error)+
scale_x_log10()+
geom_line(aes(
bases.per.problem, errors/regions*100,
color=chunks, size=chunks),
data=data.frame(res.error, chunks="this"))+
geom_line(aes(
bases.per.problem, errors/regions*100,
color=chunks, size=chunks),
data=data.frame(PSJ$error.total.all, chunks="all")),
modelSelection=ggplot()+
geom_segment(aes(
min.log.lambda, peaks,
xend=max.log.lambda, yend=peaks),
showSelected=c("bases.per.problem", "problem.name"),
data=data.frame(all.modelSelection, what="peaks"),
size=5)+
geom_text(aes(
min.log.lambda, peaks,
label=sprintf(
"%.1f kb in problem %s",
(problemEnd-problemStart)/1e3, problem.name)),
showSelected=c("problem.name", "bases.per.problem"),
data=data.frame(modelSelection.labels, what="peaks"))+
geom_segment(aes(
min.log.lambda, as.integer(errors),
xend=max.log.lambda, yend=as.integer(errors)),
showSelected=c("bases.per.problem", "problem.name"),
data=data.frame(modelSelection.errors, what="errors"),
size=5)+
ggtitle("select number of samples with 1 peak")+
ylab("")+
facet_grid(what ~ ., scales="free"),
title="Animint compiler with for loops",
first=PSJ$first)
## For every problem there is a selector (called problem.dot) for the
## number of peaks in that problem. So in this for loop we add a few
## layers with aes_string(clickSelects=problem.dot) or
## aes_string(showSelected=problem.dot) to the coverage and
## modelSelection plots.
for(problem.dot in names(PSJ$modelSelection.by.problem)){
regions.dt <- PSJ$regions.by.problem[[problem.dot]]
regions.dt[[problem.dot]] <- regions.dt$peaks
if(!is.null(regions.dt)){
viz.for$coverage <- viz.for$coverage+
geom_tallrect(aes(
xmin=chromStart/1e3,
xmax=chromEnd/1e3,
linetype=status),
showSelected=c(problem.dot, "bases.per.problem"),
data=data.frame(regions.dt),
fill=NA,
color="black")
}
if(problem.dot %in% names(PSJ$peaks.by.problem)){
peaks <- PSJ$peaks.by.problem[[problem.dot]]
peaks[[problem.dot]] <- peaks$peaks
prob.peaks.names <- c(
"bases.per.problem", "problem.i", "problem.name",
"chromStart", "chromEnd", problem.dot)
prob.peaks <- unique(data.frame(peaks)[, prob.peaks.names])
prob.peaks$sample.id <- "problems"
viz.for$coverage <- viz.for$coverage +
geom_segment(aes(
chromStart/1e3, 0,
xend=chromEnd/1e3, yend=0),
clickSelects="problem.name",
showSelected=c(problem.dot, "bases.per.problem"),
data=peaks, size=7, color="deepskyblue")+
geom_segment(aes(
chromStart/1e3, problem.i,
xend=chromEnd/1e3, yend=problem.i),
clickSelects="problem.name",
showSelected=c(problem.dot, "bases.per.problem"),
data=prob.peaks, size=7, color="deepskyblue")
}
modelSelection.dt <- PSJ$modelSelection.by.problem[[problem.dot]]
modelSelection.dt[[problem.dot]] <- modelSelection.dt$peaks
viz.for$modelSelection <- viz.for$modelSelection+
geom_tallrect(aes(
xmin=min.log.lambda,
xmax=max.log.lambda),
clickSelects=problem.dot,
showSelected=c("problem.name", "bases.per.problem"),
data=modelSelection.dt, alpha=0.5)
}
}))
## utilisateur système écoulé
## 19.935 0.056 20.028
Note the timing of the code above. It takes a long time just to evaluate the R code that defines this data viz, since it has so many geoms. Next, we compile the data visualization.
print(timing.for.compile <- system.time({
animint2dir(viz.for, "Ch14-for")
}))
## Warning: Using size for a discrete variable is not advised.
## Warning: Using size for a discrete variable is not advised.
## Warning in checkSingleShowSelectedValue(meta$selectors): showSelected variables
## with only 1 level: chr11.118184422.118184700peaks,
## chr11.118192951.118193582peaks, chr11.118203893.118204314peaks
## utilisateur système écoulé
## 1995.705 11.266 2068.209
Note that the compilation also takes a long time, since there are so many geoms. The data viz can be viewed on Ch14-for/index.html. In the next section we will create the same data viz, but more efficiently.
Define data viz using named clickSelects/showSelected
In this section we use named clickSelects/showSelected to create a more efficient version of the previous data visualization. In general, any data visualization defined using for loops in R code can be made more efficient by instead using this method.
sample.peaks <- data.table(do.call(rbind, PSJ$peaks.by.problem))
prob.peaks.names <- c(
"bases.per.problem", "problem.i", "problem.name", "peaks",
"chromStart", "chromEnd")
problem.peaks <- unique(sample.peaks[, ..prob.peaks.names])
problem.peaks$sample.id <- "problems"
peakvar <- function(position){
paste0(gsub("[-:]", ".", position), "peaks")
}
all.regions[, selector := peakvar(problem.name)]
sample.peaks[, selector := peakvar(problem.name)]
problem.peaks[, selector := peakvar(problem.name)]
all.modelSelection[, selector := peakvar(problem.name)]
print(timing.named.construct <- system.time({
viz.named <- list(
coverage=ggplot()+
ggtitle("select problem")+
geom_segment(aes(
chromStart/1e3, problem.i,
xend=chromEnd/1e3, yend=problem.i),
showSelected="bases.per.problem",
clickSelects="problem.name",
data=prob.regions)+
geom_text(aes(
chromStart/1e3, problem.i,
label=sprintf(
"%d problems mean size %.1f kb",
problems, mean.bases/1e3)),
showSelected="bases.per.problem",
data=PSJ$problem.labels,
hjust=0)+
geom_segment(aes(
problemStart/1e3, problem.i,
xend=problemEnd/1e3, yend=problem.i),
showSelected="bases.per.problem",
clickSelects="problem.name",
size=5,
data=PSJ$problems)+
scale_y_continuous(
"aligned read coverage",
breaks=function(limits){
floor(limits[2])
})+
scale_linetype_manual(
"error type",
limits=c(
"correct",
"false negative",
"false positive"),
values=c(
correct=0,
"false negative"=3,
"false positive"=1))+
scale_x_continuous(paste(
"position on chr11",
"(kilo bases = kb)"))+
coord_cartesian(xlim=c(118167.406, 118238.833))+
geom_tallrect(aes(
xmin=chromStart/1e3, xmax=chromEnd/1e3,
fill=annotation),
alpha=0.5,
color="grey",
data=PSJ$filled.regions)+
scale_fill_manual(values=ann.colors)+
theme_bw()+
theme_animint(width=1500, height=facet.rows*100)+
theme(panel.margin=grid::unit(0, "cm"))+
facet_grid(sample.id ~ ., labeller=function(df){
df$sample.id <- sub("McGill0", "", sub(" ", "\n", df$sample.id))
df
}, scales="free")+
geom_line(aes(
base/1e3, count),
data=PSJ$coverage,
color="grey50")+
geom_tallrect(aes(
xmin=chromStart/1e3,
xmax=chromEnd/1e3,
linetype=status),
showSelected=c("selector"="peaks", "bases.per.problem"),
data=all.regions,
fill=NA,
color="black")+
geom_segment(aes(
chromStart/1e3, 0,
xend=chromEnd/1e3, yend=0),
clickSelects="problem.name",
showSelected=c("selector"="peaks", "bases.per.problem"),
data=sample.peaks, size=7, color="deepskyblue")+
geom_segment(aes(
chromStart/1e3, problem.i,
xend=chromEnd/1e3, yend=problem.i),
clickSelects="problem.name",
showSelected=c("selector"="peaks", "bases.per.problem"),
data=problem.peaks, size=7, color="deepskyblue"),
resError=ggplot()+
ggtitle("select problem size")+
ylab("minimum percent incorrect regions")+
geom_tallrect(aes(
xmin=min.bases.per.problem,
xmax=max.bases.per.problem),
clickSelects="bases.per.problem",
alpha=0.5,
data=res.error)+
scale_x_log10()+
geom_line(aes(
bases.per.problem, errors/regions*100,
color=chunks, size=chunks),
data=data.frame(res.error, chunks="this"))+
geom_line(aes(
bases.per.problem, errors/regions*100,
color=chunks, size=chunks),
data=data.frame(PSJ$error.total.all, chunks="all")),
modelSelection=ggplot()+
geom_segment(aes(
min.log.lambda, peaks,
xend=max.log.lambda, yend=peaks),
showSelected=c("problem.name", "bases.per.problem"),
data=data.frame(all.modelSelection, what="peaks"),
size=5)+
geom_text(aes(
min.log.lambda, peaks,
label=sprintf(
"%.1f kb in problem %s",
(problemEnd-problemStart)/1e3, problem.name)),
showSelected=c("problem.name", "bases.per.problem"),
data=data.frame(modelSelection.labels, what="peaks"))+
geom_segment(aes(
min.log.lambda, as.integer(errors),
xend=max.log.lambda, yend=as.integer(errors)),
showSelected=c("problem.name", "bases.per.problem"),
data=data.frame(modelSelection.errors, what="errors"),
size=5)+
ggtitle("select number of samples with 1 peak")+
ylab("")+
geom_tallrect(aes(
xmin=min.log.lambda,
xmax=max.log.lambda),
clickSelects=c("selector"="peaks"),
showSelected=c("problem.name", "bases.per.problem"),
data=all.modelSelection, alpha=0.5)+
facet_grid(what ~ ., scales="free"),
title="Animint compiler with named clickSelects/showSelected",
first=PSJ$first)
### For every problem there is a selector (called problem.name) for
### the number of peaks in that problem. The animint2dir compiler
### creates a selection variable for every unique value of
### clickSelects/showSelected names (and it uses corresponding values
### to set/update the selected value/geoms).
}))
## utilisateur système écoulé
## 0.142 0.000 0.142
It is clear that it takes much less time to evaluate the R code above which uses the named clickSelects/showSelected. We compile it below.
print(timing.named.compile <- system.time({
animint2dir(viz.named, "Ch14-named")
}))
## Warning: Using size for a discrete variable is not advised.
## Warning: Using size for a discrete variable is not advised.
## utilisateur système écoulé
## 5.644 0.020 5.748
The animint produced above can be viewed on Ch14-named/index.html. Note that it should appear to be the same as the other data viz above. The timings above show that named clickSelects/showSelected are much faster than for loops, in both the definition and compilation steps.
Disk usage comparison
In this section we compute the disk usage of both methods.
viz.dirs.vec <- c("Ch14-for", "Ch14-named")
viz.dirs.text <- paste(viz.dirs.vec, collapse=" ")
(cmd <- paste("du -ks", viz.dirs.text))
## [1] "du -ks Ch14-for Ch14-named"
kb.dt <- fread(cmd)
## Taking input= as a system command because it contains a space ('du -ks Ch14-for Ch14-named'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.
setnames(kb.dt, c("kilobytes", "path"))
kb.dt
## kilobytes path
## <int> <char>
## 1: 4308 Ch14-for
## 2: 1572 Ch14-named
The table above shows that the data viz defined using for loops takes about twice as much disk space as the data viz that used named clickSelects/showSelected.
Chapter summary and exercises
The table below summarizes the disk usage and timings presented in this chapter. It is clear that named clickSelects/showSelected are more efficient in both respects, and should be used instead of for loops.
data.frame(
kilobytes=kb.dt$kilobytes,
construct.seconds=c(
timing.for.construct[["elapsed"]],
timing.named.construct[["elapsed"]]),
compile.seconds=c(
timing.for.compile[["elapsed"]],
timing.named.compile[["elapsed"]]),
row.names=c("for", "named"))
## kilobytes construct.seconds compile.seconds
## for 4308 20.028 2068.209
## named 1572 0.142 5.748
Exercises:
- Use named clickSelects/showSelected to create a visualization of some data from your domain of expertise.
Next, Chapter 15 explains how to visualize root-finding algorithms.