fit.gada <- function
### Run the first fitting steps of the gada algorithm.
(pro
### Profile data.frame.
 ){
  require(gada)
  gen.info <- data.frame(probe=rownames(pro),
                         chr=pro$chromosome,
                         pos=pro$position)
  gada.obj <- setupGADAgeneral(pro$logratio, gen.info=gen.info)
  SBL(gada.obj, estim.sigma2=TRUE)
}

gada.results <- function
### Recover a matrix of smoothed signals from the gada results.
(pro,
### Profile data.frame.
 fit.list
### List of gada results. Each gada result is a list, one element for
### each chromosome.
 ){
  each.chrom(pro, function(chr){
    Y <- chr$logratio
    chrName <- as.character(chr$chromosome)[1]
    ## Need to return mat[nparam, nprobes].
    do.call(rbind, lapply(fit.list, function(fit){
      names(fit) <- as.character(attr(fit, "chr"))
      seg.df <- WextIextToSegments(fit[[chrName]])
      for(i in 1:nrow(seg.df)){
        seg <- seg.df[i, ]
        Y[seg$IniProbe:seg$EndProbe] <- seg$MeanAmp
      }
      Y
    }))
  })
}

run.pelt <- function
### Smooth a profile using the PELT algorithm.
(profile,
### A profile data.frame.
 penalty="SIC",
### Character specifying the penalty to use.
 values=0,
### Vector of penalty parameters to try.
 FUN=cpt.mean
### PELT function to use.
 ){
  require(changepoint)
  each.chrom(profile, function(chr){
    Y <- chr$logratio
    chr.mat <- do.call(rbind, lapply(values, function(value){
      ##cat(sprintf("profile %s chr %s parameter %s\n",
      ##            chr$profile.id[1], chr$chromosome[1], as.character(value)))
      tryCatch({
        result <- FUN(Y, method="PELT", penalty=penalty, pen.value=value)
        ends <- result@cpts
        starts <- c(1, ends[-length(ends)]+1)
        for(i in seq_along(ends)){
          end <- ends[i]
          start <- starts[i]
          Y[start:end] <- result@param.est$mean[i]
        }
        Y
      }, error=function(e){
        rep(NA, length(Y))
      })
    }))
    ## If some penalty values caused an error, re-use the smoothed
    ## vector from the last penalty value that succeeded.
    to.replace <- is.na(chr.mat[, 1])
    if(any(to.replace)){
      last.ok <- max(which(!to.replace))
      chr.mat[to.replace, ] <- matrix(
        chr.mat[last.ok, ], sum(to.replace), ncol(chr.mat), byrow=TRUE)
    }
    chr.mat
  })
### Matrix of smoothed profiles: nparam x nprobes.
}

dnacopy.smoothvec <- function
### Smooth a profile using DNAcopy.
(profile,
### A profile data.frame.
 var,
### Smoothness variable.
 vals,
### Smoothness values.
 ...
### Other arguments, passed to segment.
 ){
  require(DNAcopy)
  do.call(rbind, lapply(vals, function(v){
    set.seed(1)
    cna <- with(profile, CNA(logratio, chromosome, position))
    smoothed.cna <- smooth.CNA(cna)
    ## Note: this default segmentation result is not used below; only
    ## the parameterized segment() call determines the output.
    segmented.cna <- segment(smoothed.cna)
    args <- list(smoothed.cna, verbose=0, v, ...)
    names(args)[3] <- var
    ##print(args)
    result <- do.call(segment, args)
    profile$smooth <- NA
    for(i in 1:nrow(result$output)){
      ri <- result$output[i, ]
      probes.on.segment <- with(ri, {
        profile$chromosome==chrom &
          loc.start <= profile$position &
          loc.end >= profile$position
      })
      profile$smooth[probes.on.segment] <- ri$seg.mean
    }
    profile$smooth
  }))
### Matrix of smoothed profiles: nparam x nprobes.
}

each.chrom <- function
### Apply a smoothing function independently to each chromosome of a
### profile.
(profile,
### Profile data.frame.
 FUN
### Function that will take a profile data.frame for one chromosome
### and return a smoothing matrix for that chromosome: nparam x
### nprobes.
 ){
  chroms <- split(profile, profile$chromosome, drop=TRUE)
  mats <- lapply(chroms, function(pro){
    if(any(diff(pro$position) <= 0))
      stop("profile positions must be in strictly increasing order")
    result <- FUN(pro)
    stopifnot(is.matrix(result))
    result
  })
  smooth.mat <- matrix(NA, nrow(mats[[1]]), nrow(profile))
  for(chr in names(mats)){
    smooth.mat[, profile$chromosome==chr] <- mats[[chr]]
  }
  smooth.mat
### Matrix of smoothed profiles for the entire profile: nparam x
### nprobes.
}
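
## For illustration only: a minimal smoother built on each.chrom, which
## returns a 1 x nprobes matrix holding the per-chromosome mean. This
## hypothetical constant.smoother is not part of the benchmark; it just
## shows the smallest FUN satisfying the nparam x nprobes convention
## that each.chrom checks for.
constant.smoother <- function(pro, param=1){
  each.chrom(pro, function(chr){
    ## One parameter (row), one column per probe on this chromosome.
    matrix(mean(chr$logratio), 1, nrow(chr))
  })
}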
run.cghseg <- function
### Run cghseg maximum likelihood DP segmentation and pick the model
### using picker.
(profile,
### Profile data.frame.
 picker
### Function that gets arguments const.lines (a data.frame with one
### line for each segment in the maximum likelihood segmentation for a
### chrom), smoothed.mat (matrix of smoothed profiles for a chrom:
### kmax x nprobes), Y (vector of logratio measurements for a chrom),
### kmax (maximum number of segments to consider), n (number of
### probes), l (length of the chrom in base pairs), and should return
### the chosen smoothing matrix for the chrom.
 ){
  require(cghseg)
  each.chrom(profile, function(chr){
    Y <- chr$logratio
    pos.range <- range(chr$position)
    n <- length(Y)
    ##if(chr$chromosome[1]=="4")browser()
    kmax <- min(20, n) # k is the number of SEGMENTS, not BREAKPOINTS.
    cat(sprintf("chrom=%s kmax=%d\n", chr$chromosome[1], kmax))
    result <- cghseg:::segmeanCO(Y, kmax)
    const.lines <- do.call(rbind, lapply(1:kmax, function(k){
      th <- result$t.est[k, 1:k]
      rupt <- matrix(ncol=2, c(c(1, th[seq_len(k-1)]+1), th))
      d <- data.frame(k,
                      begin=rupt[, 1],
                      end=rupt[, 2],
                      mean=apply(rupt, 1, function(z)mean(Y[z[1]:z[2]])))
      transform(d,
                position.begin=chr$position[begin],
                position.end=chr$position[end])
    }))
    ## matrix( smoothing levels x positions )
    eachk <- split(const.lines, const.lines$k)
    smoothed.mat <- do.call(rbind, lapply(eachk, function(d){
      for(i in 1:nrow(d))Y[d[i, "begin"]:d[i, "end"]] <- d[i, "mean"]
      Y
    }))
    best <- picker(smoothed.mat=smoothed.mat,
                   const.lines=const.lines,
                   Y=Y, n=n, kmax=kmax,
                   l=pos.range[2]-pos.range[1])
    ## PROBLEM: what if chromosomes are discontiguous? We would need to
    ## take the profile data.frame apart more carefully.
    best
  })
### Smoothing matrix nparam x nprobes.
}

### Iterative isotonic regression.
iir.basic <- function(x, y, K){
  n <- length(x)
  res <- matrix(rep(0, n*K), ncol=K)
  u <- rep(0, n)
  b <- rep(0, n)
  for(k in 1:K){
    res.pava <- pava(y-b, long.out=TRUE)
    u <- res.pava$y
    res.pava <- pava(y-u, long.out=TRUE, decreasing=TRUE)
    b <- res.pava$y
    res[, k] <- u+b
  }
  res
}

## run.iir <- function(pro, ...){
## ### from
## ### http://www.sites.univ-rennes2.fr/laboratoire-statistique/JEGOU/iir_1.0.tar.gz
##   require(iir)
##   each.chrom(pro, function(chr){
##     if(chr$chromosome=="Y")return(t(chr$logratio))
##     res <- with(chr, iir::iir(position, logratio, ...))
##     t(res$fitted)
##   })
## }

run.wbs <- function(pro, ...){
  require(wbs)
  param.list <- list(...)
  each.chrom(pro, function(chr){
    Y <- chr$logratio
    if(length(Y) < 2){
      mean.mat <- matrix(1, 1, length(Y))
    }else{
      set.seed(1)
      w <- wbs(Y)
      arg.list <- c(list(w), param.list)
      result.list <- do.call(changepoints, arg.list)
      list.or.na <- result.list$cpt.th
      change.list <- if(is.list(list.or.na)){
        list.or.na
      }else{
        list(integer())
      }
      mean.mat <- matrix(1, length(change.list), length(Y))
      for(param.i in seq_along(change.list)){
        changes <- sort(change.list[[param.i]])
        ends <- c(changes, length(Y))
        starts <- c(1, changes+1)
        for(seg.i in seq_along(starts)){
          start <- starts[[seg.i]]
          end <- ends[[seg.i]]
          mean.mat[param.i, start:end] <- mean(Y[start:end])
        }
      }
    }
    mean.mat
  })
}
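
## A minimal sketch of the picker interface that run.cghseg expects,
## for illustration only: this hypothetical pick.fixed.k ignores the
## data and returns the maximum likelihood segmentations for a fixed
## grid of segment counts, one row per count, clipped to kmax.
pick.fixed.k <- function(kmax, smoothed.mat, ...){
  k.grid <- c(1, 2, 5, 10)
  smoothed.mat[pmin(k.grid, kmax), , drop=FALSE]
}
## Usage sketch, assuming one.profile is a profile data.frame:
## fixed.k.mat <- run.cghseg(one.profile, pick.fixed.k)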
### Penalized model selection as used by the cghseg.k procedure: for
### each penalty constant lambda, pick the number of segments k that
### minimizes RSS_k + lambda * n * k among the rows of seg.mean.mat.
penalized.mean.mat <- function(Y, seg.mean.mat, lambda.vec){
  stopifnot(is.numeric(Y))
  stopifnot(is.numeric(seg.mean.mat))
  stopifnot(is.matrix(seg.mean.mat))
  stopifnot(ncol(seg.mean.mat) == length(Y))
  stopifnot(is.numeric(lambda.vec))
  max.segments <- nrow(seg.mean.mat)
  rss.vec <- rep(NA, max.segments)
  for(n.segments in 1:max.segments){
    residual.vec <- Y - seg.mean.mat[n.segments, ]
    rss.vec[n.segments] <- sum(residual.vec * residual.vec)
  }
  pen.mean.mat <- matrix(NA, length(lambda.vec), length(Y))
  for(lambda.i in seq_along(lambda.vec)){
    lambda <- lambda.vec[[lambda.i]]
    beta <- lambda * length(Y)
    pen.vec <- beta * (1:max.segments)
    selected.segments <- which.min(rss.vec + pen.vec)
    pen.mean.mat[lambda.i, ] <- seg.mean.mat[selected.segments, ]
  }
  pen.mean.mat
}

smuce.q.vec <- unique(c(10^seq(-3, -1, l=10),
                        seq(0.2, 30, by=0.2),
                        seq(40, 200, by=10)))

run.smuce <- function(pro, param.vec, param.name){
  require(stepR)
  each.chrom(pro, function(chr){
    Y <- chr$logratio
    mean.mat <- matrix(NA, length(param.vec), length(Y))
    for(param.i in seq_along(param.vec)){
      param.val <- param.vec[[param.i]]
      fit <- if(is.na(param.val)){
        smuceR(Y) # for the default version.
      }else{
        param.list <- list(Y, param.val)
        names(param.list) <- c("y", param.name)
        do.call(smuceR, param.list)
      }
      seg.df <- with(fit, data.frame(leftEnd, rightEnd, value))
      for(segment.i in 1:nrow(seg.df)){
        seg <- seg.df[segment.i, ]
        mean.mat[param.i, seg$leftEnd:seg$rightEnd] <- seg$value
      }
    }
    mean.mat
  })
}
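
## A small worked demo of penalized.mean.mat on hypothetical data (this
## helper is illustrative only and is not called by the benchmark). For
## a 6-point signal with one obvious change, a small lambda selects the
## 2-segment model and a large lambda selects the 1-segment model.
demo.penalized.mean.mat <- function(){
  y <- c(0, 0, 0, 1, 1, 1)
  seg.means <- rbind(rep(mean(y), length(y)), # 1 segment: RSS = 1.5
                     c(0, 0, 0, 1, 1, 1))     # 2 segments: RSS = 0
  ## Row 1 of the result is the 2-segment model (lambda=0.001), row 2
  ## the constant model (lambda=10).
  penalized.mean.mat(y, seg.means, lambda.vec=c(0.001, 10))
}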
### This is a list of functions, each of which must return a matrix of
### smoothed profiles. The first argument of each function is a
### data.frame that represents a copy number profile, with at least
### these columns: position, logratio, chromosome. We assume that the
### positions are already sorted in ascending order, p_1 < p_2 < ...
### The second argument of each function should be a vector of
### smoothing parameters, and it must have a default value. The matrix
### returned has 1 row for each parameter, and 1 column for each
### position.
smoothers <- list(stat.ergo.alpha=function(profile, lambda){
  ## Unimplemented stub: returns NULL.
},flsa=function(profile,
                l2vals=c(1e-5, 1e-4, 1e-3, 5e-3, 1e-2, 2e-2, 5e-2,
                         10^seq(-1, 2, l=100),
                         2e2, 5e2, 1e3, 1e4, 1e5,
                         1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12)){
  require(flsa)
  each.chrom(profile, function(d){
    model <- flsa(d$logratio)
    flsaGetSolution(model, lambda2=l2vals)
  })
},iir.aicc=function(pro, x=1){
  ## NOTE: run.iir is commented out above, so the iir.* smoothers
  ## require the iir package and that definition to be restored.
  run.iir(pro, criterion="aicc")
},iir.aic=function(pro, x=1){
  run.iir(pro, criterion="aic")
},iir.bic=function(pro, x=1){
  run.iir(pro, criterion="bic")
},iir.gcv=function(pro, x=1){
  run.iir(pro, criterion="gcv")
},pelt.default=function(pro, values=0){
  run.pelt(pro)
},pelt.meanvar.n=function(pro, values=sprintf("n*%.10f", 10^seq(-2, 1, l=100))){
  run.pelt(pro, "Manual", values, cpt.meanvar)
},pelt.meanvar=function(pro, values=10^seq(0, 3, l=100)){
  run.pelt(pro, "Manual", values, cpt.meanvar)
},pelt.n=function(pro, values=sprintf("n*%.10f", 10^seq(-8, 1, l=100))){
  run.pelt(pro, "Manual", values)
},fpop=function(pro, lambda=10^seq(-8, 1, l=100)){
  require(fpop)
  each.chrom(pro, function(chr){
    Y <- chr$logratio
    beta <- length(Y) * lambda
    chr.mat <- matrix(NA, length(beta), length(Y))
    for(beta.i in seq_along(beta)){
      res <- Fpop(chr$logratio, beta[[beta.i]])
      ends <- res$t.est
      starts <- c(1, 1+res$t.est[-length(res$t.est)])
      for(seg.i in seq_along(starts)){
        start <- starts[[seg.i]]
        end <- ends[[seg.i]]
        chr.mat[beta.i, start:end] <- mean(Y[start:end])
      }
    }
    chr.mat
  })
},multiBinSeg=function(pro, lambda=10^seq(-8, 1, l=100)){
  require(fpop)
  each.chrom(pro, function(chr){
    Y <- chr$logratio
    n <- length(Y)
    lambda.mat <- matrix(NA, length(lambda), n)
    if(n > 1){
      max.breaks <- min(20, n-1) # k is the number of BREAKPOINTS, not segments.
      result <- multiBinSeg(Y, max.breaks)
      model.mat <- matrix(NA, n, max.breaks+1)
      ## The first column is the constant model.
      model.mat[, 1] <- mean(Y)
      breaks <- result$t.est
      for(model.i in seq_along(breaks)){
        changes <- sort(breaks[1:model.i])
        ends <- c(changes, length(Y))
        starts <- c(1, changes+1)
        for(seg.i in seq_along(starts)){
          start <- starts[[seg.i]]
          end <- ends[[seg.i]]
          model.mat[start:end, model.i+1] <- mean(Y[start:end])
        }
      }
      ##apply(model.mat, 2, function(yhat)sum(diff(yhat)!=0))
      residual.mat <- Y - model.mat
      mean.sq.residual <- colMeans(residual.mat * residual.mat)
      n.segments <- seq_along(mean.sq.residual)
      for(lambda.i in seq_along(lambda)){
        l <- lambda[[lambda.i]]
        crit <- n.segments * l + mean.sq.residual
        opt.segments <- which.min(crit)
        lambda.mat[lambda.i, ] <- model.mat[, opt.segments]
      }
    }
    lambda.mat
  })
},smuce.default=function(pro, alpha.vals=NA){
  run.smuce(pro, NA)
},smuce.alpha=function(pro, alpha.vals=unique(c(10^seq(-10, -1, l=60),
                                                seq(0.1, 0.9, by=0.05)))){
  run.smuce(pro, alpha.vals, "alpha")
},smuce.q=function(pro, q.vals=smuce.q.vec){
  run.smuce(pro, q.vals, "q")
},smuce.penalized=function(pro, lambda.vec=10^seq(-8, 1, l=100)){
  require(stepR)
  each.chrom(pro, function(chr){
    Y <- chr$logratio
    q.mean.mat <- matrix(NA, length(smuce.q.vec), length(Y))
    q.changes.vec <- rep(NA, length(smuce.q.vec))
    for(param.i in seq_along(smuce.q.vec)){
      param.val <- smuce.q.vec[[param.i]]
      fit <- smuceR(Y, q=param.val)
      seg.df <- with(fit, data.frame(leftEnd, rightEnd, value))
      for(segment.i in 1:nrow(seg.df)){
        seg <- seg.df[segment.i, ]
        q.mean.mat[param.i, seg$leftEnd:seg$rightEnd] <- seg$value
      }
      q.changes.vec[[param.i]] <- sum(diff(q.mean.mat[param.i, ]) != 0)
    }
    q.segments.vec <- q.changes.vec + 1
    max.segments <- 20
    seg.mean.mat <- matrix(mean(Y), max.segments, length(Y))
    for(n.segments in 2:max.segments){
      q.row <- which(q.segments.vec == n.segments)[1]
      ## If no q in smuce.q.vec yields exactly n.segments segments,
      ## q.row is NA and this row of seg.mean.mat becomes all NA, so
      ## that model is never selected by penalized.mean.mat.
      seg.mean.mat[n.segments, ] <- q.mean.mat[q.row, ]
    }
    penalized.mean.mat(Y, seg.mean.mat, lambda.vec)
  })
},wbs.penalized=function(pro, lambda.vec=10^seq(-8, 1, l=100)){
  library(wbs)
  each.chrom(pro, function(chr){
    Y <- chr$logratio
    w <- wbs(Y)
    max.segments <- 20
    max.changes <- max.segments-1
    seg.mean.mat <- matrix(NA, max.segments, length(Y))
    seg.mean.mat[1, ] <- mean(Y)
    ## Start at 1 change so that the 2-segment model (row 2) is also
    ## filled in.
    for(n.changes in 1:max.changes){
      n.segments <- n.changes + 1
      fit <- wbs:::changepoints.sbs(w, Kmax=n.changes)
      ## Confusingly, the changepoints are output in the following
      ## formats:
      break.vec <- if(is.list(fit$cpt.th)){ # more than one change:
        ## $cpt.th[[1]]
        ## [1] 3 1 2 4
        sort(fit$cpt.th[[1]])
      }else if(is.na(fit$cpt.th)){ # no changes:
        ## $cpt.th
        ## [1] NA
        integer()
      }else{ # one change:
        ## $cpt.th
        ## [1] 2
        fit$cpt.th
      }
      end.vec <- c(break.vec, length(Y))
      start.vec <- c(1, break.vec+1)
      for(segment.i in seq_along(start.vec)){
        start <- start.vec[[segment.i]]
        end <- end.vec[[segment.i]]
        seg.data <- Y[start:end]
        seg.mean <- mean(seg.data)
        seg.mean.mat[n.segments, start:end] <- seg.mean
      }
    }
    penalized.mean.mat(Y, seg.mean.mat, lambda.vec)
  })
},wbs.default=function(pro, param=NA){
  run.wbs(pro)
},wbs.th.const=function(pro, th.const=seq(0, 10, by=0.05)){
  run.wbs(pro, th.const=th.const)
},wbs.th=function(pro, th=c(0, 10^seq(-1, 2, l=200))){
  run.wbs(pro, th=th)
},pelt.diffparam=function(pro,
                          values=sprintf("diffparam*%.10f", 10^seq(-2, 1, l=100))){
  run.pelt(pro, "Manual", values)
},pelt.manual=function(pro, values=10^seq(-2, 1, l=100)){
  run.pelt(pro, "Manual", values)
},pelt.asymptotic=function(pro,
### For some profiles, for values <= 8e-2, we see
### Error in PELT.mean.norm(data, value) :
###   NA/NaN/Inf in foreign function call (arg 4)
### In addition: Warning message:
### In log(log((1 - alpha + exp(-2 * (pi^(1/2)) * exp(blogn/alogn)))^(-1/(2 * :
###   NaNs produced
### so let's check for this, and use the closest value if this happens.
                           values=c(1-10^seq(-20, -1, l=80),
                                    8*(10^seq(-1, -3, l=20)))){
  run.pelt(pro, "Asymptotic", values)
},flsa.norm=function(profile,
                     alpha2.vals=c(1e-5, 1e-4, 1e-3, 5e-3, 1e-2, 2e-2, 5e-2,
                                   10^seq(-1, 2, l=100),
                                   2e2, 5e2, 1e3, 1e4, 1e5,
                                   1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12)){
  require(flsa)
  each.chrom(profile, function(d){
    model <- flsa(d$logratio)
    l <- max(d$position)-min(d$position)
    flsaGetSolution(model, lambda2=nrow(d)*1e6*alpha2.vals/l)
  })
},cghseg.mBIC=function(profile, param=1){
  require(cghseg)
  each.chrom(profile, function(chr){
    Y <- chr$logratio
    if(length(Y) < 10)return(rbind(rep(mean(Y), length(Y))))
    cghdata <- new("CGHdata", Y)
    cghoptions <- new("CGHoptions")
    result <- uniseg(cghdata, cghoptions)
    sm <- Y
    segs <- result@mu$Y
    for(i in 1:nrow(segs)){
      sm[segs[i, "begin"]:segs[i, "end"]] <- segs[i, "mean"]
    }
    rbind(sm)
  })
},cghseg.lambda=function(profile, lambda=10^c(seq(-8, -3, l=10),
                                              seq(-2, 3, l=100),
                                              seq(4, 8, l=10))){
  ## The point of this function is to try to recover the best
  ## segmentation according to some smart weighting of model complexity
  ## and signal reconstruction error. First, I wrote the following to
  ## Guillem:
  ##
  ## Hi Guillem. I experimented a bit with the cghseg package this
  ## morning, and I would like some additional advice, please!
  ## I found the cghseg:::segmeanCO function which returns the t.est
  ## matrix of indices of the optimal segments for each k. This is
  ## useful, but I need a criterion besides k for selecting the level
  ## of smoothness.
  ## To provide some context, I have annotations about where the
  ## breakpoints really are, and I would like to find a smoothing
  ## function that agrees with those annotations.
  ## For some profiles there are no breakpoints, and for some profiles
  ## there are many breakpoints. Thus fixing a k and applying that to
  ## each profile will certainly not agree with all profiles.
  ## In DNAcopy there is a parameter "undo.SD" which merges adjacent
  ## segments if the difference of their means is less than the value
  ## you specify. By varying this parameter I can find a sweet spot
  ## that minimizes false positive and false negative rates with
  ## respect to my annotations. I suppose I could do something similar
  ## with the output of segmeanCO(), but do you have any code already
  ## written that does this?
  ## In GLAD there is a parameter "lambdabreak" which essentially picks
  ## the number of breakpoints for a smoothing x by minimizing
  ##   error(x) + lambdabreak * complexity(x)
  ## This is nice because I can again try various levels of lambdabreak
  ## and just pick the one that gives me a function that maximizes the
  ## agreement with my annotation data.
  ## I looked at your definition of BIC in the Kselection() function,
  ## and I was thinking of adapting it for this purpose, but I was
  ## unsure about which terms correspond to error and which correspond
  ## to model complexity. Can you clarify?
  ##   if (SSbg == 0) {
  ##     BIC[k] = gamma.coef + 0.5 * k * log(SSall) - 0.5 *
  ##       sum(log(nk)) + (0.5 - k) * log(n)
  ##   }
  ##   else {
  ##     BIC[k] = 0.5 * (n - k + 1) * log(1 + (SSbg)/(SSall - SSbg)) +
  ##       gamma.coef + 0.5 * k * log(SSall) -
  ##       0.5 * sum(log(nk)) + (0.5 - k) * log(n)
  ##   }
  ##
  ## Then, he wrote back:
  ##
  ## I guess the simplest approach is to recover all best segmentations
  ## in 1 to K segments (with segmeanCO), then compute the agreement
  ## with your annotation data for all k, and finally pick the k that
  ## maximizes the agreement with your annotation data. That would be
  ## very similar to your lambda-break strategy with GLAD (k playing
  ## the role of the complexity and the likelihood or L2 loss playing
  ## the role of the error).
  ## I think it might be more difficult to play with the mBIC criteria,
  ## since the mBIC penalization takes into account many different
  ## types of complexity (number of parameters, length of segments,
  ## number of possible segmentations). In the computation of the BIC,
  ## (SSall - SSbg) is the L2 loss and can be interpreted as the error.
  ## If you look at the case SSbg > 0, it could be rewritten as:
  ##   BIC[k] = 0.5 * (n - k + 1) * ( log(SSall) - log(SSall - SSbg) )
  ##          + 0.5 * k * log(SSall)
  ##          + gamma.coef
  ##          - 0.5 * sum(log(nk)) + (0.5 - k) * log(n)
  ##        = - 0.5 * (n - k + 1) * log(SSall - SSbg)
  ##          + 0.5 * (n + 1) * log(SSall)
  ##          + gamma.coef
  ##          - 0.5 * sum(log(nk)) + (0.5 - k) * log(n)
  ## where
  ## a) gamma.coef penalizes the number of possible segmentations
  ##    (n-1 \choose k-1);
  ## b) 0.5 * sum(log(nk)) is the entropy of the segment lengths
  ##    (segmentations with very small segments get a higher penalty);
  ## c) (0.5 - k) * log(n) penalizes the number of parameters to
  ##    estimate;
  ## d) finally, the likelihood is multiplied by 0.5 * (n - k + 1).
  ## Due to this last term it is not easy to get a nice additive
  ## formula of the form error(x) + complexity(x).
  ## (The case SSbg == 0 corresponds to
  ## log(SSall) - log(SSall - SSbg) == 0.)
  ## Hopefully this will answer your questions. Let me know if that is
  ## not the case or if you have additional questions.
  ## Cheers, Guillem
  run.cghseg(profile, function(kmax, Y, const.lines, n, smoothed.mat, ...){
    lterms <- rep(NA, kmax)
    rterms <- rep(NA, kmax)
    ybar <- mean(Y, na.rm=TRUE)
    SSall <- sum((Y-ybar)^2, na.rm=TRUE)
    for(k in 1:kmax){
      gamma.coef <- lgamma(0.5*(n-k-1)) - lgamma(0.5*(n+1))
      mu <- const.lines[const.lines$k==k, ]
      nk <- with(mu, end-begin+1) ### !!! ok
      SSbg <- sum(nk*(mu$mean-ybar)^2, na.rm=TRUE)
      rterms[k] <- gamma.coef + 0.5*k*log(SSall) -
        0.5*sum(log(nk)) + (0.5-k)*log(n)
      lterms[k] <- if(SSbg == 0){
        0
      }else{
        0.5*(n-k+1) * log(1 + SSbg/(SSall-SSbg))
      }
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(lterms + l*rterms), ]
    }))
  })
},cghseg.k=function(profile, lambda=10^seq(-8, 1, l=100)){
### From Lavielle 2005 p.8, where he suggests to use average squared
### residual + beta*(number of segments).
  run.cghseg(profile, function(kmax, Y, smoothed.mat, ...){
    average.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    for(k in 1:kmax){
      average.residual[k] <- mean((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(average.residual + l*penalty), ]
    }))
  })
},cghseg.k.sqrt=function(profile, lambda=10^seq(-1, 5, l=100)){
### Simulations indicate that a factor of sqrt(d) is needed to correct
### for the number of points sampled, and 1/sqrt(l) is needed to
### correct for signal size in base pairs.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, l, ...){
    total.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    d <- length(Y)
    for(k in 1:kmax){
      total.residual[k] <- sum((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k*sqrt(d/l)
    }
    ## NB: l below (the penalty constant) shadows the chrom length in
    ## base pairs l used above.
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(total.residual + l*penalty), ]
    }))
  })
},cghseg.k.var=function(profile, lambda=10^seq(-3, 1, l=100)){
### Simulations indicate that a factor of s.hat^2 is necessary to
### correct for differences in scale of the samples.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, ...){
    total.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    d <- length(Y)
    s.hat <- median(abs(diff(Y)))
    for(k in 1:kmax){
      total.residual[k] <- sum((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k*d*(s.hat^2)
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(total.residual + l*penalty), ]
    }))
  })
},cghseg.k.sqrt.var=function(profile, lambda=10^seq(2, 7, l=100)){
### Try using all 3 corrections shown to be beneficial in simulations:
### density d, length in base pairs l, and scale s.hat.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, l, ...){
    total.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    d <- length(Y)
    s.hat <- median(abs(diff(Y)))
    for(k in 1:kmax){
      total.residual[k] <- sum((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k*sqrt(d/l)*(s.hat^2)
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(total.residual + l*penalty), ]
    }))
  })
},cghseg.k.sqrt.d=function(profile, lambda=10^seq(-3, 0, l=100)){
### Use sqrt(d) instead of d in cghseg.k.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, ...){
    total.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    d <- length(Y)
    for(k in 1:kmax){
      total.residual[k] <- sum((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k*sqrt(d)
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(total.residual + l*penalty), ]
    }))
  })
},cghseg.k.sqrt.d.var=function(profile, lambda=10^seq(-1, 3, l=100)){
### Use the sqrt(d) correction with the scale correction.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, ...){
    total.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    d <- length(Y)
    s.hat <- median(abs(diff(Y)))
    for(k in 1:kmax){
      total.residual[k] <- sum((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k*sqrt(d)*(s.hat^2)
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(total.residual + l*penalty), ]
    }))
  })
},cghseg.k1=function(profile, lambda=10^seq(-8, 1, l=100)){
### From Lavielle 2005 p.8, where he suggests to use average squared
### residual + beta*(number of segments); here the penalty is the
### number of breakpoints, k - 1.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, ...){
    average.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    for(k in 1:kmax){
      average.residual[k] <- mean((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k - 1
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(average.residual + l*penalty), ]
    }))
  })
},cghseg.k.less=function(profile, lambda=10^seq(-4, -1, l=10)){
### Same as cghseg.k (Lavielle 2005 p.8), but with fewer lambda values.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, ...){
    average.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    for(k in 1:kmax){
      average.residual[k] <- mean((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(average.residual + l*penalty), ]
    }))
  })
},cghseg.resid.corr=function(profile, lambda=10^seq(-8, 1, l=100)){
### Use the correlation coefficient of neighboring residuals instead
### of the average squared residual.
  run.cghseg(profile, function(kmax, Y, smoothed.mat, ...){
    resid.corr <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    for(k in 1:kmax){
      residuals <- Y-smoothed.mat[k, ]
      resid.corr[k] <- cor(residuals[-1], residuals[-length(residuals)])
      penalty[k] <- k
    }
    resid.corr[is.na(resid.corr)] <- 0
    ##plot(penalty,resid.corr,type="l",ylim=c(-1,1));browser()
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(resid.corr + l*penalty), ]
    }))
  })
},cghseg.klogn=function(profile, lambda=10^seq(-8, 1, l=100)){
### From Lavielle 2005 p.8, where he says the Schwarz criterion means
### minimizing average squared residual + 2 \sigma^2 (\log n)/n * k.
  run.cghseg(profile, function(kmax, Y, n, smoothed.mat, ...){
    average.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    for(k in 1:kmax){
      average.residual[k] <- mean((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k
    }
    do.call(rbind, lapply(lambda, function(l){
      smoothed.mat[which.min(average.residual + l*penalty*log(n)), ]
    }))
  })
},cghseg.c=function(profile, cvals=10^c(seq(-8, -3, l=10),
                                        seq(-2, 3, l=100),
                                        seq(4, 8, l=10))){
### From Lavielle 2005, equation 14.
  run.cghseg(profile, function(kmax, Y, n, smoothed.mat, ...){
    average.residual <- rep(NA, kmax)
    kseq <- 1:kmax
    for(k in 1:kmax){
      average.residual[k] <- mean((Y-smoothed.mat[k, ])^2)
    }
    s <- median(abs(diff(Y)))
    do.call(rbind, lapply(cvals, function(cval){
      pen <- 1+cval*log(n/kseq)
      smoothed.mat[which.min(average.residual + 2*s^2/n*kseq*pen), ]
    }))
  })
},cghseg.schwarz=function(profile, sigma=10^c(seq(-8, -3, l=10),
                                              seq(-2, 3, l=100),
                                              seq(4, 8, l=10))){
### From Lavielle 2005 p.8, where he says the Schwarz criterion means
### minimizing average squared residual + 2 \sigma^2 (\log n)/n * k;
### here we let the sigma parameter vary.
  run.cghseg(profile, function(kmax, Y, n, smoothed.mat, ...){
    average.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    for(k in 1:kmax){
      average.residual[k] <- mean((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k
    }
    do.call(rbind, lapply(sigma, function(s){
      smoothed.mat[which.min(average.residual + 2*s^2*log(n)/n*penalty), ]
    }))
  })
},cghseg.schwarz.auto=function(profile, sigma=1){
### From Lavielle 2005 p.8, where he says the Schwarz criterion means
### minimizing average squared residual + 2 \sigma^2 (\log n)/n * k.
### Here we estimate sigma using the median of absolute differences.
  run.cghseg(profile, function(kmax, Y, n, smoothed.mat, ...){
    average.residual <- rep(NA, kmax)
    penalty <- rep(NA, kmax)
    for(k in 1:kmax){
      average.residual[k] <- mean((Y-smoothed.mat[k, ])^2)
      penalty[k] <- k
    }
    s <- median(abs(diff(Y)))
    stat <- average.residual + 2*s^2*log(n)/n*penalty
    smoothed.mat[which.min(stat), , drop=FALSE]
  })
},cghseg.min.abs.diff=function(profile, madvals=10^seq(-2, 1, l=40)){
### The idea here is to do something like DNAcopy prune sd=...
  run.cghseg(profile, function(const.lines, smoothed.mat, ...){
    eachk <- split(const.lines, const.lines$k)
    min.abs.diff <- sapply(eachk, function(d){
      if(nrow(d)==1)Inf else min(abs(diff(d$mean)))
    })
    do.call(rbind, lapply(madvals, function(MAD){
      ## This assumes min.abs.diff is nonincreasing in k, so that the
      ## models passing the threshold are the first consecutive ks,
      ## and which.min of the subset selects the largest such k.
      smoothed.mat[which.min(min.abs.diff[min.abs.diff>MAD]), ]
    }))
  })
},cghFLasso=function(profile, param=1){
  require(cghFLasso)
  t(cghFLasso(profile$logratio, profile$chromosome)$Esti.CopyN)
},glad.default=function(profile, param=1){
  runglad(profile, lambdabreak=8)
},glad.haarseg=function(profile, qvals=10^seq(-80, -1, l=30)){
  runglad(profile, breaksFdrQ=qvals, smoothfunc="haarseg")
},glad.lambdabreak=function(profile,
                            lambdavals=c(1e-2, 1, 5, 10, 20, 25, 30, 35, 40,
                                         100, 500, 1e4, 1e8)){
  runglad(profile, lambdabreak=lambdavals)
},glad.MinBkpWeight=function(profile, wvals=2^seq(-12, 3, l=30)){
  runglad(profile, MinBkpWeight=wvals)
},gada=function(pro, Tvals=10^seq(-1, 2, l=100)){
  fit <- fit.gada(pro)
  ## Do all the backward elimination procedures in one go.
  be.list <- lapply(Tvals, function(Tval){
    BackwardElimination(fit, Tval, 1)
  })
  gada.results(pro, be.list)
},gada.default=function(pro, param=0){
  fit <- fit.gada(pro)
  gada.results(pro, list(fit))
},dnacopy.default=function(profile, param=1){
  dnacopy.smoothvec(profile, "undo.SD", 3)
},dnacopy.sd=function(profile,
                      sdvals=c(seq(20, 4, l=20), 2^seq(2.5, -5, l=10)[-1], 0)){
  dnacopy.smoothvec(profile, "undo.SD", sdvals, undo.splits="sdundo")
},dnacopy.prune=function(profile, prunevals=10^seq(-2, 1, l=20)){
  dnacopy.smoothvec(profile, "undo.prune", prunevals, undo.splits="prune")
},dnacopy.alpha=function(profile, alphavals=10^seq(-80, -2, l=20)){
  dnacopy.smoothvec(profile, "alpha", alphavals)
})

runglad <- function
### Run glad to smooth a profile.
(profile,
### Profile data.frame.
 ...
### Smoothing parameters for glad: the first (named) argument is the
### parameter to vary, given as a vector of values to try; any other
### arguments are passed on unchanged.
 ){
  require(GLAD)
  L <- list(...)
  vals <- L[[1]]
  x <- transform(profile,
                 PosOrder=seq_along(position),
                 LogRatio=logratio,
                 PosBase=position,
                 Chromosome=chromosome)
  cgh <- as.profileCGH(x)
  smooth.vecs <- lapply(vals, function(v){
    args <- list(profileCGH=cgh, v)
    names(args)[2] <- names(L)[1]
    args <- c(args, L[-1])
    smooth <- tryCatch({
      args$OnlySmoothing <- TRUE
      do.call(daglad, args)
    }, error=function(e){
      do.call(glad, args)
    })
    smooth$profileValues$Smoothing
  })
  do.call(rbind, smooth.vecs)
### Smoothing matrix nparam x nprobes.
}
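
## A hypothetical sanity check, for illustration only, of the contract
## that every element of the smoothers list must satisfy (it mirrors
## the reality checks in run.smoothers below, and is not called by the
## benchmark): for a profile data.frame pro, fun(pro) must return a
## matrix with one row per default parameter value and one column per
## probe, e.g. check.smoother(smoothers$flsa, pro).
check.smoother <- function(fun, pro){
  ## The second formal argument holds the default parameter vector.
  param.vec <- eval(formals(fun)[[2]])
  mat <- fun(pro)
  stopifnot(is.matrix(mat))
  stopifnot(nrow(mat) == length(param.vec))
  stopifnot(ncol(mat) == nrow(pro))
  mat
}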
several.breakpoints <- function
### Zero-one loss when breakpoint annotations are considered to mean
### presence of 1 or more breakpoints.
(counts,
### Counts of breakpoints of the model.
 anns
### Annotations.
 ){
  ifelse(anns=="normal",
         ifelse(counts==0, 0L, 1L), ## normal: error if any breakpoint.
         ifelse(counts>0, 0L, 1L))  ## breakpoint: error if none.
### Matrix of 0 and 1.
}

run.smoothers <- function
### Run several smoothers on a profile, quantifying the agreement of
### each smoother with the breakpoint annotations.
(profile,
### Profile data.frame.
 breakpoint.labels,
### Annotation data.frame.
 smooth.funs=smoothers,
### List of smoothing functions to apply to the profile.
 loss=several.breakpoints,
### Loss function of (breakpoint counts, labels) --- both matrices
### nparam x nlabels --- which should return a numeric matrix of the
### same size.
 tosave=c("errors", "seconds", "parameters", "breakpoint.labels"),
### Variables to save to the db directory.
 db=file.path(Sys.getenv("HOME"), "smooth")
### Location to save gzipped result files.
 ){
  stopifnot(is.data.frame(profile))
  ## Repeated positions on the same chromosome break most algorithms,
  ## so stop early if any are found.
  counter <- function(x)sapply(x, function(i)sum(i==x))
  for(chrom in split(profile, profile$chromosome)){
    counts <- counter(chrom$position)
    repeats <- chrom[counts>1, ]
    if(nrow(repeats)){
      print(repeats)
      stop("some repeated positions were found! most algorithms will break!")
    }
  }
  stopifnot(is.data.frame(breakpoint.labels))
  breakpoint.labels <- subset(breakpoint.labels, !is.na(annotation))
  if(nrow(breakpoint.labels)==0)stop("need at least 1 annotation")
  stopifnot(is.list(smooth.funs))
  if(is.null(names(smooth.funs)))stop("smooth.funs must be named")
  if(any(nchar(names(smooth.funs))==0))stop("all smooth.funs need names")
  if(!file.exists(db))dir.create(db)
  smoothing.parameters <- lapply(smooth.funs, function(f)eval(formals(f)[[2]]))
  for(algorithm in names(smooth.funs)){
    print(algorithm)
    fun <- smooth.funs[[algorithm]]
    parameters <- smoothing.parameters[[algorithm]]
    seconds <- system.time({
      smoothed.profiles <- fun(profile)
    })["elapsed"]
    ## Reality checks for the result of the smoothing function.
    stopifnot(ncol(smoothed.profiles)==nrow(profile))
    stopifnot(nrow(smoothed.profiles)==length(parameters))
    bpts <- matrix(NA, length(parameters), nrow(breakpoint.labels))
    blabs <- matrix(NA, length(parameters), nrow(breakpoint.labels))
    pos <- profile$position
    for(i in 1:nrow(breakpoint.labels)){
      l <- breakpoint.labels[i, ]
      probes <- l$min < pos & pos < l$max & profile$chromosome==l$chromosome
      smoothed.region <- smoothed.profiles[, probes, drop=FALSE]
      bpts[, i] <- apply(smoothed.region, 1, function(x)sum(diff(x)!=0))
      blabs[, i] <- as.character(l$annotation)
    }
    print(bpts)
    errors <- loss(bpts, blabs)
    for(varname in tosave){
      if(varname %in% ls()){
        var <- get(varname)
        edir <- file.path(db, profile$profile.id[1], algorithm)
        if(!file.exists(edir))dir.create(edir, recursive=TRUE)
        this.base <- sprintf("%s.csv.gz", varname)
        efn <- file.path(edir, this.base)
        conn <- gzfile(efn, "w")
        write.table(var, conn,
                    row.names=FALSE, col.names=FALSE, quote=FALSE, sep=",")
        close(conn)
      }
    }
  }
### Nothing; the results are saved to files under db.
}

### Smoothing functions used in the article.
article.smoothers <-
  smoothers[c("cghseg.k", "flsa.norm", "flsa", "glad.lambdabreak",
              "glad.MinBkpWeight", "glad.haarseg",
              "cghseg.mBIC", "cghFLasso", "glad.default", "dnacopy.default",
              "dnacopy.sd", "dnacopy.alpha", "dnacopy.prune")]
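
## A tiny worked demo of the zero-one loss above, using hypothetical
## breakpoint counts (this helper is illustrative only and is not
## called by the benchmark). Each row is one smoothing parameter, each
## column one annotated region.
demo.several.breakpoints <- function(){
  counts <- rbind(c(0, 2),  # parameter 1: 0 breakpoints, then 2.
                  c(1, 0))  # parameter 2: 1 breakpoint, then 0.
  anns <- rbind(c("normal", "breakpoint"),
                c("normal", "breakpoint"))
  ## Expected result: rbind(c(0L, 0L), c(1L, 1L)) --- parameter 1
  ## agrees with both annotations, parameter 2 with neither.
  several.breakpoints(counts, anns)
}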