#!/usr/bin/perl ### ## Copyright 2020 Tomas Moyano (tcmoyano@uc.cl) , Jose Alvarez and Rodrigo Gutierrez. # # These R scripts are modified from https://slowkow.github.io/CENTIPEDE.tutorial (Pique-Regi, R. et al. Accurate inference of transcription factor binding from DNA sequence and chromatin accessibility data. Genome Res. 21, 447–455 (2011).) # The function search_protectedSite is developved for the book chapter Genomic Footprinting Analyses from DNase-seq data to Construct Gene Regulatory Networks. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. #### scripts for centipede. search_protectedSite<-function(conditions, allfimos,FLANK_size,log10p) { for(COND in conditions) { fimo<-allfimos[grep(allfimos,pattern=COND)] for(FIMO_exp in fimo) { try(search_protectedSite_call(COND,BAMfile = paste(COND,"filter.bam",sep="."),FIMOfile = paste("pwms_all_motifs/",FIMO_exp,sep=""),FLANK_size = FLANK_size,log10p = log10p)) } } } search_protectedSite_call<- function(condition,BAMfile,FIMOfile,FLANK_size,log10p ) { cen <- centipede_data( bam_file = BAMfile, fimo_file = FIMOfile, log10p = log10p, flank_size = FLANK_size ) fit <- fitCentipede( Xlist = list(DNase = cen$mat), Y = as.matrix(data.frame( Intercept = rep(1, nrow(cen$mat)) )) ) pdf(paste(FIMOfile,"pdf",sep=".")) imageCutSitesCombined(cen$mat[fit$PostPr >= 1-1E-16,]) plotProfile(fit$LambdaParList[[1]], Mlen = 13) regionsOUT<-NULL regionsOUT<-cen$regions[fit$PostPr >= 1-1E-16,] regionsOUT$start=regionsOUT$star+FLANK_size regionsOUT$stop=regionsOUT$stop-FLANK_size regionsOUT$condition<-condition meme<-strsplit(FIMOfile,".txt.meme")[[1]][1] meme<-strsplit(meme,"/")[[1]][2] regionsOUT$meme<-meme regionsOUT<-as.matrix(regionsOUT) protected<-regionsOUT[,c(1,2,3,10,11,6,12)] write.table(protected,paste(FIMOfile,"protected.txt",sep="."),sep="\t",row.names=F,quote=F,col.names = F) write.table(cen$regions[fit$PostPr >= 1-1E-16,],paste(FIMOfile,"regiones.txt",sep="."),sep="\t",row.names=F,quote=F) write.table(cen$mat[fit$PostPr >= 1-1E-16,],paste(FIMOfile,"mat.txt",sep="."),sep="\t",col.names=NA,quote=F) dev.off() } centipede_data<- function (bam_file, fimo_file, log10p = 4, flank_size = 100) { sites <- read_fimo(fimo_file) sites$start <- sites$start - flank_size sites$stop <- sites$stop + flank_size sites <- sites[with(sites, order(sequence_name, start, stop)), ] bam_index_file <- sprintf("%s.bai", bam_file) if (!file.exists(bam_index_file)) { message("Indexing the BAM file... this may take several minutes.") indexBam(bam_file, overwrite = FALSE) } bam <- Rsamtools::scanBam(file = bam_file, param = Rsamtools::ScanBamParam(which = GRanges(seqnames = sites$sequence_name, ranges = IRanges(start = sites$start, end = sites$stop)), what = c("strand", "pos"))) if (length(bam) == 0) { stop(sprintf("No reads fall in sites from '%s'\n", fimo_file)) } regions <- lapply(names(bam), parse_region) regions <- data.frame(sequence_name = unlist(sapply(regions,function(x) x["chrom"])), start = as.numeric(unlist(sapply(regions,function(x) x["start"]))), stop = as.numeric(unlist(sapply(regions, function(x) x["end"])))) regions$index <- 1:nrow(regions) regions <- merge(regions, sites) regions <- regions[with(regions, order(sequence_name, start, stop)), ] bam <- bam[regions$index] regions <- regions[, !colnames(regions) %in% c("index")] if (nrow(regions) != length(bam)) { stop(sprintf("ERROR: %d regions and %d in bam. '%s'\n", nrow(regions), length(bam), fimo_file)) } mat <- do.call(rbind, lapply(seq(1, length(bam)), function(i) { region <- parse_region(names(bam)[i]) len <- abs(region$end - region$start) + 1 item <- bam[[i]] idx <- item$pos >= region$start & item$pos <= region$end if (sum(idx) == 0) { return(rep(0, 2 * len)) } strand <- item$strand[idx] position <- item$pos[idx] is.neg <- as.numeric(strand == "-") j <- 1 + position - min(position) + (len * is.neg) as.numeric(table(factor(j, levels = seq(1, 2 * len)))) })) rownames(mat) <- names(bam) list(mat = mat, regions = regions) } ### read_fimo<- function (fimo_file, log10p = 4) { sites <- read.delim(fimo_file) sites <- sites[-log10(sites$p.value) > log10p, , drop = FALSE] sites$region <- paste(sites$sequence_name, sites$start, sites$stop) sites <- do.call(rbind, lapply(unique(sites$region), function(region) { x <- sites[sites$region == region, ] x[which.max(x$score), ] })) sites <- sites[, !colnames(sites) %in% c("region")] if (nrow(sites) == 0) { stop(sprintf("No significant sites for '%s'", fimo_file)) } return(sites) } parse_region<- function (region) { xs <- strsplit(region, ":")[[1]] chrom <- xs[1] xs <- as.numeric(strsplit(xs[2], "-")[[1]]) return(list(chrom = chrom, start = xs[1], end = xs[2])) }