# Preparing the genome specific files # Prepare the work directory export WD=$(pwd) # Download the genome, the identifier of each chromosome must not have spaces in the name. wget http://virtualplant.bio.puc.cl/share/DNAse/T10.fa # The ENCODE pipeline for the analysis of DNAse-seq and tother additional tools must be downloaded and unzipped with the following instructions: wget https://github.com/ENCODE-DCC/dnase_pipeline/archive/master.zip -O dnase_pipeline.zip unzip dnase_pipeline.zip wget https://github.com/rthurman/hotspot/archive/master.zip -O hotspot-master.zip unzip hotspot-master.zip #In this and subsequent steps, the programs will be included in the PATH as an environment tool. export PATH=$WD/hotspot-master/hotspot-distr/hotspot-deploy/bin/:$PATH export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-index-bwa/resources/usr/bin/:$PATH # We need to create the file with the mappable regions in the genome. # You need to install bowtie. We recommend install it with conda. conda install bowtie # Index the reference genome for bowtie alignment tool: bowtie-build -f T10.fa T10.bowtie # Once this file is obtained, the mappable regions are generated for a given DNAse-seq read length. In this case, we used 20 bp: $WD/hotspot-master/hotspot-distr/hotspot-deploy/bin/enumerateUniquelyMappableSpace.pl 20 T10.bowtie T10.fa | sort-bed - | bedops -m - > T10.bowtie.read_length20.mappable_only.bed # Align the DNAse reads to the genome export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-index-bwa/resources/usr/bin/:$PATH $WD/dnase_pipeline-master/dnanexus/dnase-index-bwa/resources/usr/bin/dnase_index_bwa.sh T10 T10.fa false T10.bowtie.read_length20.mappable_only.bed # download the DNAse seq file (FASTQ format). wget http://virtualplant.bio.puc.cl/share/DNAse/DH_012_KCl_replicate1.fastq.gz #You can choose the number of processors to use in the alignment process. Here we use 6 processors. export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-align-bwa-se/resources/usr/bin/:$PATH $WD/dnase_pipeline-master/dnanexus/dnase-align-bwa-se/resources/usr/bin/dnase_align_bwa_se.sh T10_bwa_index.tgz DH_012_KCl_replicate1.fastq.gz 6 DH_012_KCl_replicate1 # Filter the alignment files export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-filter-se/resources/usr/bin/:$PATH $WD/dnase_pipeline-master/dnanexus/dnase-filter-se/resources/usr/bin/dnase_filter_se.sh DH_012_KCl_replicate1.bam 10 60 DH_012_KCl_replicate1.filter # Identification of open chromatin regions export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-call-hotspots/resources/usr/bin/:$PATH nohup hotspot2.sh -c chrom_sizes.bed -C center_sites.starch -M T10.bowtie.read_length20.mappable_only.bed -P DH_012_KCl_replicate1.filter.bam DH_012_KCl_replicate1.DH >DH_012_KCl_replicate1.DH.nohup unstarch DH_012_KCl_replicate1.DH/DH_012_KCl_replicate1.filter.hotspots.fdr0.05.starch >DH_012_KCl_replicate1.filter.hotspots.fdr0.05.bed unstarch DH_012_KCl_replicate1.DH/DH_012_KCl_replicate1.filter.peaks.narrowpeaks.starch > DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed #Scanning for TF binding motifs within DHSs # TF binding sites collect and format. Most of the steps are taken from https://slowkow.github.io/CENTIPEDE.tutorial/ # Download PWM files from the CISBP database (http://cisbp.ccbr.utoronto.ca/) (Weirauch et al., 2014). The PWMs can be downloaded from the website by selecting the binding motifs for the species of interest. Herein, we will use the TF binding sites associated with Arabidopsis thaliana. In the Bulk Download page (http://cisbp.ccbr.utoronto.ca/bulk.php), select Arabidopsis thaliana and download the PWMs and TF info (Download or copy the URL). The filename changes according to the download date. Correct the link for your downloading instance. wget http://cisbp.ccbr.utoronto.ca/tmp/Arabidopsis_thaliana_2020_06_24_5:54_pm.zip -O Arabidopsis_thaliana_pwm.zip unzip Arabidopsis_thaliana_pwm.zip # convert PWM files to MEME wget http://meme-suite.org/meme-software/5.1.1/meme-5.1.1.tar.gz tar -xzvf meme-5.1.1.tar.gz cd meme-5.1.1/ ./configure make make install cd .. # Transform PWM files to MEME files cd pwms_all_motifs echo '$WD/meme-5.1.1/scripts/matrix2meme < <(tail -n+2 $1 | cut -f2-) > $1.meme' > script.pwm2meme.sh ls M*.txt | xargs -n1 -P 2 bash ./script.pwm2meme.sh cd $WD # DNA sequence extraction from open chromatin regions. bedtools getfasta -fi T10.fa -bed DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed -fo DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed.fasta # Scanning for TF binding motifs within DHSs echo '$WD/meme-5.1.1/src/fimo --text --parse-genomic-coord $1 DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed.fasta | gzip > $1__DH_012_KCl_replicate1.narrowpeaks_fimo.gz ' > script.meme_fimo.sh ls pwms_all_motifs/*.meme |xargs -n1 bash script.meme_fimo.sh ####### Genomic footprinting ####### # in R install CENTIPEDE. if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rsamtools") install.packages("CENTIPEDE", repos="http://R-Forge.R-project.org") library(Rsamtools) library(CENTIPEDE) # Download http://virtualplant.bio.puc.cl/share/DNAse/DNAse-functions.R , this file contain functions used for run the analysis. source("DNAse-functions.R") . # The FIMO files obtained in the previous step are listed in the "fimos" object with the function "dir." allfimos<-dir("pwms_all_motifs/",pattern="fimo") conditions<-"DH_012_KCl_replicate1" search_protectedSite(conditions, allfimos, FLANK_size=100, log10p=4) # Up to this point in the pipeline, the steps must be repeated for each library analyzed. The following steps will be performed using four DNAse-seq libraries. We will use two biological replicates of KNO3 treatments and two biological replicates of KCl treatments as the control in Arabidopsis roots. As mentioned above, these libraries can be downloaded from PRJNA563066 BioProject (https://www.ncbi.nlm.nih.gov/bioproject/PRJNA563066/) or http://virtualplant.bio.puc.cl/share/DNAse/. # For those who want to continue the proposed pipeline, on the page http://virtualplant.bio.puc.cl/share/DNAse/protected.tgz you can find the rest of the libraries already processed to continue the analysis. # TF/motif assignation. # in Unix: cat *.protected.txt >protected.total # in R protected.total<-read.table("pwms_all_motifs/protected.total") # load the TF_Information.txt file TF_info<-read.table("TF_Information.txt",header=T,sep="\t") # Then, the protected.total and TF_info file are merged and a column with the TF gene identifier is added: protected.TF<-merge(protected.total,TF_info[,c(4,6,9,10)],by.x=7,by.y=1) # select the columns for further use and save the data: protected.bed<-(protected.TF[,c(2:4,1,8,7,9,10,5,6)]) write.table(protected.bed,"protected.TF.sites.txt", sep="\t",row.names=F,col.names=F,quote=F) ## Parse the data. Download the file collapse.pl and TAIR10.1000pb5p.bed from http://virtualplant.bio.puc.cl/share/DNAse/ # in unix: bedtools sort -i protected.TF.sites.txt |perl collapse.pl >protected.TF.sites.conditions.bed # Gene assignemnt for protected sites. bedtools intersect -wo -a protected.TF.sites.conditions.bed -b TAIR10.1000pb5p.bed >protected.TF.sites.conditions.1000_5p.bed #### Network Visualization # Gene selection and network preparation. # in R: protected.sites<-read.table("protected.TF.sites.conditions.1000_5p.bed",sep="\t") # Donwload the response_to_nitrate060min.txt file from http://virtualplant.bio.puc.cl/share/DNAse/. response_to_nitrate060min<-as.matrix(read.table("response_to_nitrate060min.txt")) network<-merge(response_to_nitrate060min , protected.sites,by.x=1,by.y=14) network<-network[,c(1:11)] # The names can be assigned to each column to be used as identifiers in the network. colnames(network)<-c("TARGET","Chr","start","end","motif_id","TF","strand","TF_assign","TF_family","motif_sequence","DH_pattern") #The network object is saved as a .txt file to be used in Cytoscape. write.table(network,"network.txt",sep="\t",col.names=NA,quote=F) # Finally, the list of TFs in the network can be obtained to use it as an attribute with the following command: TFs.att<-cbind(unique(as.matrix(network$TF)),"TF") colnames(TFs.att)<-c("agi","TF") write.table(TFs.att,"TFs.att",sep="\t",row.names=F,quote=F)