# Preparing the genome specific files

# Prepare the work directory 
export WD=$(pwd) 

# Download the genome, the identifier of each chromosome must not have spaces in the name.

wget http://virtualplant.bio.puc.cl/share/DNAse/T10.fa

# The ENCODE pipeline for the analysis of DNAse-seq and tother additional tools must be downloaded and unzipped with the following instructions:

wget https://github.com/ENCODE-DCC/dnase_pipeline/archive/master.zip -O dnase_pipeline.zip

unzip dnase_pipeline.zip
 
wget https://github.com/rthurman/hotspot/archive/master.zip -O hotspot-master.zip
 
unzip hotspot-master.zip
 
#In this and subsequent steps, the programs will be included in the PATH as an environment tool.
 
export PATH=$WD/hotspot-master/hotspot-distr/hotspot-deploy/bin/:$PATH
export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-index-bwa/resources/usr/bin/:$PATH

 
# We need to create the file with the mappable regions in the genome. 
 

# You need to install bowtie. We recommend install it with conda.
 
conda install bowtie
 
# Index the reference genome for bowtie alignment tool:
 
bowtie-build -f T10.fa T10.bowtie
 
# Once this file is obtained, the mappable regions are generated for a given DNAse-seq read length. In this case, we used 20 bp:
 
$WD/hotspot-master/hotspot-distr/hotspot-deploy/bin/enumerateUniquelyMappableSpace.pl 20 T10.bowtie T10.fa | sort-bed - | bedops -m - > T10.bowtie.read_length20.mappable_only.bed
 
# Align the DNAse reads to the genome
 
export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-index-bwa/resources/usr/bin/:$PATH
 
$WD/dnase_pipeline-master/dnanexus/dnase-index-bwa/resources/usr/bin/dnase_index_bwa.sh T10 T10.fa false T10.bowtie.read_length20.mappable_only.bed

# download the DNAse seq file (FASTQ format).
 
wget http://virtualplant.bio.puc.cl/share/DNAse/DH_012_KCl_replicate1.fastq.gz

#You can choose the number of processors to use in the alignment process. Here we use 6 processors.
 
export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-align-bwa-se/resources/usr/bin/:$PATH

$WD/dnase_pipeline-master/dnanexus/dnase-align-bwa-se/resources/usr/bin/dnase_align_bwa_se.sh T10_bwa_index.tgz DH_012_KCl_replicate1.fastq.gz 6 DH_012_KCl_replicate1

# Filter the alignment files 
 
export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-filter-se/resources/usr/bin/:$PATH
 
$WD/dnase_pipeline-master/dnanexus/dnase-filter-se/resources/usr/bin/dnase_filter_se.sh  DH_012_KCl_replicate1.bam 10 60  DH_012_KCl_replicate1.filter
 
# Identification of open chromatin regions

export PATH=$WD/dnase_pipeline-master/dnanexus/dnase-call-hotspots/resources/usr/bin/:$PATH
 
nohup hotspot2.sh -c chrom_sizes.bed -C center_sites.starch -M T10.bowtie.read_length20.mappable_only.bed -P DH_012_KCl_replicate1.filter.bam DH_012_KCl_replicate1.DH  >DH_012_KCl_replicate1.DH.nohup
 
 
unstarch DH_012_KCl_replicate1.DH/DH_012_KCl_replicate1.filter.hotspots.fdr0.05.starch >DH_012_KCl_replicate1.filter.hotspots.fdr0.05.bed
unstarch DH_012_KCl_replicate1.DH/DH_012_KCl_replicate1.filter.peaks.narrowpeaks.starch  > DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed
 
 
#Scanning for TF binding motifs within DHSs 

# TF binding sites collect and format. Most of the steps are taken from https://slowkow.github.io/CENTIPEDE.tutorial/

# Download PWM files from the CISBP database (http://cisbp.ccbr.utoronto.ca/)  (Weirauch et al., 2014). The PWMs can be downloaded from the website by selecting the binding motifs for the species of interest. Herein, we will use the TF binding sites associated with Arabidopsis thaliana. In the Bulk Download page (http://cisbp.ccbr.utoronto.ca/bulk.php), select Arabidopsis thaliana and download the PWMs and TF info (Download or copy the URL). The filename changes according to the download date. Correct the link for your downloading instance.
 
wget http://cisbp.ccbr.utoronto.ca/tmp/Arabidopsis_thaliana_2020_06_24_5:54_pm.zip -O Arabidopsis_thaliana_pwm.zip
 
unzip Arabidopsis_thaliana_pwm.zip

# convert PWM files to MEME

wget http://meme-suite.org/meme-software/5.1.1/meme-5.1.1.tar.gz
tar -xzvf meme-5.1.1.tar.gz
cd meme-5.1.1/
./configure
make
make install
cd ..
 
# Transform PWM files to MEME files

cd pwms_all_motifs
 
echo '$WD/meme-5.1.1/scripts/matrix2meme < <(tail -n+2 $1 | cut -f2-) > $1.meme' > script.pwm2meme.sh
 
ls M*.txt | xargs -n1 -P 2 bash ./script.pwm2meme.sh

cd $WD

# DNA sequence extraction from open chromatin regions. 

bedtools getfasta -fi T10.fa -bed DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed -fo  DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed.fasta

# Scanning for TF binding motifs within DHSs
 
echo '$WD/meme-5.1.1/src/fimo --text --parse-genomic-coord $1 DH_012_KCl_replicate1.filter.peaks.narrowpeaks.bed.fasta | gzip > $1__DH_012_KCl_replicate1.narrowpeaks_fimo.gz ' > script.meme_fimo.sh 

ls pwms_all_motifs/*.meme |xargs -n1 bash script.meme_fimo.sh
 

####### Genomic footprinting  #######

# in R install CENTIPEDE.


if (!requireNamespace("BiocManager", quietly = TRUE))  install.packages("BiocManager")
BiocManager::install("Rsamtools")
install.packages("CENTIPEDE", repos="http://R-Forge.R-project.org")
library(Rsamtools)
library(CENTIPEDE)
            
# Download http://virtualplant.bio.puc.cl/share/DNAse/DNAse-functions.R , this file contain functions used for run the analysis.
 
 
source("DNAse-functions.R")    	.          
 
# The FIMO files obtained in the previous step are listed in the "fimos" object with the function "dir."
 
allfimos<-dir("pwms_all_motifs/",pattern="fimo")
 
 
conditions<-"DH_012_KCl_replicate1"
 
search_protectedSite(conditions, allfimos, FLANK_size=100, log10p=4)
 

# Up to this point in the pipeline, the steps must be repeated for each library analyzed. The following steps will be performed using four DNAse-seq libraries. We will use two biological replicates of KNO3 treatments and two biological replicates of KCl treatments as the control in Arabidopsis roots. As mentioned above, these libraries can be downloaded from PRJNA563066 BioProject (https://www.ncbi.nlm.nih.gov/bioproject/PRJNA563066/) or http://virtualplant.bio.puc.cl/share/DNAse/. 
# For those who want to continue the proposed pipeline, on the page http://virtualplant.bio.puc.cl/share/DNAse/protected.tgz you can find the rest of the libraries already processed to continue the analysis.

# TF/motif assignation.

# in Unix:
cat *.protected.txt >protected.total
 
# in R
protected.total<-read.table("pwms_all_motifs/protected.total")

# load the TF_Information.txt file 

TF_info<-read.table("TF_Information.txt",header=T,sep="\t")
 
# Then, the protected.total and TF_info file are merged and a column with the TF gene identifier is added:
 
protected.TF<-merge(protected.total,TF_info[,c(4,6,9,10)],by.x=7,by.y=1)
 
# select the columns for further use and save the data:

protected.bed<-(protected.TF[,c(2:4,1,8,7,9,10,5,6)])
 
 write.table(protected.bed,"protected.TF.sites.txt", sep="\t",row.names=F,col.names=F,quote=F)
 
## Parse the data. Download the file collapse.pl and  TAIR10.1000pb5p.bed from http://virtualplant.bio.puc.cl/share/DNAse/
# in unix:

bedtools sort -i protected.TF.sites.txt |perl collapse.pl >protected.TF.sites.conditions.bed
 
# Gene assignemnt for protected sites.

bedtools intersect -wo -a protected.TF.sites.conditions.bed -b TAIR10.1000pb5p.bed >protected.TF.sites.conditions.1000_5p.bed
 
 
#### Network Visualization
# Gene selection and network preparation.
# in R:

protected.sites<-read.table("protected.TF.sites.conditions.1000_5p.bed",sep="\t")

# Donwload the response_to_nitrate060min.txt file from http://virtualplant.bio.puc.cl/share/DNAse/.

response_to_nitrate060min<-as.matrix(read.table("response_to_nitrate060min.txt"))
 
network<-merge(response_to_nitrate060min , protected.sites,by.x=1,by.y=14)

network<-network[,c(1:11)]
 
# The names can be assigned to each column to be used as identifiers in the network.
 
colnames(network)<-c("TARGET","Chr","start","end","motif_id","TF","strand","TF_assign","TF_family","motif_sequence","DH_pattern")

#The network object is saved as a .txt file to be used in Cytoscape.
 
write.table(network,"network.txt",sep="\t",col.names=NA,quote=F)

# Finally, the list of TFs in the network can be obtained to use it as an attribute with the following command:

TFs.att<-cbind(unique(as.matrix(network$TF)),"TF")

colnames(TFs.att)<-c("agi","TF")

write.table(TFs.att,"TFs.att",sep="\t",row.names=F,quote=F)