# This file contain all instructions described in Moyano et al. (2014) XXX to create gene networks.
# You need to download the data and uncompress in the working directory before you start. For details, see the 
# manuscript.
#
# Please note that data, methods and parameters used at each step are meant to be for demonstration purposes 
# and by no means should be taken as the only way or as a general rule to do data analyses in all cases. 
# Changes in methods and parameters can have a significant impact on your final results and should be carefully 
# evaluated and decided upon depending on your scientific aims as well as experimental design. 
#
# Last version September 9, 2014.

# To select the user working directory use the following command. Replace the text inside the quotes with the correct location for the folder with the example data sets. Please note that in windows operative systems you need to use the backslash symbol and start with your hardrive (e.g. “C:\”):
setwd("/Users/example")


# First select the bioconductor repository, download and install the affy package. Documentation on how to install packages can be obtained from the bioconductor website (http://www.bioconductor.org). You need to do this only once and can skip to line 4 if you have the affy package already installed:
source("http://bioconductor.org/biocLite.R")
biocLite("affy")

# Affymetrix *.CEL files are read and normalized using the following commands. 
# To load the affy package, read the *.CEL files and normalize the data using the RMA method. Documentation for this package can be obtained from the Bioconductor website (http://www.bioconductor.org/packages/release/bioc/html/affy.html)
library(affy)

# To read all CEL files in the working directory:
Data<-ReadAffy()

# To normalize the data (for details see http://www.bioconductor.org/packages/release/bioc/html/affy.html):
eset<-rma(Data)
norm.data<-exprs(eset)

# The norm.data R object contains the normalized expression for every probeset in the ATH1 microarrays used in this example. In order to convert the probeset IDs to Arabidopsis gene identifiers, the file ftp://ftp.arabidopsis.org/home/tair/Microarrays/Affymetrix/affy_ATH1_array_elements-2010-12-20.txt download from the TAIR database and place in the folder with the microarray data. In order to avoid ambiguous probeset associations (i.e. probesets that have multiple matches to genes), we only used probes that match only one gene in the Arabidopsis genome.
affy_names<-read.delim("affy_ATH1_array_elements-2010-12-20.txt",header=T)

# Select the columns that contain the probeset ID and corresponding AGI number. Please note that the positions used to index the matrix depend on the input format of the array elements file. You can change these numbers to index the corresponding columns if you are using a different format:
probe_agi<-as.matrix(affy_names[,c(1,5)])

# To associate the probeset with the corresponding AGI locus:
normalized.names<-merge(probe_agi,norm.data,by.x=1,by.y=0)[,-1]

# To remove probesets that do not match the Arabidopsis genome:
normalized.arabidopsis <-normalized.names[grep("AT",normalized.names[,1]),]

# To remove ambiguous probes:
normalized.arabidopsis.unambiguous<-normalized.arabidopsis[grep(pattern=";",normalized.arabidopsis[,1], invert=T),]

# In some cases, multiple probes match the same gene, due to updates in the annotation of the genome. To remove duplicated genes in the matrix:
normalized.agi.final<-normalized.arabidopsis.unambiguous[!duplicated(normalized.arabidopsis.unambiguous[,1]),]

# To assign the AGI number as row name:
rownames(normalized.agi.final)<-normalized.agi.final[,1]
normalized.agi.final<-normalized.agi.final[,-1]

#The resulting gene expression dataset contains unique row identifies (i.e. AGI locus), and different expression values obtained from different experiments on each column (Table 3).
#Table 3 
#Gene Expression Matrix. View of five rows and three columns of normalized data matrix file, showing header and normalized gene expression values for regulated genes.
#  GSM1054974_Col_0_KCl_2H_R1.CEL.gz GSM1054975_Col_0_KCl_2H_R2.CEL.gz GSM1054976_Col_0_KCl_2H_R3.CEL.gz
#ATMG00640      4.188101    	4.109671    	4.130230   
#ATMG00650    	4.417411    	4.542037    	4.536882    	
#ATMG00660    	5.658079    	5.717082    	5.296106  
#ATMG00670    	4.759369    	4.849271    	4.965505    
#ATMG00680    	4.434071    	4.395689    	4.468155  

# To export this data matrix from R to a tab-delimited file use the following command. The file will be written to the folder that you set up as your working directory in R using the setwd() command in line 1 above:
write.table (normalized.agi.final,"normalized.agi.final.txt", sep="\t",col.names=NA,quote=F)

# Read a gene list and gene expression matrix files. The id.genes.txt file is a text file with one locus identifier per line. You can generate any list of interest in a text file.
id.genes<-sort(unique(as.matrix(read.table("id.genes.txt"))))

# To read back into R the gene expression data matrix table we created in command line 16 use the following instructions:
normalized.agi.final <-read.table("normalized.agi.final.txt", header=T, row.names=1)

# Selects rows using the identifiers in the gene list.
data.interest<-na.exclude(normalized.agi.final [id.genes,])

# data.interest will contain gene expression values for the genes of
# interest we uploaded from the id.genes.txt  file. 
# The function na.exclude removes  rows corresponding to genes that were not found in the dataset.

# Calculating correlation of gene expression.
# In the next section, we will generate a matrix containing a list of all possible pairs of genes and their correlation coefficient, p-value and adjusted p-value. 
# Correlation networks are informative to associate genes that are involved in the same biological pathway or that are part of protein complexes. 
# The list of pairs generated will also be of use for querying interaction data (see below). The following function, cor.pairwise, uses two arguments. 
# data defines the gene expression matrix file and outputfile defines the name of the output file containing all the correlations between pairs of genes.

cor.pairwise<-function(data,outputfile){
  i=1
  out<-NULL
  while(i<dim(data)[1]){
    k=i+1
    while(k<=dim(data)[1]){
      corre1<-cor.test(t(data[i,]),t(data[k,]), method="pearson")
      corre<-cbind(corre1$estimate,corre1$p.value)
      bl<- t(rownames(data)[c(i,k)])
      corre<-cbind(bl,corre)
      write.table(corre,outputfile,sep="\t", quote=FALSE, append=TRUE, col.names=FALSE, row.names=FALSE)
      out<-rbind(out,corre)
      k=k+1
    }
    print (rownames(data)[i])
    i=i+1
  }
  pvals<-out[,4]
  pval.adjust=p.adjust(pvals,method="fdr")
  out<-cbind(out,pval.adjust)
  colnames(out)<-c("id1","id2","cor","pval","adj.pval")
  rownames(out)<-make.names(rownames(out),unique=TRUE)
  return(out)
}

# We use the function as follows:
output.cor<-cor.pairwise(data=(data.interest),outputfile="outputfile.txt")

# The object output.cor will include the gene pair names, correlation, p-values and adjusted p-values.
head(output.cor)
#"id1" "id2" "cor" "pval" "adj.pval"
#"cor"   "AT1G01190" "AT1G02310" "0.00984765500952154" "0.905766518240984" "0.923136140717168"
#"cor.1" "AT1G01190" "AT1G02340" "-0.314457776961519" "0.000104921377641099" "0.000238208398080831"
#"cor.2" "AT1G01190" "AT1G03080" "-0.0591117526465646" "0.476962883461703" "0.5370372510918"
#"cor.3" "AT1G01190" "AT1G04770" "0.460972202377037" "4.21544221751446e-09" "1.75950337755804e-08"
#"cor.4" "AT1G01190" "AT1G05300" "-0.450849475164985" "1.00189008902704e-08" "3.97715603233857e-08"
#"cor.5" "AT1G01190" "AT1G05340" "0.266714601102548" "0.00109271801107358" "0.00211189306519128"

# You can filter these correlations by defining a threshold for the adjusted p-values or correlation coefficients.
# In this example, we set a p ≤ 0.01 and cor ≥ 0.75.
output.cor_pval<-output.cor[output.cor[,5]<=0.01,]
output.cor_pval_075<-output.cor_pval[abs(as.numeric(output.cor_pval[,3]))>=0.75,]
output.filtered<-output.cor_pval_075 

# Adding publicly available information to the network 
# In order to intersect interaction data from other sources (e.g. protein:protein interactions) and the correlation data contained in
# output.filtered, we will use the merge function as follows:
# First, we load the interaction table from the Interaction Database Table. 
# To facilitate this example we have prepared a text file with all the data sources we will be using. You can download this file from http://virtualplant.bio.puc.cl/share/pfg/interaction.data.txt
interaction.data<-as.matrix(read.table("interaction.data.txt",sep="\t",header=T, fill=T))

# Then we combine the table with the recently created coexpression data matrix:
output.filtered.interaction<-unique(rbind(merge(interaction.data,output.filtered,by.x=c(1,2),by.y=c(1,2)),merge(interaction.data,output.filtered,by.x=c(1,2),by.y=c(2,1))))

# In the specific case of the regulatory interaction database AGRIS (Table 1), there is only information about TF binding sites in the promoter of Arabidopsis genes (BindingSite.tbl).
# This data can be converted into a matrix containing each gene and all the described members of the TF gene families predicted to bind their promoters. 
# This matrix can then be used as above to be intersected with the correlation matrix. Data from TF families and their members can be obtained from AtTFDB (families_data.tbl) in AGRIS.  
bstable<-as.matrix(read.table("BindingSite.tbl",sep="\t"))
fam.tf<-as.matrix(read.table("families_data.tbl",sep="\t"))

# Once files are loaded, we can create a new object, agris.bs, selecting only the columns of interest for our analysis. 
# In this case we chose from bstable the columns describing the binding site, promoter and TF  family.
agris.bs<-cbind(toupper(substr(bstable[,7] ,start=1,stop=9)), bstable[,c(2,10,11)])

# In this example, for promoters that are bound more than once by the same family of TFs, we use the command unique to merge the repeated binding sites present, leaving one for each type.
agris.prom.fam<-unique((agris.bs[,c(1,3)]))

# To create a table with only the minimal information from the transcription 
# factor with the respective family:
fam.tf.gen<-cbind(fam.tf[,1],toupper(fam.tf[,2]))

# To merge the binding site present in the promoters with the family of transcription factor able to bind to the promoter sequence:
pairs.fam.bs<-as.matrix(merge(fam.tf.gen,agris.prom.fam,by.x=1,by.y=2))

# Then to parse the data for further use:
agris.pairs<-as.matrix(cbind(pairs.fam.bs[,c(2,3)],"TF_TARGET","AGRIS", pairs.fam.bs[,1]))
colnames(agris.pairs)<-c("TF","TARGET","interaction_type","source","family")
agris.pairs<-unique(agris.pairs)

# To verify the final Transcripition Factor – Target pairs:
head(agris.pairs)

# TF	TARGET	interaction_type	source	family
# [1,]	AT3G24650	AT4G21390	TF_TARGET	AGRIS	ABI3VP1
# [2,]	AT3G24650	AT4G09070	TF_TARGET	AGRIS	ABI3VP1
# [3,]	AT3G24650	AT1G53130	TF_TARGET	AGRIS	ABI3VP1
# [4,]	AT3G24650	AT1G32200	TF_TARGET	AGRIS	ABI3VP1
# [5,]	AT3G24650	AT2G27040	TF_TARGET	AGRIS	ABI3VP1
# [6,]	AT3G24650	AT1G10960	TF_TARGET	AGRIS	ABI3VP1

# Once created the TF-TARGET table (agris.pairs) we intersect this table with the correlation data (output.filtered).
output.filtered.agris<- unique(rbind(merge(agris.pairs,output.filtered,by.x=c(1,2),by.y=c(1,2)),merge(agris.pairs,output.filtered,by.x=c(1,2),by.y=c(2,1))))

# Create the final table adding header and all the information available:
write.table( t(c("id1","id2","type","source","cor","p.val","p.val.adj","info1")),"out.info.txt",sep="\t",row.names=F,quote=F,col.names=F)
write.table(output.filtered.agris[,c(1:4,6:8,5)], "out.info.txt",sep="\t",append=T, row.names=F, col.names=F,quote=F)
write.table(output.filtered.interaction,"out.info.txt", sep="\t",append=T, row.names=F, col.names=F,quote=F)

# Add to the thable all the correlation data passing filters.
cor.pairs<-cbind(output.filtered[,1:2],"correlation_pair", "own_analysis",output.filtered[,3:5])
write.table(cor.pairs ,"out.info.txt",append=T, sep="\t", quote=F, row.names=F,col.names=F)

# The final table containing the correlations and interaction information from the analysis is stored in the file out.info.txt. Interaction information for each gene-pair, including the correlation, p value, literature interaction type and the source.
#
#id1     id2     cor     p.val   p.val.adj       type    source  info1   info2
#AT4G22070       AT4G23700       TF_TARGET       AGRIS   0.771985968692119       0       WRKY
#AT4G22070       AT5G64120       TF_TARGET       AGRIS   0.751826459624124       0       WRKY
#AT5G10030       AT5G10210       TF_TARGET       AGRIS   0.75558173996937        0       bZIP