#===================================================================================#
#                                                                                   #
#                                         PCA                                       #
#                                                                                   #
#===================================================================================#

#==========================================#
#                                          #
#                  DADA2                   #
#                                          #
#==========================================#
{
  setwd('/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/')
  
  library(ggplot2)
  library(reshape2)
  library(plyr)
  library(data.table)
  library(dplyr)
  library(viridis)
  library(BBmisc)
  require(caret)
  library(ggfortify)
  library(ggpubr)
  
  
  #data = read.csv('nodA_seqid_pcamatrix.csv', sep = ',')
  #head(data)
  data = read.csv('allgenes_dada2.csv', sep = ';')
  head(data)
  names(data)[names(data)=="X..."] <- "sampleID"
  head(data)
  
  
  # Normalize data by row and sample
  data_norm <- data[,-1]
  noda <- data_norm[,1:18]
  nodd <- data_norm[,19:40]
  reca <- data_norm[,41:53]
  rpob <- data_norm[,54:68]
  
  for (i in 1:nrow(data_norm)) {
    normalized1 = (noda[i,]/(sum(noda[i,])))
    noda[i,] <- normalized1
    
    normalized2 = (nodd[i,]/(sum(nodd[i,])))
    nodd[i,] <- normalized2
    
    normalized3 = (reca[i,]/(sum(reca[i,])))
    reca[i,] <- normalized3
    
    normalized4 = (rpob[i,]/(sum(rpob[i,])))
    rpob[i,] <- normalized4
  }  
  
  data_norm <- cbind(rpob, reca, noda, nodd)
  
  ###### REMOVE ROWS
  # Remove Aarhus samples 
  #data <- data[-c(25:32),]
  
  
  # Create plot column for colouring
  p2.data <- data.frame(do.call(rbind, strsplit(as.character(data$sampleID), "")))
  p3.data <-  paste(p2.data$X1) 
  plot.names <- p3.data
  
  data.pca <- data_norm
  # Replace 0s with 0.00001
  data.pca[data.pca == 0] <- 0.0001
  
  # Log transform data for pca
  # Change end row value to be the ncolumns
  x = ncol(data.pca)
  data.pca[ ,1:x] <- log10(data.pca[ ,1:x])
  
  # Remove 0 variance using caret for when Aarhus plots are removed previously
  NZV <- nearZeroVar(data.pca, saveMetrics = TRUE)
  nzv.col <- nearZeroVar(data.pca)
  if(length(nzv.col) > 0) data.pca <- data.pca[, -nzv.col]
  
  
  d.pca <- prcomp(data.pca,
                  center = TRUE,
                  scale. = TRUE)
  
  # Create percent variance label for axis 
  percentVar <- d.pca$sdev^2 / sum( d.pca$sdev^2 )
  percentVar <- round(100 * percentVar)
  
  plotcol <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
  
  (pcaplot <- ggplot(d.pca, aes(PC1, PC2, color = plot.names , fill = plot.names)) +
      coord_fixed() +
      geom_point(size=3) +
      #must manually apply the percentage variance in this case
      xlab(paste0("PC1: ",percentVar[1],"% variance")) +
      ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
      #geom_polygon(data = hulls, alpha = 0.5) +
      stat_chull(geom = "polygon", alpha = 0.2) +
      #stat_ellipse(geom = "polygon", alpha = 0.2, type = "t", level = 0.95) + #if you want to look at ellipses of t-distibuted data 95%. 
      #stat_conf_ellipse(geom = "polygon", alpha = 0.2, level = 0.95, bary = TRUE) + #if you want to look at ellipses of mean 95%. bary = false is the same as stat_ellipse(type = "euclid", level - 0.95).
      stat_mean(size = 5) +
      scale_color_manual(values = plotcol) +
      scale_fill_manual(values = plotcol) +
      theme_bw() +
      theme(aspect.ratio=1,
            legend.title = element_blank(),
            axis.text.x = element_text(size = 20),
            axis.text.y = element_text(size = 20),
            axis.title=element_text(size = 22),
            legend.text = element_text(size = 20)))
  
  
  ggsave('norm_dada2_all_PC12.pdf', plot = pcaplot, width = 25, height = 20, unit = 'cm')
  
}

#==========================================#
#                                          #
#           Individual genes               #
#                                          #
#==========================================#

{
  setwd('/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/')
  
  library(ggplot2)
  library(reshape2)
  library(plyr)
  library(data.table)
  library(dplyr)
  library(viridis)
  library(BBmisc)
  require(caret)
  
  #data = read.csv('nodA_seqid_pcamatrix.csv', sep = ',')
  #head(data)
  data = read.csv('allgenes_dada2.csv', sep = ';')
  head(data)
  names(data)[names(data)=="X..."] <- "sampleID"
  head(data)
  
  
  # Create plot column for colouring
  p2.data <- data.frame(do.call(rbind, strsplit(as.character(data$sampleID), "")))
  p3.data <-  paste(p2.data$X1) 
  plot.names <- p3.data
  
  
  # Remove name column
  data.pca <- data[,-1]
  
  # Subset genes 
  nodA <- data.pca[,1:18]
  nodD <- data.pca[,19:40]
  recA <- data.pca[,41:53]
  rpoB <- data.pca[,54:68]
  
  for (i in 1:nrow(data.pca)) {
    normalized1 = (nodA[i,]/(sum(nodA[i,])))
    nodA[i,] <- normalized1
    
    normalized2 = (nodD[i,]/(sum(nodD[i,])))
    nodD[i,] <- normalized2
    
    normalized3 = (recA[i,]/(sum(recA[i,])))
    recA[i,] <- normalized3
    
    normalized4 = (rpoB[i,]/(sum(rpoB[i,])))
    rpoB[i,] <- normalized4
  }  
  
  
  # Replace 0s with 0.00001
  nodA[nodA == 0] <- 0.0001
  nodD[nodD == 0] <- 0.0001
  recA[recA == 0] <- 0.0001
  rpoB[rpoB == 0] <- 0.0001
  
  # Log transform data for pca
  # Change end row value to be the ncolumns
  x = ncol(nodA)
  nodA[ ,1:x] <- log10(nodA[ ,1:x])
  y = ncol(nodD)
  nodD[ ,1:y] <- log10(nodD[ ,1:y])
  z = ncol(recA)
  recA[ ,1:z] <- log10(recA[ ,1:z])
  q = ncol(rpoB)
  rpoB[ ,1:q] <- log10(rpoB[ ,1:q])
  
  # Remove 0 variance using caret for when Aarhus plots are removed previously
  nzv.col <- nearZeroVar(nodA)
  if(length(nzv.col) > 0) nodA <- nodA[, -nzv.col]
  nzv.col <- nearZeroVar(nodD)
  if(length(nzv.col) > 0) nodD <- nodD[, -nzv.col]
  nzv.col <- nearZeroVar(recA)
  if(length(nzv.col) > 0) recA <- recA[, -nzv.col]
  nzv.col <- nearZeroVar(rpoB)
  if(length(nzv.col) > 0) rpoB <- rpoB[, -nzv.col]
  
  
  # PCA analysis 
  nodA.pca <- prcomp(nodA,
                     center = TRUE,
                     scale. = TRUE)
  nodD.pca <- prcomp(nodD,
                     center = TRUE,
                     scale. = TRUE)
  recA.pca <- prcomp(recA,
                     center = TRUE,
                     scale. = TRUE)
  rpoB.pca <- prcomp(rpoB,
                     center = TRUE,
                     scale. = TRUE)
  
  # Create percent variance label for axis 
  nodApercentVar <- nodA.pca$sdev^2 / sum( nodA.pca$sdev^2 )
  nodApercentVar <- round(100 * nodApercentVar)
  nodDpercentVar <- nodD.pca$sdev^2 / sum( nodD.pca$sdev^2 )
  nodDpercentVar <- round(100 * nodDpercentVar)
  recApercentVar <- recA.pca$sdev^2 / sum( recA.pca$sdev^2 )
  recApercentVar <- round(100 * recApercentVar)
  rpoBpercentVar <- rpoB.pca$sdev^2 / sum( rpoB.pca$sdev^2 )
  rpoBpercentVar <- round(100 * rpoBpercentVar)
  
  plotcol <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
  
  (nodApcaplot <- ggplot(nodA.pca, aes(PC1, PC2, color = plot.names , fill = plot.names)) +
      coord_fixed() +
      geom_point(size=3) +
      #must manually apply the percentage variance in this case
      xlab(paste0("PC1: ",nodApercentVar[1],"% variance")) +
      ylab(paste0("PC2: ",nodApercentVar[2],"% variance")) + 
      #geom_polygon(data = hulls, alpha = 0.5) +
      stat_chull(geom = "polygon", alpha = 0.2) +
      #stat_ellipse(geom = "polygon", alpha = 0.2, type = "t", level = 0.95) + #if you want to look at ellipses of t-distibuted data 95%. 
      #stat_conf_ellipse(geom = "polygon", alpha = 0.2, level = 0.95, bary = TRUE) + #if you want to look at ellipses of mean 95%. bary = false is the same as stat_ellipse(type = "euclid", level - 0.95).
      stat_mean(size = 5) +
      scale_color_manual(values = plotcol) +
      scale_fill_manual(values = plotcol) +
      theme_bw() +
      theme(aspect.ratio=1,
            legend.title = element_blank(),
            axis.text.x = element_text(size = 20),
            axis.text.y = element_text(size = 20),
            axis.title=element_text(size = 22),
            legend.text = element_text(size = 20)))
  
  (nodDpcaplot <- ggplot(nodD.pca, aes(PC1, PC2, color = plot.names , fill = plot.names)) +
      coord_fixed() +
      geom_point(size=3) +
      #must manually apply the percentage variance in this case
      xlab(paste0("PC1: ",nodDpercentVar[1],"% variance")) +
      ylab(paste0("PC2: ",nodDpercentVar[2],"% variance")) + 
      #geom_polygon(data = hulls, alpha = 0.5) +
      stat_chull(geom = "polygon", alpha = 0.2) +
      #stat_ellipse(geom = "polygon", alpha = 0.2, type = "t", level = 0.95) + #if you want to look at ellipses of t-distibuted data 95%. 
      #stat_conf_ellipse(geom = "polygon", alpha = 0.2, level = 0.95, bary = TRUE) + #if you want to look at ellipses of mean 95%. bary = false is the same as stat_ellipse(type = "euclid", level - 0.95).
      stat_mean(size = 5) +
      scale_color_manual(values = plotcol) +
      scale_fill_manual(values = plotcol) +
      theme_bw() +
      theme(aspect.ratio=1,
            legend.title = element_blank(),
            axis.text.x = element_text(size = 20),
            axis.text.y = element_text(size = 20),
            axis.title=element_text(size = 22),
            legend.text = element_text(size = 20)))
  
  (recApcaplot <- ggplot(recA.pca, aes(PC1, PC2, color = plot.names , fill = plot.names)) +
      coord_fixed() +
      geom_point(size=3) +
      #must manually apply the percentage variance in this case
      xlab(paste0("PC1: ",recApercentVar[1],"% variance")) +
      ylab(paste0("PC2: ",recApercentVar[2],"% variance")) + 
      #geom_polygon(data = hulls, alpha = 0.5) +
      stat_chull(geom = "polygon", alpha = 0.2) +
      #stat_ellipse(geom = "polygon", alpha = 0.2, type = "t", level = 0.95) + #if you want to look at ellipses of t-distibuted data 95%. 
      #stat_conf_ellipse(geom = "polygon", alpha = 0.2, level = 0.95, bary = TRUE) + #if you want to look at ellipses of mean 95%. bary = false is the same as stat_ellipse(type = "euclid", level - 0.95).
      stat_mean(size = 5) +
      scale_color_manual(values = plotcol) +
      scale_fill_manual(values = plotcol) +
      theme_bw() +
      theme(aspect.ratio=1,
            legend.title = element_blank(),
            axis.text.x = element_text(size = 20),
            axis.text.y = element_text(size = 20),
            axis.title=element_text(size = 22),
            legend.text = element_text(size = 20)))
  
  (rpoBpcaplot <- ggplot(rpoB.pca, aes(PC1, PC2, color = plot.names , fill = plot.names)) +
      coord_fixed() +
      geom_point(size=3) +
      #must manually apply the percentage variance in this case
      xlab(paste0("PC1: ",rpoBpercentVar[1],"% variance")) +
      ylab(paste0("PC2: ",rpoBpercentVar[2],"% variance")) + 
      #geom_polygon(data = hulls, alpha = 0.5) +
      stat_chull(geom = "polygon", alpha = 0.2) +
      #stat_ellipse(geom = "polygon", alpha = 0.2, type = "t", level = 0.95) + #if you want to look at ellipses of t-distibuted data 95%. 
      #stat_conf_ellipse(geom = "polygon", alpha = 0.2, level = 0.95, bary = TRUE) + #if you want to look at ellipses of mean 95%. bary = false is the same as stat_ellipse(type = "euclid", level - 0.95).
      stat_mean(size = 5) +
      scale_color_manual(values = plotcol) +
      scale_fill_manual(values = plotcol) +
      theme_bw() +
      theme(aspect.ratio=1,
            legend.title = element_blank(),
            axis.text.x = element_text(size = 20),
            axis.text.y = element_text(size = 20),
            axis.title=element_text(size = 22),
            legend.text = element_text(size = 20)))
  ggsave('dada2_nodA_PC12.pdf', plot = nodApcaplot, width = 25, height = 20, unit = 'cm')
  ggsave('dada2_nodD_PC12.pdf', plot = nodDpcaplot, width = 25, height = 20, unit = 'cm')
  ggsave('dada2_recA_PC12.pdf', plot = recApcaplot, width = 25, height = 20, unit = 'cm')
  ggsave('dada2_rpoB_PC12.pdf', plot = rpoBpcaplot, width = 25, height = 20, unit = 'cm')
  
}  

#===================================================================================#
#                                                                                   #
#                                     RATIOS                                        #
#                                                                                   #
#===================================================================================#


library(ggplot2)
library("lattice")
library(dplyr)
library(reshape2)
#==========================================#
#                                          #
#                  rpoB                    #
#                                          #
#==========================================#

setwd("/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/")
d <- read.csv("rpoB.csv", sep = ';', dec = ",")
names(d)[names(d)=="X..."] <- "Sequence"
head(d)

###   EXPECTED V OBSERVED RATIO   ###
{# Correlation platinum rpoB = 0.9870113
  cor(d$expected[1:23], d$ratio[1:23])
  # Correlation phusion rpoB = 0.9988713
  cor(d$expected[24:37], d$ratio[24:37])
  
  
  # Plot
  plot <- ggplot(d, aes(x = expected, y = ratio, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Observed proportion") +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('rpoB_ratio_exp_v_obs.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
}

###   CHIMERA FREQUENCY   ###
{
  plot <- ggplot(d, aes(x = expected, y = chisprop, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Chimeras/total counts") +
    ylim(0, 0.25) +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('rpoB_ratio_chimeras.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
  
  
}


#==========================================#
#                                          #
#                 recA                     #
#                                          #
#==========================================#

setwd("/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/")
d <- read.csv("recA.csv", sep = ';', dec = ",")
names(d)[names(d)=="X..."] <- "Sequence"
head(d)

###   EXPECTED V OBSERVED RATIO   ###
{# Correlation platinum recA = 0.9907104
  cor(d$expected[1:23], d$ratio[1:23])
  # Correlation phusion recA = 0.9522759
  cor(d$expected[24:37], d$ratio[24:37])
  
  
  # Plot
  plot <- ggplot(d, aes(x = expected, y = ratio, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Observed proportion") +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('recA_ratio_exp_v_obs.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
}

###   CHIMERA FREQUENCY   ###
{
  plot <- ggplot(d, aes(x = expected, y = chisprop, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Chimeras/total counts") +
    ylim(0, 0.25) +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('recA_ratio_chimeras.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
  
  
}

#==========================================#
#                                          #
#                 nodA                     #
#                                          #
#==========================================#

setwd("/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/")
d <- read.csv("nodA.csv", sep = ';', dec = ",")
names(d)[names(d)=="X..."] <- "Sequence"
head(d)

###   EXPECTED V OBSERVED RATIO   ###
{# Correlation platinum nodA = 0.9970802
  cor(d$expected[1:23], d$ratio[1:23])
  # Correlation phusion nodA = 0.9952959
  cor(d$expected[24:37], d$ratio[24:37])
  
  
  # Plot
  plot <- ggplot(d, aes(x = expected, y = ratio, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Observed proportion") +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('nodA_ratio_exp_v_obs.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
}

###   CHIMERA FREQUENCY   ###
{
  plot <- ggplot(d, aes(x = expected, y = chisprop, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Chimeras/total counts") +
    ylim(0, 0.25) +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('nodA_ratio_chimeras.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
  
  
}


#==========================================#
#                                          #
#                 nodD                     #
#                                          #
#==========================================#

setwd("/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/")
d <- read.csv("nodD.csv", sep = ';', dec = ",")
names(d)[names(d)=="X..."] <- "Sequence"
head(d)

###   EXPECTED V OBSERVED RATIO   ###
{# Correlation platinum nodD = 0.998199
  cor(d$expected[1:23], d$ratio[1:23])
  # Correlation phusion noD = 0.9983725
  cor(d$expected[24:37], d$ratio[24:37])
  
  
  # Plot
  plot <- ggplot(d, aes(x = expected, y = ratio, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Observed proportion") +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('nodD_ratio_exp_v_obs.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
}

###   CHIMERA FREQUENCY   ###
{
  plot <- ggplot(d, aes(x = expected, y = chisprop, fill = factor(polymerase))) + 
    coord_fixed() +
    theme_bw() + 
    geom_point(size = 4, aes(color = factor(polymerase))) +
    scale_colour_brewer(palette="Set1") +
    labs(x = "Expected proportion", y = "Chimeras/total counts") +
    ylim(0, 0.25) +
    theme(aspect.ratio=1,
          legend.title = element_blank(),
          axis.text.x = element_text(size = 20),
          axis.text.y = element_text(size = 20),
          axis.title=element_text(size = 22),
          legend.text = element_text(size = 20))
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('nodD_ratio_chimeras.pdf', plot = plot, width = 25, height = 20, unit = 'cm')
  
  
}

#===================================================================================#
#                                                                                   #
#                                    HEAT MAPS                                      #
#                                                                                   #
#===================================================================================#


# http://www.roymfrancis.com/a-guide-to-elegant-tiled-heatmaps-in-r/

#==========================================#
#                                          #
#                  dada2                  #
#                                          #
#==========================================#
{
  setwd('/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/')
  
  library(ggplot2)
  library(reshape2)
  library(plyr)
  library(data.table)
  library(dplyr)
  library(viridis)
  
  #data = read.csv('nodA_seqid_pcamatrix.csv', sep = ',')
  #head(data)
  data = read.csv('allgenes_dada2_0.7thresh.csv', sep = ';')
  head(data)
  names(data)[names(data)=="X..."] <- "sampleID"
  head(data)
  # Transform sample ID to numeric type, and sort them numerically
  #data$sample <- as.numeric(data$sample)
  #data <- data[order(data$sample),]
  
  data_norm <- data[,-1]
  #normalized = (data_norm[1,]-min(data_norm[1,]))/(max(data_norm[1,])-min(data_norm[1,]))
  #normalized2 = (data_norm[1,]/(sum(data_norm[1,])))
  
  
  # Normalize data by row and sample
  noda <- data_norm[,1:18]
  nodd <- data_norm[,19:40]
  reca <- data_norm[,41:53]
  rpob <- data_norm[,54:68]
  
  for (i in 1:nrow(data_norm)) {
    normalized1 = (noda[i,]/(sum(noda[i,])))
    noda[i,] <- normalized1
    
    normalized2 = (nodd[i,]/(sum(nodd[i,])))
    nodd[i,] <- normalized2
    
    normalized3 = (reca[i,]/(sum(reca[i,])))
    reca[i,] <- normalized3
    
    normalized4 = (rpob[i,]/(sum(rpob[i,])))
    rpob[i,] <- normalized4
  }  
  
  #data_norm[,1:10] <- noda
  #data_norm[,11:28] <- nodd
  #data_norm[,29:37] <- reca
  #data_norm[,38:50] <- rpob
  
  data_norm <- cbind(rpob, reca, noda, nodd)
  # Normalize data by row
  #for (i in 1:nrow(data_norm)) {
  #  normalized2 = (data_norm[i,]/(sum(data_norm[i,])))
  #  data_norm[i,] <- normalized2
  #}  
  
  # Add sampleID back in data frame 
  data_norm$sampleID <- data$sampleID
  
  
  # Reshape from long to wide
  data_norm <- as.data.frame(data_norm)
  p1.data <- melt(data_norm, id='sampleID')
  
  # Create gene column for facetting 
  p2.data <- data.frame(do.call(rbind, strsplit(as.character(p1.data$variable), "")))
  p3.data <-  paste(p2.data$X1, p2.data$X2, p2.data$X3, p2.data$X4, sep='') 
  p1.data$gene <- p3.data  
  
  # Transform data with log or squareroot 
  p1.data$value <- log10(as.numeric(p1.data$value))
  #p1.data$value <- sqrt(p1.data$value)
  
  
  # Replace NAs with 0
  #p1.data[is.na(p1.data )] <- 0.0001
  #p1.data$value[is.infinite(p1.data$value)] <- 0.0001
  y = subset(p1.data, value > -100 )
  x = min(y$value)
  #p1.data[is.na(p1.data )] <- x
  p1.data$value[is.infinite(p1.data$value)] <- x
  
  # Use pretty row names and discard first column
  row.names(p1.data) <- p1.data$SampleID
  p1.data <- p1.data[ , !(names(p1.data) %in% c('SampleID'))]
  
  # Force order of facets to match list 
  # https://stackoverflow.com/questions/14262497/fixing-the-order-of-facets-in-ggplot
  p1.data$gene_f = factor(p1.data$gene, levels=c('rpoB','recA','nodA','nodD'))
  
  # Plot 
  p = ggplot(p1.data, aes(x=variable, y=sampleID, fill=value)) + 
    geom_tile() +
    #geom_tile(colour="white",size=0.05) +
    scale_y_discrete(expand=c(0,0)) +
    scale_x_discrete(expand=c(0,0)) +
    theme_bw() +
    facet_grid(~gene_f, switch = "x", scales = "free_x", space = "free_x") + 
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.title=element_text(size = 22),
          axis.text.y = element_text(size = 20),
          legend.text = element_text(size = 20),
          legend.title = element_blank(),
          strip.text.x = element_text(size = 20), 
          strip.text = element_text(face = "italic")) +
    labs(x="Amplicon", y = "Sample ID") +
    scale_fill_viridis()
  p
  
  # reverse direction of colour palette: scale_fill_distiller(palette="BrBG", trans = "reverse")
  
  #function to save specific ggplot graph.  additional parameters(width=,height=,units='cm')
  ggsave('dada2_heat_log.pdf', plot = p, width = 50, height = 20, unit = 'cm')
}