## Following DADA2 tutorial v1.8 ##
## Install DADA2 and applicable packages ##
# source("https://bioconductor.org/biocLite.R")
# biocLite("dada2")

## Load DADA2 package ##
library(dada2); packageVersion("dada2")

##################################################
##################### recA #######################
##################################################
{
  ## Define path for unzipped fastq read files ##
  path <- '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/recA/'
  list.files(path)

  ## String manipulation to get matched lists of the forward and reverse fastq files ##
  fnFs <- sort(list.files(path, pattern="L001_R1_001.fastq", full.names = TRUE))
  fnRs <- sort(list.files(path, pattern="L001_R2_001.fastq", full.names = TRUE))
  sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
  forward_plot <- plotQualityProfile(fnFs[1:4])
  reverse_plot <- plotQualityProfile(fnRs[1:4])

  ## Place filtered files in filtered/ subdirectory ##
  filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
  filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(250,250), matchIDs=TRUE,
                       maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                       compress=TRUE, multithread=TRUE)
  head(out)

  ## Learn error rates ##
  errF <- learnErrors(filtFs, multithread=TRUE)
  errR <- learnErrors(filtRs, multithread=TRUE)

  ## Plot error rates to check ##
  errorplot_f <- plotErrors(errF, nominalQ=TRUE)
  errorplot_r <- plotErrors(errR, nominalQ=TRUE)

  ## Dereplication ##
  derepFs <- derepFastq(filtFs, verbose=TRUE)
  derepRs <- derepFastq(filtRs, verbose=TRUE)
  # Name the derep-class objects by the sample names
  names(derepFs) <- sample.names
  names(derepRs) <- sample.names

  ## Sample inference ##
  dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
  dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
  dadaFs[[1]]

  ## Merge paired reads ##
  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
  head(mergers[[1]])

  ## Construct an amplicon sequence variant (ASV) table ##
  seqtab <- makeSequenceTable(mergers)
  dim(seqtab)
  # Inspect distribution of sequence lengths
  table(nchar(getSequences(seqtab)))

  ## Remove chimeras ##
  seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
  dim(seqtab.nochim)
  write.csv(seqtab.nochim, '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/asv_table_recA.csv')
  # Fraction of reads retained after chimera removal
  sum(seqtab.nochim)/sum(seqtab)
}

##################################################
##################### rpoB #######################
##################################################
{
  ## Define path for unzipped fastq read files ##
  path <- '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/rpoB/'
  list.files(path)

  ## String manipulation to get matched lists of the forward and reverse fastq files ##
  fnFs <- sort(list.files(path, pattern="L001_R1_001.fastq", full.names = TRUE))
  fnRs <- sort(list.files(path, pattern="L001_R2_001.fastq", full.names = TRUE))
  sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
  forward_plot <- plotQualityProfile(fnFs[1:4])
  reverse_plot <- plotQualityProfile(fnRs[1:4])

  ## Place filtered files in filtered/ subdirectory ##
  filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
  filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(250,250), matchIDs=TRUE,
                       maxN=0, maxEE=c(2,2), truncQ=2,
                       rm.phix=TRUE, compress=TRUE, multithread=TRUE)
  head(out)

  ## Learn error rates ##
  errF <- learnErrors(filtFs, multithread=TRUE)
  errR <- learnErrors(filtRs, multithread=TRUE)

  ## Plot error rates to check ##
  errorplot_f <- plotErrors(errF, nominalQ=TRUE)
  errorplot_r <- plotErrors(errR, nominalQ=TRUE)

  ## Dereplication ##
  derepFs <- derepFastq(filtFs, verbose=TRUE)
  derepRs <- derepFastq(filtRs, verbose=TRUE)
  # Name the derep-class objects by the sample names
  names(derepFs) <- sample.names
  names(derepRs) <- sample.names

  ## Sample inference ##
  dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
  dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
  dadaFs[[1]]

  ## Merge paired reads ##
  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
  head(mergers[[1]])

  ## Construct an amplicon sequence variant (ASV) table ##
  seqtab <- makeSequenceTable(mergers)
  dim(seqtab)
  # Inspect distribution of sequence lengths
  table(nchar(getSequences(seqtab)))

  ## Remove chimeras ##
  seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
  dim(seqtab.nochim)
  write.csv(seqtab.nochim, '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/asv_table_rpoB.csv')
  # Fraction of reads retained after chimera removal
  sum(seqtab.nochim)/sum(seqtab)
}

##################################################
##################### nodA #######################
##################################################
{
  ## Define path for unzipped fastq read files ##
  path <- '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/nodA/'
  list.files(path)

  ## String manipulation to get matched lists of the forward and reverse fastq files ##
  fnFs <- sort(list.files(path, pattern="L001_R1_001.fastq", full.names = TRUE))
  fnRs <- sort(list.files(path, pattern="L001_R2_001.fastq", full.names = TRUE))
  sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
  forward_plot <- plotQualityProfile(fnFs[1:4])
  reverse_plot <- plotQualityProfile(fnRs[1:4])

  ## Place filtered files in filtered/ subdirectory ##
  filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
  filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(250,250), matchIDs=TRUE,
                       maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                       compress=TRUE, multithread=TRUE)
  head(out)

  ## Learn error rates ##
  errF <- learnErrors(filtFs, multithread=TRUE)
  errR <- learnErrors(filtRs, multithread=TRUE)

  ## Plot error rates to check ##
  errorplot_f <- plotErrors(errF, nominalQ=TRUE)
  errorplot_r <- plotErrors(errR, nominalQ=TRUE)

  ## Dereplication ##
  derepFs <- derepFastq(filtFs, verbose=TRUE)
  derepRs <- derepFastq(filtRs, verbose=TRUE)
  # Name the derep-class objects by the sample names
  names(derepFs) <- sample.names
  names(derepRs) <- sample.names

  ## Sample inference ##
  dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
  dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
  dadaFs[[1]]

  ## Merge paired reads ##
  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
  head(mergers[[1]])

  ## Construct an amplicon sequence variant (ASV) table ##
  seqtab <- makeSequenceTable(mergers)
  dim(seqtab)
  # Inspect distribution of sequence lengths
  table(nchar(getSequences(seqtab)))

  ## Remove chimeras ##
  seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
  dim(seqtab.nochim)
  write.csv(seqtab.nochim, '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/asv_table_nodA.csv')
  # Fraction of reads retained after chimera removal
  sum(seqtab.nochim)/sum(seqtab)
}
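
## Optional sketch (not in the original script): track reads through the pipeline,      ##
## following the read-tracking step in the DADA2 tutorial. It assumes the objects from  ##
## the most recently run locus block (out, dadaFs, dadaRs, mergers, seqtab.nochim) are  ##
## still in memory, so run it at the end of a block before they are overwritten.        ##
getN <- function(x) sum(getUniques(x))
track <- cbind(out,
               sapply(dadaFs, getN),    # reads per sample after forward denoising
               sapply(dadaRs, getN),    # reads per sample after reverse denoising
               sapply(mergers, getN),   # reads per sample after merging pairs
               rowSums(seqtab.nochim))  # reads per sample after chimera removal
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track)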
##################################################
##################### nodD #######################
##################################################
{
  ## Define path for unzipped fastq read files ##
  path <- '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/nodD/'
  list.files(path)

  ## String manipulation to get matched lists of the forward and reverse fastq files ##
  fnFs <- sort(list.files(path, pattern="L001_R1_001.fastq", full.names = TRUE))
  fnRs <- sort(list.files(path, pattern="L001_R2_001.fastq", full.names = TRUE))
  sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
  forward_plot <- plotQualityProfile(fnFs[1:4])
  reverse_plot <- plotQualityProfile(fnRs[1:4])

  ## Place filtered files in filtered/ subdirectory ##
  filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
  filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(250,250), matchIDs=TRUE,
                       maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                       compress=TRUE, multithread=TRUE)
  head(out)

  ## Learn error rates ##
  errF <- learnErrors(filtFs, multithread=TRUE)
  errR <- learnErrors(filtRs, multithread=TRUE)

  ## Plot error rates to check ##
  errorplot_f <- plotErrors(errF, nominalQ=TRUE)
  errorplot_r <- plotErrors(errR, nominalQ=TRUE)

  ## Dereplication ##
  derepFs <- derepFastq(filtFs, verbose=TRUE)
  derepRs <- derepFastq(filtRs, verbose=TRUE)
  # Name the derep-class objects by the sample names
  names(derepFs) <- sample.names
  names(derepRs) <- sample.names

  ## Sample inference ##
  dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
  dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
  dadaFs[[1]]

  ## Merge paired reads ##
  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
  head(mergers[[1]])

  ## Construct an amplicon sequence variant (ASV) table ##
  seqtab <- makeSequenceTable(mergers)
  dim(seqtab)
  # Inspect distribution of sequence lengths
  table(nchar(getSequences(seqtab)))

  ## Remove chimeras ##
  seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
  dim(seqtab.nochim)
  write.csv(seqtab.nochim, '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/asv_table_nodD.csv')
  # Fraction of reads retained after chimera removal
  sum(seqtab.nochim)/sum(seqtab)
}
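
## Optional refactor sketch (not part of the original workflow): the four locus blocks ##
## above run an identical pipeline, so the shared steps could be wrapped in a helper.  ##
## The function name run_dada2_locus and its arguments are illustrative assumptions;   ##
## parameters mirror the calls used above.                                             ##
run_dada2_locus <- function(path, asv_csv, truncLen = c(250, 250)) {
  # Matched forward/reverse read files and sample names
  fnFs <- sort(list.files(path, pattern="L001_R1_001.fastq", full.names = TRUE))
  fnRs <- sort(list.files(path, pattern="L001_R2_001.fastq", full.names = TRUE))
  sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)

  # Filter and trim into a filtered/ subdirectory
  filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
  filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
  out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=truncLen, matchIDs=TRUE,
                       maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                       compress=TRUE, multithread=TRUE)

  # Learn error rates, dereplicate, and infer sample composition
  errF <- learnErrors(filtFs, multithread=TRUE)
  errR <- learnErrors(filtRs, multithread=TRUE)
  derepFs <- derepFastq(filtFs, verbose=TRUE); names(derepFs) <- sample.names
  derepRs <- derepFastq(filtRs, verbose=TRUE); names(derepRs) <- sample.names
  dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
  dadaRs <- dada(derepRs, err=errR, multithread=TRUE)

  # Merge pairs, build the ASV table, remove chimeras, and write the CSV
  mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
  seqtab <- makeSequenceTable(mergers)
  seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
                                      multithread=TRUE, verbose=TRUE)
  write.csv(seqtab.nochim, asv_csv)
  invisible(seqtab.nochim)
}

## Example (hypothetical call mirroring the recA block above):
# run_dada2_locus('/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/recA/',
#                 '/Users/sara91/Documents/PhD/Materials_and_methods/QQAD/methods_paper/dada2/synth_mix/asv_table_recA.csv')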