# Protocol for UNOISE

# The protocol below is based on the following links:
# https://www.drive5.com/usearch/manual/pipe_readprep.html
# https://www.drive5.com/usearch/manual/ex_min2.html

# First define sample names
# Merge (assemble) paired reads
# https://www.drive5.com/usearch/manual/cmd_fastq_mergepairs.html
# Only the forward (R1) file is passed and no -reverse option is given; per the
# usearch manual the reverse file name is then derived by replacing R1 with R2.
# Merged reads are written next to the input, with "R1_001.fastq" in the path
# replaced by "merged.fq". Per the usearch manual, -relabel @ labels the reads
# with the sample name taken from the file name.
for f in 1_raw_reads/*R1_001.fastq; do
  # An unmatched glob stays literal; report it instead of handing usearch a bogus path.
  if [[ ! -e "$f" ]]; then
    printf '%s\n' 'No forward read files match 1_raw_reads/*R1_001.fastq' >&2
    continue
  fi
  # Quote the path so directories/files containing spaces are passed as one argument.
  usearch -fastq_mergepairs "$f" -fastqout "${f/R1_001.fastq/merged.fq}" -relabel @
done

# Split the merged reads into one set per gene using the MAUI-seq gene-separating script.
# Requires a list.txt file listing all the merged read files to split.
# The script is invoked with no arguments.
# NOTE(review): presumably list.txt is read from the current working directory — confirm against the script.
python3 sort_qqad_genes_1b.py

# Move the genes into their own separate files, then continue with the following commands for each gene within each gene file.


# Remove primers from the reads using the remove_primers.py script. Moves primerless reads into a directory called 'forward'.
# The script is run once per FASTQ file in the current directory, with the file name as its only argument.
for f in *.fastq; do
  # An unmatched glob stays literal; report it instead of passing "*.fastq" to the script.
  if [[ ! -e "$f" ]]; then
    printf '%s\n' 'No *.fastq files found for primer removal' >&2
    continue
  fi
  # Quote the file name so names containing spaces or glob characters stay a single argument.
  python3 remove_primers.py "$f"
done

# Remove sequences whose length differs from the expected amplicon size. A few sequences after primer trimming are smaller than the expected amplicon size.
# Therefore it is best to remove them.
# The cut off length should be different for each gene. e.g: rpoB = 253, recA = 251, nodD = 253, nodA = 259.
# From this point onwards nodD is used as an example.

# Expected amplicon length (in bases) for the gene being processed; change this per gene.
amplicon_len=253

# filter_fastq_by_length FILE LEN
# Print to stdout only those FASTQ records of FILE whose sequence line is
# exactly LEN characters long. Records are assumed to be 4 lines each
# (header, sequence, separator, quality). Each record is read line by line, so
# whitespace inside the header line cannot shift the field that is measured.
filter_fastq_by_length() {
  awk -v len="$2" '
    NR % 4 == 1 { header = $0 }
    NR % 4 == 2 { bases = $0 }
    NR % 4 == 3 { separator = $0 }
    NR % 4 == 0 && length(bases) == len {
      print header
      print bases
      print separator
      print $0
    }
  ' < "$1"
}

# Writes X_nodd_trim.fastq for every X_nodd.fastq in the current directory.
for f in *_nodd.fastq; do
  # An unmatched glob stays literal; skip it rather than creating a junk "*_nodd_trim.fastq" file.
  if [[ ! -e "$f" ]]; then
    printf '%s\n' 'No *_nodd.fastq files found for length filtering' >&2
    continue
  fi
  filter_fastq_by_length "$f" "$amplicon_len" > "${f/.fastq/_trim.fastq}"
done

# Quality filter reads
# https://www.drive5.com/usearch/manual/pipe_readprep_filter.html
# Per the usearch manual, -fastq_maxee 1.0 discards reads with more than 1.0
# total expected errors; the surviving reads are written as FASTA (-fastaout)
# to X_nodd_filtered.fa for each X_nodd_trim.fastq.
# The glob matches the *_nodd_trim.fastq files written by the length-filtering
# step (the same pattern that is pooled later for the ZOTU table).
for f in *_nodd_trim.fastq; do
  # An unmatched glob stays literal; report it instead of handing usearch a bogus path.
  if [[ ! -e "$f" ]]; then
    printf '%s\n' 'No *_nodd_trim.fastq files found for quality filtering' >&2
    continue
  fi
  # Quote the file name so names containing spaces stay a single argument.
  usearch -fastq_filter "$f" -fastq_maxee 1.0 -fastaout "${f/_trim.fastq/_filtered.fa}"
done

# Pooling samples
# Combine all reads for all samples, for generating OTUs and making an OTU table.
# Pool samples, i.e. concatenate reads for all samples that were sequenced in the same run.
# https://www.drive5.com/usearch/manual/pool_samples.html
# The glob picks up every file in the current directory whose name ends in
# "filtered.fa", so run this where only the current gene's filtered files are present.
# The output name ends in "allsamples.fa", so it is not matched by the glob on a re-run.
cat *filtered.fa > filtered_concat_allsamples.fa

# Dereplication: find the unique read sequences in the pooled reads and their abundances.
# https://www.drive5.com/usearch/manual/upp_derep.html
# https://www.drive5.com/usearch/manual/cmd_fastx_uniques.html
# Notes taken from the usearch manual:
#   * cluster_otus and unoise3 require a set of unique sequences as input, sorted by
#     decreasing abundance, with size annotations in the labels.
#   * The input to fastx_uniques should be the reads after any quality filtering or length trimming.
#   * -relabel Uniq is the suggested setting; unique sequences are labelled Uniq1, Uniq2 and so on.
#   * -fastaout names the FASTA output file for the unique sequences (sorted by decreasing abundance).
#   * -minuniquesize (not set here) sets a minimum abundance; unique sequences below it are
#     discarded. The default is 1, i.e. all unique sequences are output.
#   * -strand both (not set here) enables reverse-complemented matching for nucleotide sequences.
usearch -fastx_uniques filtered_concat_allsamples.fa \
  -sizeout \
  -relabel Uniq \
  -fastaout nodD_zotus_uniques.fa \
  -tabbedout nodD_zotus_uniqueclustering_summary.txt

# Denoise the unique sequences with unoise3 to produce ZOTUs.
# https://www.drive5.com/usearch/manual/pipe_otus.html
# https://www.drive5.com/usearch/manual/cmd_unoise3.html
#   input     : nodD_zotus_uniques.fa (output of the dereplication step)
#   -zotus    : FASTA file receiving the ZOTU sequences
#   -tabbedout: tabbed text file reporting the processing done for each sequence,
#               e.g. whether it was classified as noisy or chimeric
usearch -unoise3 nodD_zotus_uniques.fa \
  -zotus nodD_zotus.fa \
  -tabbedout nodD_unoise3.txt

# Concatenate all read files that have had primers removed (but before any quality filtering etc.). This will be used as input for making the ZOTU table.
# The *_nodd_trim.fastq inputs are the files written by the amplicon-length filtering step,
# i.e. primerless reads of the expected length that have not yet been through -fastq_filter.
# NOTE(review): the output holds FASTQ records despite its .fa extension — confirm that
# usearch -otutab detects the format from the file content rather than from the name.
cat *_nodd_trim.fastq > noprimers_concat_allsamples.fa

# Build the ZOTU table by mapping the pooled reads onto the ZOTU sequences.
# https://www.drive5.com/usearch/manual/pipe_otutab.html
# Per the usearch manual, the otutab input should be the reads before quality filtering and
# before discarding low-abundance unique sequences, e.g. singletons.
#   input     : noprimers_concat_allsamples.fa (pooled reads)
#   -zotus    : ZOTU sequences produced by unoise3
#   -otutabout: the resulting ZOTU table
#   -mapout   : written to nodD_zotus_zmap.txt
usearch -otutab noprimers_concat_allsamples.fa \
  -zotus nodD_zotus.fa \
  -otutabout nodD_zotutable_raw.txt \
  -mapout nodD_zotus_zmap.txt