######## sea_cucumber ###### check ram usage watch -n 1 free -m 3 ######## conserved miRNA analysis ###### adapter trimming gzcat SRR026762.fastq.gz | head -n 20000 | grep ATCTCGTATGCCGTCTTCTGCTTG ## trimming with fastq-mcf fastq-mcf ../../reference/adapters_sRNA.fa SRR1027675DA.fastq.gz SRR1027674NA.fastq.gz\ -o SRR1027675DA_trim.fastq -o SSRR1027674NA_trim.fastq\ -q 30 -P 33 -l 15 -k 0 --max-ns 1 # -k 0 means 0 percent is skewed fastq-mcf ../adapters_sRNA.fa transcriptome_pooled.fastq -o transcriptome_pooled_trim.fastq -q 30 -P 33 -l 15 -k 0 --max-ns 1 ###### move specific files sudo su find -type f | grep _piRNA.fasta |xargs sudo mv -t ../../reference/negative_ref exit ###### grab the non-spaced content grep -v "^$" mmu_piRNA.fasta > mmu_piRNA_w.fasta ###### chagne U to T # part of fastx-toolkit fasta_nucleotide_changer -i ./piRNA.fasta -o ./piRNA_DNA.fasta -d #### a loop for changing U to T # DO NOT RUN for files in ./*.fasta do grep -v "^$" "$files" > temp.fasta; fasta_nucleotide_changer -i ./temp.fasta -o ../piRNA_DNA.fasta. -d done ###### combine fasta files for files in ./*.fasta do cat "$files"; done > ./piRNA.fasta ###### bowtie index building sudo bowtie-build neg_ref.fasta neg_ref.ebwt # combined miRBase ref sudo bowtie-build mature_DNA.fa miRNA_ref.ebwt # mature miRNA ref sudo bowtie-build hairpin_DNA.fa hairpin_ref.ebwt # hairpin ref ###### bowtie alignment to remove neg RNAs # output file naming scheme follows: sampleID_trim_flt.fastq # bowtie mistmach is set to the default (-n 2). Up to 2 mistaches in the seed region sudo bowtie -p 8 -q neg_ref.ebwt ../../raw/resp_tree/SRR1027674NA_trim.fastq --un ../../output/SRR1027674NA_trim_flt.fastq --al ./old/SRR1027674NA_trim_neg_align.fastq # reads processed: 12235347 # reads with at least one reported alignment: 3730139 (30.49%) # reads that failed to align: 8505208 (69.51%) # Reported 3730139 alignments to 1 output stream(s) sudo bowtie -p 8 -q neg_ref.ebwt ../../raw/resp_tree/SRR1027675DA_trim.fastq --un ../../output/SRR1027675DA_trim_flt.fastq --al ./old/SRR1027675DA_trim_neg_align.fastq # reads processed: 12993723 # reads with at least one reported alignment: 4754900 (36.59%) # reads that failed to align: 8238823 (63.41%) # Reported 4754900 alignments to 1 output stream(s) ###### download miRBase references sudo wget ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz # mature miRNA sudo wget ftp://mirbase.org/pub/mirbase/CURRENT/hairpin.fa.gz # hairpin sudo gunzip mature.fa.gz # unpack gz files sudo fasta_formatter hairpin.fa hairpin_sg.fa # conver multi-line fasta to single line fasta ###### change multi-line fasta into single-line fasta # part of fastx-toolkit sudo fasta_formatter -i hairpin.fa -o hairpin_sg.fa # the default is to change into signle line ###### manually change all the U to T # make sure there are no mis-edittings # 's/U/T/g' "s" means "substitute"; "U/T" means "change from U to T"; "g" means "global": apply for the whole file sudo sed -i 's/U/T/g' hairpin.fa sudo sed -i '.bak' 's/U/T/g' hairpin.fa # OSX version of sed requires an extension for the backup file ('.bak') with the -i flag ###### combine multiple files without using for loop cat file1 file2 file3 > combined ###### align reads with miRBase # make sure to navegate to reference folder ## -l 20 -n 0 -v 2 -a --best --strata (-l seed sequence length: 20; -n maximum seed sequence mistmach: 0; -v aglinment mistmach: 2; -a report all valid results; --best --strata report best results when alinged to multiple regions) ## control sudo bowtie -p 8 -q -l 20 -n 0 -v 2 -a --best --strata miRBase_ref.ebwt ../../output/SRR1027674NA_trim_flt.fastq --al ../../output/SRR1027674NA_miRBase.fastq --un ../../output/SRR1027674NA_miRBase_unaligned.fastq # reads processed: 8505208 # reads with at least one reported alignment: 615570 (7.24%) # reads that failed to align: 7889638 (92.76%) # Reported 3552759 alignments to 1 output stream(s) ## stressed sudo bowtie -p 8 -q -l 20 -n 0 -v 2 -a --best --strata miRBase_ref.ebwt ../../output/SRR1027675DA_trim_flt.fastq --al ../../output/SRR1027675DA_miRBase.fastq --un ../../output/SRR1027675DA_miRBase_unaligned.fastq # reads processed: 8238823 # reads with at least one reported alignment: 779581 (9.46%) # reads that failed to align: 7459242 (90.54%) # Reported 4442407 alignments to 1 output stream(s) #### mature miRNA only alignment # make sure to navegate to reference folder ## -l 20 -n 0 -v 2 -a --best --strata (-l seed sequence length: 20; -n maximum seed sequence mistmach: 0; -v aglinment mistmach: 2; -a report all valid results; --best --strata report best results when alinged to multiple regions) ## control # align and output bam file # bowtie -S output sam file # samtools view arguments: -S input is sam file; -b output is bam file; -h with header; -o output file name # make sure to take the unaligned sequences as the input for the hairpin, to avoid aligning sea cucumber mature RNA to the hairpin again sudo su bowtie -p 8 -q -S -l 20 -n 0 -v 2 -a --best --strata miRNA_ref.ebwt ../../output/SRR1027674NA_trim_flt.fastq --al ../../output/SRR1027674NA_miRBase_miRNA.sam | samtools view -S -b -h - > ../../output/SRR1027674NA_miRBase_miRNA.bam # mature miRNA exit # [samopen] SAM header is present: 35828 sequences. # reads processed: 8505208 # reads with at least one reported alignment: 559039 (6.57%) # reads that failed to align: 7946169 (93.43%) # Reported 1710001 alignments to 1 output stream(s) ## stressed # output bam format # bowtie -S output sam file # samtools view arguments: -S input is sam file; -b output is bam file; -h with header; -o output file name sudo su bowtie -p 8 -q -S -l 20 -n 0 -v 2 -a --best --strata miRNA_ref.ebwt ../../output/SRR1027675DA_trim_flt.fastq --al ../../output/SRR1027675DA_miRBase_miRNA.sam | samtools view -S -b -h - > ../../output/SRR1027675DA_miRBase_miRNA.bam # mature miRNA exit # [samopen] SAM header is present: 35828 sequences. # reads processed: 8238823 # reads with at least one reported alignment: 708071 (8.59%) # reads that failed to align: 7530752 (91.41%) # Reported 2140775 alignments to 1 output stream(s) #### hairpin only alignment ## control # generate the unaligned sequence file for hairpin alignment sudo bowtie -p 8 -q -l 20 -n 0 -v 2 -a --best --strata miRNA_ref.ebwt ../../output/SRR1027674NA_trim_flt.fastq --un ../../output/SRR1027674NA_miRBase_miRNA_unaglined.fastq # align sudo bowtie -p 8 -q -S -l 20 -n 0 -v 2 -a --best --strata hairpin_ref.ebwt ../../output/SRR1027674NA_miRBase_miRNA_unaglined.fastq --al ../../output/SRR1027674NA_miRBase_hairpin.sam | samtools view -S -b -h - > ../../output/SRR1027674NA_miRBase_hairpin.bam # hairpin # [samopen] SAM header is present: 28645 sequences. # reads processed: 7946169 # reads with at least one reported alignment: 56531 (0.71%) # reads that failed to align: 7889638 (99.29%) # Reported 116815 alignments to 1 output stream(s) ## stressed # generate the unaligned sequence file for hairpin alignment sudo bowtie -p 8 -q -l 20 -n 0 -v 2 -a --best --strata miRNA_ref.ebwt ../../output/SRR1027675DA_trim_flt.fastq --un ../../output/SRR1027675DA_miRBase_miRNA_unaglined.fastq # align sudo bowtie -p 8 -q -S -l 20 -n 0 -v 2 -a --best --strata hairpin_ref.ebwt ../../output/SRR1027675DA_miRBase_miRNA_unaglined.fastq --al ../../output/SRR1027675DA_miRBase_hairpin.sam | samtools view -S -b -h - > ../../output/SRR1027675DA_miRBase_hairpin.bam # hairpin # [samopen] SAM header is present: 28645 sequences. # reads processed: 7530752 # reads with at least one reported alignment: 71510 (0.95%) # reads that failed to align: 7459242 (99.05%) # Reported 147465 alignments to 1 output stream(s) ###### count reads # make sure to navegate to output folder ## control # SRR1027674NA_miRBase_miRNA.bam is the mature miRNA file samtools sort -n SRR1027674NA_miRBase_miRNA.bam SRR1027674NA_miRBase_miRNA_sort_name # sort by name sudo su samtools view SRR1027674NA_miRBase_miRNA_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027674NA_miRBase_miRNA.txt exit # SRR1027674NA_miRBase_hairpin.bam is the hairpin file samtools sort -n SRR1027674NA_miRBase_hairpin.bam SRR1027674NA_miRBase_hairpin_sort_name # sort by name sudo su samtools view SRR1027674NA_miRBase_hairpin_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027674NA_miRBase_hairpin.txt exit ## stressed # SRR1027675DA_miRBase_miRNA.bam is the mature miRNA files samtools sort -n SRR1027675DA_miRBase_miRNA.bam SRR1027675DA_miRBase_miRNA_sort_name # sort by name sudo su samtools view SRR1027675DA_miRBase_miRNA_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027675DA_miRBase_miRNA.txt exit # SRR1027675DA_miRBase_hairpin.bam is the hairpin file samtools sort -n SRR1027675DA_miRBase_hairpin.bam SRR1027675DA_miRBase_hairpin_sort_name # sort by name sudo su samtools view SRR1027675DA_miRBase_hairpin_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027675DA_miRBase_hairpin.txt exit ######## novel miRNA prediction ###### align to miRBase hairpin to get trainning set (potential/unknown hairpin), and to get unligned for transcriptome alignment #### align to miRBase-ref hairpin without aligning to mature miRNA ref prior. also, not perfect match (2 mistmaches). # make sure to navegate to reference folder ## -l 20 -n 0 -v 2 -a --best --strata (-l seed sequence length: 20; -n maximum seed sequence mistmach: 0; -v aglinment mistmach: 2; -a report all valid results; --best --strata report best results when alinged to multiple regions) ## control sudo bowtie -p 8 -q -l 20 -n 0 -v 2 -a --best --strata hairpin_ref.ebwt ../../output/SRR1027674NA_trim_flt.fastq --al ../../output/SRR1027674NA_hairpin_train.fastq --un ../../output/SRR1027674NA_hairpin_unaligned_test.fastq # the unaglined file will be used for transcriptome aglinment # reads processed: 8505208 # reads with at least one reported alignment: 615570 (7.24%) # reads that failed to align: 7889638 (92.76%) # Reported 1867278 alignments to 1 output stream(s) # output bam file for trainning set sudo su bowtie -p 8 -q -S -l 20 -n 0 -v 2 -a --best --strata hairpin_ref.ebwt ../../output/SRR1027674NA_trim_flt.fastq --al ../../output/SRR1027674NA_hairpin_train.sam | samtools view -S -b -h - > ../../output/SRR1027674NA_hairpin_train.bam # hairpin trainning set exit ## stressed sudo bowtie -p 8 -q -l 20 -n 0 -v 2 -a --best --strata hairpin_ref.ebwt ../../output/SRR1027675DA_trim_flt.fastq --al ../../output/SRR1027675DA_hairpin_train.fastq --un ../../output/SRR1027675DA_hairpin_unaligned_test.fastq # reads processed: 8238823 # reads with at least one reported alignment: 779569 (9.46%) # reads that failed to align: 7459254 (90.54%) # Reported 779569 alignments to 1 output stream(s) # output bam file for trainning set sudo su sudo bowtie -p 8 -q -S -l 20 -n 0 -v 2 -a --best --strata hairpin_ref.ebwt ../../output/SRR1027675DA_trim_flt.fastq --al ../../output/SRR1027675DA_hairpin_train.sam | samtools view -S -b -h - > ../../output/SRR1027675DA_hairpin_train.bam exit #### count reads # make sure to navegate to output folder ## control samtools sort -n SRR1027674NA_hairpin_train.bam SRR1027674NA_hairpin_train_sort_name # sort by name sudo su samtools view SRR1027674NA_hairpin_train_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027674NA_hairpin_train.txt exit ## stressed samtools sort -n SRR1027675DA_hairpin_train.bam SRR1027675DA_hairpin_train_sort_name # sort by name sudo su samtools view SRR1027675DA_hairpin_train_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027675DA_hairpin_train.txt exit ##### align to trnascriptome to get test set (potential/unknown hairpin) ### prepare reference transcriptome ## download the raw reads for transcriptome sudo su fastq-dump -Z SRR414930 > ./transcriptome.fastq # part of the sratoolkit exit ## fastqc, trim adapters and filter low quality reads sudo fastq-mcf ../adapters_sRNA.fa transcriptome_pooled.fastq -o transcriptome_pooled_trim.fastq -q 30 -P 33 -l 15 -k 0 --max-ns 1 ## convert from fastq to fasta sudo fastq_to_fasta -i trinitytest.fastq -o trinitytest.fasta ## De novo assembly awk '{if(NR%4==3) $0=sprintf("'"+${index}_%d"'",(1+i++)); print;}' $transcriptome_pooled_trim.fastq | awk '{if(NR%4==1) $0=sprintf("'"@${index}_%d"'",(1+i++)); print;}' >$fo_ok Trinity --seqType fa --max_memory 30G --normalize_reads --single trinitytest.fasta --CPU 2 --min_contig_length 80 sudo mv Trinity.fasta asmbld_transcriptome.fasta # rename ## download EST for the animals from NCBI (e.g., filename: sequence.fasta) # e.g., Apostichopus japonicus and Holothuria glaberrima ## change multi-line fasta into single-line fasta # part of fastx-toolkit sudo fasta_formatter -i asmbld_transcriptome.fasta -o asmbld_transcriptome_sg.fasta # the default is to change into signle line sudo fasta_formatter -i sequence.fasta -o sequence_sg.fasta # the default is to change into signle line sudo fasta_formatter -i positive_training.fasta -o positive_training_sg.fasta ## combine the assmebled transcriptome and EST files sudo cat asmbld_transcriptome_sg.fasta sequence_sg.fasta > transcriptomeRef.fasta ## build transcriptome index sudo bowtie-build transcriptomeRef.fasta transcriptomeRef.ebwt ## align to transcriptome # -t report run time; # control sudo su bowtie -p 8 -q -S -l 20 -n 0 -v 2 -t transcriptomeRef.ebwt ../../output/SRR1027674NA_hairpin_unaligned_test.fastq --al ../../output/SRR1027674NA_hairpin_transcriptome.sam | samtools view -S -b -h - > ../../output/SRR1027674NA_hairpin_transcriptome.bam exit # Time loading forward index: 00:00:05 # Time loading mirror index: 00:00:04 # [samopen] SAM header is present: 1000686 sequences. # End-to-end 2/3-mismatch full-index search: 00:01:58 # reads processed: 7889638 # reads with at least one reported alignment: 4775359 (60.53%) # reads that failed to align: 3114279 (39.47%) # Reported 4775359 alignments to 1 output stream(s) # Time searching: 00:02:10 # Overall time: 00:02:10 # stressed sudo su bowtie -p 8 -q -S -l 20 -n 0 -v 2 -t transcriptomeRef.ebwt ../../output/SRR1027675DA_hairpin_unaligned_test.fastq --al ../../output/SRR1027675DA_hairpin_transcriptome.sam | samtools view -S -b -h - > ../../output/SRR1027675DA_hairpin_transcriptome.bam exit # Time loading forward index: 00:00:04 # Time loading mirror index: 00:00:04 # [samopen] SAM header is present: 1000686 sequences. # End-to-end 2/3-mismatch full-index search: 00:02:02 # reads processed: 7459242 # reads with at least one reported alignment: 4025252 (53.96%) # reads that failed to align: 3433990 (46.04%) # Reported 4025252 alignments to 1 output stream(s) # Time searching: 00:02:13 # Overall time: 00:02:13 ## sort bam by name # make sure to go to the output folder samtools sort -n SRR1027674NA_hairpin_transcriptome.bam SRR1027674NA_hairpin_transcriptome_sort_name # control samtools sort -n SRR1027675DA_hairpin_transcriptome.bam SRR1027675DA_hairpin_transcriptome_sort_name # stressed ## count reads # control sudo su samtools view SRR1027674NA_hairpin_transcriptome_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027674NA_hairpin_transcriptome.txt exit # stressed sudo su samtools view SRR1027675DA_hairpin_transcriptome_sort_name.bam | awk '{print $3}' | sort | uniq -c | sort -nr > SRR1027675DA_hairpin_transcriptome.txt exit # replace the "|" with "_" in the fasta reference file sudo sed -i 's/[|]/_/g' testRef.fasta sudo sed -i '.bak' 's/[|]/_/g' testRef.fasta # OSX version of sed requires an extension for the backup file ('.bak') with the -i flag