# RNA-Seq STAR mapping with Snakemake
#
# Read the description of this pipeline at https://evodify.com/rna-seq-star-snakemake/
# 
# To run this pipeline:
# snakemake -s Snakefile -j 100 --cluster-config cluster.yaml --cluster "sbatch -A {cluster.account} -t {cluster.time} -p {cluster.partition} -n {cluster.n}"
#
# cluster.yaml file is also available at https://evodify.com/rna-seq-star-snakemake/


SAMPLES, = glob_wildcards('/path/to/fastq/{sample}_L001_R1.fastq.gz') 

rule allout:
        input:
            directory('canFam3STAR'),
            expand('{sample}_pass1/SJ.out.tab', sample=SAMPLES),
            directory('SJ'),
            expand('SJ/{sample}_pass1SJ.filtered.tab', sample=SAMPLES),
            expand('{sample}_pass2/Aligned.sortedByCoord.out.bam', sample=SAMPLES),
            expand('{sample}_HTSeq_union_gff3_no_gene_ID.log', sample=SAMPLES),
            expand('{sample}_HTSeq.csv', sample=SAMPLES)
            
rule index:
        input:
            fa = 'canFam3.fa', # provide your reference FASTA file
            gtf = 'canFam3.gtf' # provide your GTF file
        output:
            directory('canFam3STAR') # you can rename the index folder
        threads: 20 # set the maximum number of available cores
        shell:
            'mkdir {output} && '
            'STAR --runThreadN {threads} '
            '--runMode genomeGenerate '
            '--genomeDir {output} '
            '--genomeFastaFiles {input.fa} '
            '--sjdbGTFfile {input.gtf} '
            '--sjdbOverhang 100'

rule pass1:
        input:
            R1L1 = 'fastq/{sample}/{sample}_L001_R1.fastq.gz', # may need adjustment if your fastq file name format is different
            R1L2 = 'fastq/{sample}/{sample}_L002_R1.fastq.gz', # note each sample has 4 fastq files ~ 2 lanes per file
            R2L1 = 'fastq/{sample}/{sample}_L001_R2.fastq.gz',
            R2L2 = 'fastq/{sample}/{sample}_L002_R2.fastq.gz',
            refdir = directory('canFam3STAR')
        params:
            outdir = '{sample}_pass1',
            rmbam = '{sample}_pass1/Aligned.out.bam'
        output:
            '{sample}_pass1/SJ.out.tab'
        threads: 20 # set the maximum number of available cores
        shell:
            'rm -rf {params.outdir} &&' # be careful with this. I don't know why, but Snakemake had problems without this cleaning.
            'mkdir {params.outdir} && ' # snakemake had problems finding output files with --outFileNamePrefix, so I used this approach instead
            'cd {params.outdir} && '
            'STAR --runThreadN {threads} '
            '--genomeDir {input.refdir} '
            '--readFilesIn {input.R1L1},{input.R1L2} {input.R2L1},{input.R2L2} '
            '--readFilesCommand zcat '
            '--outSAMtype BAM Unsorted && rm {params.rmbam} && cd ..'
            
rule SJdir:
        output:
            directory('SJ')
        threads: 1
        shell:
            'mkdir {output}'

rule filter:
        input:
            '{sample}_pass1/SJ.out.tab',
            directory('SJ')
        output:
            'SJ/{sample}_pass1SJ.filtered.tab'
        threads: 1
        shell:
            '''awk "{ { if (\$7 >= 3) print \$0 } }" {input[0]} > {input[0]}.filtered && '''
            'mv {input[0]}.filtered {output}'

rule pass2:
        input:
            R1L1 = 'fastq/{sample}/{sample}_L001_R1.fastq.gz',
            R1L2 = 'fastq/{sample}/{sample}_L002_R1.fastq.gz',
            R2L1 = 'fastq/{sample}/{sample}_L001_R2.fastq.gz',
            R2L2 = 'fastq/{sample}/{sample}_L002_R2.fastq.gz',
            SJfiles = 'SJ/{sample}_pass1SJ.filtered.tab',
            refdir = directory('canFam3STAR')
        params:
            outdir = '{sample}_pass2',
            id = '{sample}'
        output:
            '{sample}_pass2/Aligned.sortedByCoord.out.bam'
        threads: 20 # set the maximum number of available cores
        shell:
            'rm -rf {params.outdir} &&' # be careful with this. I don't know why, but Snakemake had problems without this cleaning.
            'mkdir {params.outdir} && '
            'cd {params.outdir} && '
            'STAR --runThreadN {threads} '
            '--genomeDir {input.refdir} '
            '--readFilesIn {input.R1L1},{input.R1L2} {input.R2L1},{input.R2L2} '
            '--readFilesCommand zcat '
            '--outSAMtype BAM SortedByCoordinate '
            '--sjdbFileChrStartEnd {input.SJfiles} '
            '--outSAMattrRGline ID:{params.id} '
            '--quantMode GeneCounts '

rule htseq:
        input:
            bam = '{sample}_pass2/Aligned.sortedByCoord.out.bam',
            gff = 'canFam3.gff3'
        output:
            '{sample}_HTSeq_union_gff3_no_gene_ID.log',
            '{sample}_HTSeq.csv'
        threads: 1
        shell:
            'htseq-count -m union -s no -t gene -i ID -r pos -f bam {input.bam} {input.gff} &> {output[0]} && '
            'grep ENS {output[0]} | sed "s/gene://g" > {output[1]}'