5 Advanced challenge

You are given a set of TSV files. Each file contains a single column with the header “Entry” containing a set of UniProt protein IDs belonging to a single species (different for each file). You are given a sample sheet containing a list of animal species and the path to the TSV file containing UniProt IDs for that species. Finally, you are given a CSV file containing all the natural amino acids found in proteins with their one-letter codes, three-letter codes, common names, and whether they have a polar or apolar side chain. Your goal is to produce a box plot showing, for each species, the distribution of the fraction of polar versus apolar amino acids (across the proteins provided). You should produce a plot like the one following and place it in the folder ../nextflow_output/challenge (relative to your project directory).

Your input files will be created in the folder ../nextflow_inputs (after running the commands in the Setup section), and you can explore them manually to see their structure. You may use any programming language for the task, but you should embed any computational step in a Nextflow process. It is preferable if you split the work into as many independent processes as possible, instead of doing everything in a monolithic process. It may be needed to use some functionality of the Groovy language and Nextflow that we did not discuss in order to complete this challenge.

Click here to see some hints

Hint 1: you can retrieve protein sequences from UniProt programmatically by running the following (replacing <uniprot-id> with the actual UniProt ID of the protein you want to download)

wget https://rest.uniprot.org/uniprotkb/<uniprot-id>.fasta

Hint 2: you may want to look up the flatMap channel operator and the collect method for Groovy lists

5.1 Setup

Create the required sample sheet file by running the following command from your project directory (make sure that the folder ../nextflow_inputs exists before running the command)

echo "filename,species" > ../nextflow_inputs/samplesheet.csv
echo "$(pwd)/../nextflow_inputs/arabidopsis.tsv,Arabidopsis taliana" >> ../nextflow_inputs/samplesheet.csv
echo "$(pwd)/../nextflow_inputs/mouse.tsv,Mus musculus" >> ../nextflow_inputs/samplesheet.csv
echo "$(pwd)/../nextflow_inputs/zebrafish.tsv,Danio rerio" >> ../nextflow_inputs/samplesheet.csv

Download the required data files by running the following command (as before, make sure that the folder ../nextflow_inputs exists before running the command)

wget -P ../nextflow_inputs https://raw.githubusercontent.com/saulpierotti-ebi/saulpierotti-ebi.github.io/main/assets/arabidopsis.tsv
wget -P ../nextflow_inputs https://raw.githubusercontent.com/saulpierotti-ebi/saulpierotti-ebi.github.io/main/assets/zebrafish.tsv
wget -P ../nextflow_inputs https://raw.githubusercontent.com/saulpierotti-ebi/saulpierotti-ebi.github.io/main/assets/mouse.tsv
wget -P ../nextflow_inputs https://raw.githubusercontent.com/saulpierotti-ebi/saulpierotti-ebi.github.io/main/assets/polar_apolar_table.csv

5.2 Solution

You can see the solution to this challenge by expanding the following hidden sections.

Click here to see the solution (main.nf file)

nextflow.enable.dsl = 2

process get_sequences {
    input:
        tuple(
            val(species),
            path(id_list)
        )

    output:
        tuple(
            val(species),
            path("*.fasta")
        )

    script:
        """
        cat $id_list \
            | tail -n +2 \
            | xargs -I{} wget https://rest.uniprot.org/uniprotkb/{}.fasta
        """
}

process count_aa {
    label "r_tidyverse"

    input:
        tuple(
            val(protein),
            val(species),
            path(sequence)
        )

    output:
        tuple(
            val(protein),
            val(species),
            path("${sequence.simpleName}.aa_counts.csv")
        )

    script:
        """
        #!/usr/bin/env Rscript

        library("tidyverse")

        df <- read_lines("${sequence}", skip=1) %>%
            paste(collapse = '') %>%
            strsplit(split = '') %>%
            table() %>%
            as_tibble() %>%
            rename(residue = ".") %>%
            mutate(
                freq = n / sum(n),
                protein = "${protein}",
                species = "${species}"
            )

        write_csv(df, "${sequence.simpleName}.aa_counts.csv")
        """
}

process count_by_polarity {
    label "r_tidyverse"

    input:
        tuple(
            val(protein),
            val(species),
            path(aa_counts),
            path(polar_apolar_table)
        )

    output:
        tuple(
            val(protein),
            val(species),
            path("${aa_counts.simpleName}.polarity_counts.csv")
        )

    script:
        """
        #!/usr/bin/env Rscript

        library("tidyverse")

        pol_table <- read_csv("${polar_apolar_table}") %>%
            summarise(
                residue = single_letter_code,
                polarity
            )

        df <- read_csv("${aa_counts}") %>%
            left_join(pol_table) %>%
            group_by(polarity, species, protein) %>%
            summarise(freq = sum(freq))

        write_csv(df, "${aa_counts.simpleName}.polarity_counts.csv")
        """
}

process aggregate_results {
    label "r_tidyverse"

    input:
        path "*.csv"

    output:
        path "aggregated_polarity_counts.csv"

    script:
        """
        #!/usr/bin/env Rscript

        library("tidyverse")

        input_files <- list.files(pattern = "*.csv")
        df <- map_dfr(input_files, read_csv)

        write_csv(df, "aggregated_polarity_counts.csv")
        """
}

process make_plot {
    label "r_tidyverse"
    publishDir "../nextflow_output/challenge"

    input:
        path aggregated_polarity_counts

    output:
        path "final_plot.pdf"

    script:
        """
        #!/usr/bin/env Rscript

        library("tidyverse")

        df <- read_csv("${aggregated_polarity_counts}")

        ggplot(df, aes(x=species, y=freq, color=polarity)) +
            geom_boxplot()

        ggsave("final_plot.pdf")
        """
}

workflow {
    Channel.fromPath( params.samplesheet ).splitCsv( header: true ).set{ input_ch }

    input_ch.map{ [ it["species"], it["filename"] ] }
        .set{ get_sequences_in_ch }

    get_sequences( get_sequences_in_ch )

    get_sequences.out
        .flatMap{
            def species = it[0]
            def all_sequences = it[1]
            all_sequences.collect{ sequence -> [species, sequence] }
        }
        // add the sequence name as a value
        .map{ [ it[1].simpleName ] + it }
        .set{ count_aa_in_ch }

    count_aa( count_aa_in_ch )

    Channel.fromPath( params.polar_apolar_table ).set{ polar_apolar_table_ch }
    count_aa.out.combine( polar_apolar_table_ch ).set{ count_by_polarity_in_ch }

    count_by_polarity( count_by_polarity_in_ch )
    aggregate_results( count_by_polarity.out.map{ it[2] }.collect() )
    make_plot( aggregate_results.out )
}

Click here to see the solution (nextflow.config file)

// you can also put comments in nextflow.config
workDir = "../nextflow_workdir"

withLabel:r_tidyverse {
    conda = "r tidyverse"
}

params {
  samplesheet = "../nextflow_inputs/samplesheet.csv"
  polar_apolar_table = "../nextflow_inputs/polar_apolar_table.csv"
}