# Loading libraries:
suppressMessages({
library(ALDEx2)
library(data.table)
library(dplyr)
library(IRdisplay)
library(KEGGREST)
library(MGnifyR)
library(pathview)
library(tidyjson)
library(IRdisplay)
})source("lib/variable_utils.r")
source("lib/kegg_pathways_utils.r")
# display_markdown(file = '../_resources/mgnifyr_help.md')
Pathways Visualization
Pathways Visualization
In this notebook we aim to demonstrate how the MGnifyR tool can be used to fetch functional annotation results generated through the MGnify metagenomic analyisis pipelines. Then we show how to generate the pathways visualization using Pathview in R.
MGnifyR is a library that provides a set of tools for easily accessing and processing MGnify data in R, making queries to MGnify databases through the MGnify API. The benefit of MGnifyR is that data can either be fetched in tsv format or be directly combined in a phyloseq object to run an analysis in a custom workflow.
This is an interactive code notebook (a Jupyter Notebook). To run this code, click into each cell and press the ▶ button in the top toolbar, or press shift+enter
Contents
# Create your session mgnify_client object
= mgnify_client(usecache = T, cache_dir = '/home/jovyan/.mgnify_cache') mg
# Setting tables and figures size to display (these will be reset later):
options(repr.matrix.max.cols=150, repr.matrix.max.rows=500)
options(repr.plot.width=2, repr.plot.height=2)
Drawing presence/absence KOs for one metagenomic sample
1.1. Fetching data from MGnify & Pathways Selection
<- get_variable_from_link_or_input('PATHWAY_STUDY_IDS', name = 'Study Accession', default = 'MGYS00006180,MGYS00006178') <- c(strsplit(PATHWAY_STUDY_IDS, ",")[[1]]) PATHWAY_STUDY_IDS print(paste("Using", PATHWAY_STUDY_IDS, "as Study Accession"))
PATHWAY_STUDY_IDS
Type a Study Accession [default: MGYS00006180,MGYS00006178]
[1] "Using MGYS00006180 as Study Accession"
[2] "Using MGYS00006178 as Study Accession"
- Both the studies chosen by Alejandra Escobar are related to the gut bacteria of honeybees.
# Custom Pathways Selection
# Most general pathways include
# Glycolysis / Gluconeogenesis, Citrate cycle (TCA cycle), Pentose phosphate pathway,
# Fatty acid biosynthesis, Pyrimidine metabolism, Oxidative phosphorylation
<- PathwaysSelection() CUSTOM_PATHWAY_IDS
Pathways Selection :
For the most general & most complete pathways, input ‘G’
Press Enter to generate the most complete pathways
To add custom pathways, input pathway numbers (ex: 00053,01220)
Type a Pathways Accession [default: ] G
Using 00010 - Glycolysis / Gluconeogenesis : https://www.kegg.jp/pathway/map00010 as a Custom Pathway
Using 00020 - Citrate cycle (TCA cycle) : https://www.kegg.jp/pathway/map00020 as a Custom Pathway
Using 00030 - Pentose phosphate pathway : https://www.kegg.jp/pathway/map00030 as a Custom Pathway
Using 00061 - Fatty acid biosynthesis : https://www.kegg.jp/pathway/map00061 as a Custom Pathway
Using 01232 - Nucleotide metabolism : https://www.kegg.jp/pathway/map01232 as a Custom Pathway
Using 00240 - Pyrimidine metabolism : https://www.kegg.jp/pathway/map00240 as a Custom Pathway
Using 00190 - Oxidative phosphorylation : https://www.kegg.jp/pathway/map00190 as a Custom Pathway
- Fetching the analysis accession list using the study accessions.
<- capture.output({
output <- mgnify_analyses_from_studies(mg, PATHWAY_STUDY_IDS)
all_accessions <- mgnify_get_analyses_metadata(mg, all_accessions)
all_metadata })
- Keeping just the first analysis accession to fetch the kegg orthologs count table from the MGnify API and transform from JSON to matrix.
= head(all_accessions, 1)
accession = paste0('analyses/',accession,'/kegg-orthologs') ko_loc
= mgnify_retrieve_json(mg, path = ko_loc)
ko_json = as.data.frame(ko_json %>% spread_all)[ , c("attributes.accession", "attributes.count")]
ko_data = data.frame(ko_data, row.names=1)
ko_data colnames(ko_data)[1] = 'counts'
= data.matrix(ko_data)
ko_matrix head(ko_matrix, 3)
counts | |
---|---|
K10822 | 115 |
K02030 | 91 |
K02004 | 79 |
- Fetch the modules completeness table and filter out completeness < 100%.
= paste0('analyses/',accession,'/kegg-modules')
comp_loc = mgnify_retrieve_json(mg, path = comp_loc)
ko_comp_json = as.data.frame(ko_comp_json %>% spread_all)
ko_comp = ko_comp[ko_comp$attributes.completeness == 100,][, c("attributes.accession")]
modules head(modules)
- 'M00001'
- 'M00002'
- 'M00003'
- 'M00004'
- 'M00005'
- 'M00006'
1.2. Selecting the most complete pathways
# Setting up function that collects KEGG pathways for a given list of IDs, excluding chemical structure & global maps <- function(ids_list) { collect_pathways = list() pathways for (id in ids_list) { = as.list(keggLink("pathway", id)) current_pathway for (index in grep("map", current_pathway)) { = gsub("*path:", "", current_pathway[index]) clean_id # Discarding chemical structure (map010XX), global (map011XX), and overview (map012XX) maps = substring(clean_id, 1, 6) prefix if(is.na(match("map010", prefix)) & is.na(match("map011", prefix)) & is.na(match("map012", prefix)) ){ = append(pathways, clean_id) pathways } } }return(pathways) }
- Now we need to collect the list of template pathways where these complete modules can be draw. This step takes less than 1 minute to run.
= collect_pathways(modules)
md_pathways head(md_pathways)
- 'map00010'
- 'map00010'
- 'map00010'
- 'map00020'
- 'map00030'
- 'map00030'
- In order to draw the most complete pathways maps, we will use the list of templates we obtained in the previous step and select only pathways having all their constituent modules.
# Counting the number of modules we have in each pathway
= list()
our_pathways_counts for (path_element in md_pathways) {
if (path_element %in% names(our_pathways_counts)) {
= our_pathways_counts[[path_element]] + 1
new_value = new_value
our_pathways_counts[path_element] else {
} = 1
our_pathways_counts[path_element]
}
}
# Counting the number of modules expected in each pathway
= unique(md_pathways)
u_pathways = list()
exp_pathways_counts for (path in u_pathways) {
= length(as.list(keggLink("module", path)))
mod_count = mod_count
exp_pathways_counts[path]
}
# Selecting the pathways having all their constituent modules. We remove the 'map' prefix as pathview doesn't like it
= list()
to_draw for (pathway in names(our_pathways_counts)) {
= our_pathways_counts[[pathway]]
our_value = exp_pathways_counts[[pathway]]
exp_value = our_value / exp_value
ratio if (ratio == 1) {
= gsub("map", "", pathway)
nude_id = append(to_draw, nude_id)
to_draw
}
}
# Adding the custom pathways to to_draw if not present already
for (pathway in CUSTOM_PATHWAY_IDS){
if (!(pathway %in% to_draw)) {
= append(to_draw, pathway)
to_draw
} }
# printing name of the pathways to be drawn
for (pathway in to_draw){
display_markdown(paste(pathway, "--> ", get_pathway_info(pathway)[1]," : ",get_pathway_info(pathway)[2], sep=" "))
}
00010 –> Glycolysis / Gluconeogenesis : https://www.kegg.jp/pathway/map00010
00480 –> Glutathione metabolism : https://www.kegg.jp/pathway/map00480
00430 –> Taurine and hypotaurine metabolism : https://www.kegg.jp/pathway/map00430
00521 –> Streptomycin biosynthesis : https://www.kegg.jp/pathway/map00521
00020 –> Citrate cycle (TCA cycle) : https://www.kegg.jp/pathway/map00020
00030 –> Pentose phosphate pathway : https://www.kegg.jp/pathway/map00030
00061 –> Fatty acid biosynthesis : https://www.kegg.jp/pathway/map00061
01232 –> Nucleotide metabolism : https://www.kegg.jp/pathway/map01232
00240 –> Pyrimidine metabolism : https://www.kegg.jp/pathway/map00240
00190 –> Oxidative phosphorylation : https://www.kegg.jp/pathway/map00190
1.3. Ready to draw!
- As we are plotting absence/presence, we set the number of bins = 2, the scale in one direction, and use 1 as limit.
Takes a couple of minutes depending on the number of pathways
- Clearing the current working directory and Displaying all the generated figures that are stored at the
pathway_plots/
directory.
generatePathwayPlots()
Glycolysis / Gluconeogenesis
Citrate cycle (TCA cycle)
Pentose phosphate pathway
Fatty acid biosynthesis
Oxidative phosphorylation
Pyrimidine metabolism
Taurine and hypotaurine metabolism
Glutathione metabolism
Streptomycin biosynthesis
Nucleotide metabolism
Resources and Documentation
MGnify
- https://docs.mgnify.org/src/docs/analysis.html#assembly-analysis-pipeline - Description of the latest MGnify analysis pipeline and the tools it uses.
- https://academic.oup.com/nar/article/48/D1/D570/5614179 - This paper highlights the platform’s functionalities for metagenomic data assembly and analysis. It also describes he implementation of KEGG modules in MGnify.
Pathview
- The official documentation for Pathview, a tool for pathway-based data integration and visualization. It provides an overview of the tool and explains how to use it effectively.
- You can find more information at https://pathview.uncc.edu/overview#kegg_view .
KEGGREST
- KEGGREST is a Bioconductor package that provides programmatic access to the KEGG database.
- More about KEGGREST at http://www.bioconductor.org/packages/release/bioc/vignettes/KEGGREST/inst/doc/KEGGREST-vignette.html .