diff --git a/DESCRIPTION b/DESCRIPTION
index d76c01e..f536413 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MicroBioMap
 Type: Package
 Title: Access the microbiome compendium from R
-Version: 0.99.13
+Version: 0.99.14
 Description: The MicroBioMap offers access to a dataset including
     more than 168,000 samples of publicly available 16S rRNA amplicon
     sequencing data, all processed using the same pipeline
@@ -16,7 +16,11 @@
 Imports:
     Matrix,
     data.table,
     BiocFileCache,
-    R.utils
+    R.utils,
+    dplyr,
+    tidyr,
+    vegan,
+    tibble
 Encoding: UTF-8
 Authors@R: c(
diff --git a/LICENSE.md b/LICENSE.md
index d4e83a1..9691111 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
 # MIT License
 
-Copyright (c) 2023 MicroBioMap authors
+Copyright (c) 2026 The University of Chicago
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/R/constants.R b/R/constants.R
index 36e85fb..6f7cae8 100644
--- a/R/constants.R
+++ b/R/constants.R
@@ -1 +1,4 @@
-canonical_doi <- 'https://doi.org/10.5281/zenodo.8186993'
+canonical_doi <- c(
+    'compendium' = 'https://doi.org/10.5281/zenodo.8186993'
+    , 'projection' = 'https://doi.org/10.5281/zenodo.19631960'
+)
diff --git a/R/loader.R b/R/loader.R
index 2b42d09..f20615d 100644
--- a/R/loader.R
+++ b/R/loader.R
@@ -1,93 +1,90 @@
-.getVersions <- function(bfc, verbose=FALSE) {
-    # Determines the most recent version of the compendium
-    # and retrieves the manifest that describes all available releases.
-    # Returns a data.table listing all versions and the necessary URLs
-    # This requires the canonical_doi configuration value stored in
-    # constants.R, which always resolves to the most recent version.
-
-    # Check if we've already cached the manifest.
-    # If not, make an HTTP call to get the URL we need
-    rpath <- BiocFileCache::bfcquery(bfc, 'manifest')$rpath
-    if(length(rpath) == 0) {
-        if(verbose) {
-            print('Retrieving version information.')
-        }
-        resolve <- curl::curl_fetch_memory(canonical_doi)
-        if(resolve$status_code != 200) {
-            stop(paste0(
-                'Could not resolve canonical DOI. Status code: ',
-                resolve$status_code
-            ))
-        }
+#' Compendium metadata retrieval
+#'
+#' Determines the most recent version of the compendium
+#' and retrieves the manifest that describes all available releases.
+#' @details
+#' This requires the canonical_doi configuration value stored in
+#' constants.R, which always resolves to the most recent version.
+#' @param bfc BiocFileCache object to use
+#' @param entry A string, either 'compendium' or 'projection', indicating which manifest to return.
+#' @param verbose logical; if TRUE, print progress messages while the manifest is resolved.
+#' @returns a data.table listing all versions and the necessary URLs
+#' @importFrom data.table fread setkey
+#' @importClassesFrom Matrix TsparseMatrix
+#' @import TreeSummarizedExperiment
+#' @import R.utils
+#' @import ape
+#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
+.getVersions <- function(bfc, entry, verbose=FALSE) {
+    rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest
+    if(length(rpath) == 0) {
+        if(verbose) {
+            print('Retrieving version information.')
+        }
+        resolve <- curl::curl_fetch_memory(canonical_doi[entry])
+        if(resolve$status_code != 200) {
+            stop(paste0(
+                'Could not resolve canonical DOI. Status code: ',
+                resolve$status_code
+            ))
+        }
 
-        if(verbose) {
-            print('Determined data address:')
-            print(resolve$url)
-        }
-        manifest <- paste0(resolve$url, '/files/manifest.csv')
-
-        rpath <- tryCatch(
-            {
-                bfcrpath(bfc, manifest)
-            },
-            error = function(msg){
-                print('Could not retrieve manifest file. Falling back to manifest as of v1.1.0')
-                towrite <- data.table::data.table(
-                    version = c('1.1.0', '1.0.1'),
-                    zenodo_id = c('13733642', '10452633'),
-                    default = c(TRUE, FALSE)
-                )
-                # we save this to the cache so the app remembers not to keep looking online
-                # for a manifest every time the version information is needed
-                savepath <- BiocFileCache::bfcnew(bfc, 'manifest', ext='.csv')
-                data.table::fwrite(towrite, file=savepath)
-                savepath
-            }
-        )
-    }
-    else {
-        if(verbose) {
-            print('Cached version information found.')
-        }
-    }
-    results <- data.table::fread(rpath)
-
-    colnames(results) <- c('version','zenodo_id','default')
-    results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
-    results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
-    data.table::setkey(results, version)
-    results
+        if(verbose) {
+            print('Determined data address:')
+            print(resolve$url)
+        }
+        manifest <- paste0(resolve$url, '/files/manifest.csv')
+
+        rpath <- tryCatch(
+            {
+                bfcrpath(bfc, manifest)
+            },
+            error = function(msg){
+                print('Could not retrieve manifest file. Falling back to manifest as of April 2026.')
+                towrite <- data.table::data.table(
+                    version = c('1.1.0', '1.0.1'),
+                    zenodo_id = c('13733642', '10452633'),
+                    default = c(TRUE, FALSE)
+                )
+                if(entry == 'projection') {
+                    print('Falling back to projection manifest.')
+                    towrite <- data.table::data.table(
+                        version = c('0.3.0','0.2.0', '0.1.0'),
+                        zenodo_id = c('20040560','19633215', '19631961'),
+                        default = c(TRUE, FALSE, FALSE)
+                    )
+                }
+
+                # we save this to the cache so the app remembers not to keep looking online
+                # for a manifest every time the version information is needed
+                savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv')
+                data.table::fwrite(towrite, file=savepath)
+                savepath
+            }
+        )
+    }
+    else {
+        if(verbose) {
+            print('Cached version information found.')
+        }
+    }
+    results <- data.table::fread(rpath)
+    # TODO: This is a silly bandage
+    colnames(results) <- c('version','zenodo_id','default')
+    results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
+    results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
+    if(entry == 'projection') {
+        results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt')
+        results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
+    }
+    data.table::setkey(results, version)
+    results
 }
 
-.getCompendiumData <- function(version, bfc) {
-    versions <- .getVersions(bfc)
-    rpath <- bfcrpath(bfc, versions[version]$data_url)
-    data.table::fread(rpath)
-}
-
-.getCompendiumColdata <- function(version, bfc) {
-    versions <- .getVersions(bfc)
-    rpath <- bfcrpath(bfc, versions[version]$coldata_url)
-    sampdat <- as.data.frame(data.table::fread(rpath))
-    rownames(sampdat) <- paste(sampdat[[2]], sampdat[[3]], sep = "_")
-    sampdat
-}
-
-#' load all compendium data into a TreeSummarizedExperiment
+#' Compendium download
 #'
+#' Load all compendium data into a TreeSummarizedExperiment
+#' @param version an optional parameter indicating which compendium version to retrieve
 #' @param bfc BiocFileCache object to use
-#'
 #' @returns a `TreeSummarizedExperiment`
-#'
-#' @importFrom data.table fread setkey
-#' @importClassesFrom Matrix TsparseMatrix
-#' @import TreeSummarizedExperiment
-#' @import R.utils
-#' @import ape
-#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
-#'
-#' @export
-#'
 #' @examples
 #' cpd <- getCompendium()
 #'
@@ -96,9 +93,9 @@
 #' assayNames(cpd)
 #' head(colData(cpd))
 #'
-
+#' @export
 getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
-    versions <- .getVersions(bfc)
+    versions <- .getVersions(bfc, 'compendium')
 
     if(is.na(version)) {
         # If the user has not specified a version, grab whichever
@@ -106,7 +103,7 @@ getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
         version <- versions[versions$default,]$version[1]
     }
     print(paste('Retrieving compendium version',version))
-    dat <-.getCompendiumData(version, bfc)
+    dat <- .getProjectData(version, 'compendium', bfc)
     coldat <- .getCompendiumColdata(version, bfc)
 
     sampnames <- dat[[2]]
diff --git a/R/projection.R b/R/projection.R
new file mode 100644
index 0000000..7a8e218
--- /dev/null
+++ b/R/projection.R
@@ -0,0 +1,184 @@
+#' @import dplyr
+#' @importFrom tidyr pivot_longer pivot_wider
+#' @importFrom vegan rrarefy
+#' @importFrom tibble rownames_to_column column_to_rownames
+
+.gm_mean <- function(x){
+    exp(mean(log(x[x > 0])))
+}
+
+.rclr <- function(a) {
+    answer <- log(a/.gm_mean(a))
+    answer[] <- lapply(answer, function(i) if(is.numeric(i)) ifelse(is.infinite(i), 0, i) else i)
+    answer
+}
+
+#' Projection retrieval and formatting
+#'
+#' Filters a data frame containing loadings for
+#' multiple projections and formats it for further use.
+#' @param pname string indicating the name of the projection to use
+#' @param load_all a matrix of weights (with a "projection" field on which to filter)
+#' @returns A data frame of weights for a single projection,
+#' plus a "taxon" field containing a string unique to that taxon.
+.loadProjection <- function(pname, load_all) {
+    load_all |>
+        filter(projection==pname) |>
+        mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead
+}
+
+#' Projection library builder
+#'
+#' Utility for working with remotely hosted projection files.
+#' @param version an optional string indicating which release of the projection files to retrieve
+#' @param bfc BiocFileCache object to use
+#' @returns A **function** that allows a user to load a projection using
+#' its name.
+#' @examples
+#' \dontrun{
+#' lib <- projection_library()
+#' projected <- project_it(my_data, lib('full'))
+#' }
+#'
+#' @export
+projection_library <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
+    versions <- .getVersions(bfc, entry='projection')
+    print(versions)
+
+    if(is.na(version)) {
+        # If the user has not specified a version, grab whichever
+        # is indicated in the manifest as the default (i.e. most recent)
+        version <- versions[versions$default,]$version[1]
+    }
+    print(paste('Retrieving projections version',version))
+    load_all <- .getProjectData(version, 'projection', bfc)
+
+    curried <- function(pname) {
+        .loadProjection(pname, load_all)
+    }
+    curried
+}
+
+#' User data formatting
+#'
+#' @description
+#' Formats user input data from two input files into a single data frame for further analysis.
+#' Consolidates read counts at the family level.
+#' @details
+#' Taxonomic information is provided across two files: the matrix of read counts uses
+#' id-based column names ("TAX1", "TAX2", etc). The second file associates those column
+#' names with inferred taxonomic ranks.
+#' This function replaces the id-based columns with one column per observed family and
+#' sums the read counts within each family.
+#' @param userdata data frame containing samples in rows and taxa in columns
+#' @param usertaxa data frame associating the column names from userdata with kingdom, phylum, class,
+#' order, and family classifications.
+#' @returns A data frame with the same number of rows as userdata and one column for each distinct
+#' family in the usertaxa file that was observed at least once in the dataset.
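+#' @examples
+#' # A minimal sketch with toy inputs; the id-based column names (TAX1, TAX2)
+#' # and the taxonomic assignments below are illustrative only.
+#' counts <- data.frame(sample = c('s1', 's2'), TAX1 = c(10, 0), TAX2 = c(5, 7))
+#' taxa <- data.frame(
+#'     col_id = c('TAX1', 'TAX2'),
+#'     kingdom = 'Bacteria', phylum = 'Firmicutes', class = 'Bacilli',
+#'     order = 'Lactobacillales',
+#'     family = c('Lactobacillaceae', 'Streptococcaceae')
+#' )
+#' loadUserData(counts, taxa)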
+#'
+#' @export
+loadUserData <- function(userdata, usertaxa) {
+    userdata |>
+        pivot_longer(!sample
+            , names_to = 'col_id'
+            , values_to = 'countnum'
+        ) |> # one row for each sample/taxon count
+        filter(countnum > 0) |>
+        left_join(usertaxa, by='col_id') |> # add taxon names
+        mutate(famlevel = paste(kingdom,phylum,class,order,family, sep='||')) |>
+        select(!c('kingdom','phylum','class','order','family')) |>
+        group_by(sample, famlevel) |> # summarize at family level
+        summarise(
+            aggcount = sum(countnum)
+        ) |>
+        ungroup() |>
+        pivot_wider(names_from='famlevel' # make taxon table
+            , values_from='aggcount'
+            , values_fill=0
+        )
+}
+
+#' Sample rarefaction
+#'
+#' @description
+#' Helper function for rarefying samples in a taxonomic table to a set read count.
+#' @details
+#' Throwing away data is generally not helpful, but this step is done to ensure
+#' samples being projected into an existing latent space are of the same size as the samples
+#' that were included in the original ordination.
+#' @param userdata data frame containing samples in rows and taxa in columns. (Generally here the output of `loadUserData()`)
+#' @param level The desired number of reads per sample. **Samples with fewer reads than this will be filtered out.**
+#' @param seed optional integer used to make the random subsampling reproducible.
+#' @returns A data frame with the same columns as userdata, but with rarefied read counts so that
+#' every remaining row sums to `level`.
+#'
+#' @export
+rarefy_dataset <- function(userdata, level=3000, seed=NA) {
+    if(!is.na(seed)) set.seed(seed)
+
+    userdata |>
+        distinct() |> # ensure no duplicate entries
+        rowwise() |>
+        mutate(
+            totalreads = sum(c_across(where(is.numeric)))
+        ) |>
+        filter(
+            totalreads >= level # make sure each sample has enough reads
+        ) |> # TODO log the results of this
+        select(!totalreads) |>
+        ungroup() |>
+        column_to_rownames('sample') |>
+        rrarefy(level) |>
+        as.data.frame() |>
+        rownames_to_column('sample')
+}
+
+#' Robust centered log-ratio transformation
+#'
+#' @description
+#' Helper function for applying rCLR to a data frame with a "sample" column.
+#' @details
+#' This step should happen *after* rarefaction, and converts a matrix of non-negative read counts into
+#' one containing unbounded floating-point numbers.
+#' (Note: because the package exports a function named `transform`, it masks base R's `transform()`
+#' while MicroBioMap is attached.)
+#' @param userdata data frame containing samples in rows and taxa in columns.
+#' @returns A data frame of the same dimensions as userdata, but with rCLR-transformed values
+#'
+#' @export
+transform <- function(userdata) {
+    userdata |>
+        column_to_rownames('sample') |>
+        .rclr() |>
+        rownames_to_column('sample')
+}
+
+#' Projection operation
+#'
+#' @description
+#' Projects a taxonomic table into an existing ordination using a set of pre-calculated loadings.
+#' @details
+#' This is the final step in the projection process. The input describes samples with taxa, and the output
+#' describes the same samples with principal components. Note that this step doesn't require rCLR-transformed
+#' values *per se* -- input data should be transformed using the same process as the data used in the original
+#' ordination. For the compendium ordinations currently available, this means rCLR.
+#' @param indata data frame containing samples in rows and taxa in columns.
+#' @param loadings data frame containing taxa in each row (i.e. the indata column names) and
+#' a principal component ("PC1", "PC2", etc) in each column. The values in each
+#' cell indicate the weight that taxon's read count should be given when calculating
+#' a sample's value for that PC.
+#' @returns A data frame with one row per sample in indata and one column for each principal component in loadings
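+#' @examples
+#' \dontrun{
+#' # A sketch of the full workflow; the file objects and the projection name
+#' # 'full' are placeholders for your own inputs.
+#' counts <- loadUserData(my_asvs, my_taxa)
+#' prepped <- transform(rarefy_dataset(counts, level = 3000))
+#' plib <- projection_library()
+#' coords <- project_it(prepped, plib('full'))
+#' }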
+#'
+#' @export
+project_it <- function(indata, loadings) {
+    indata |>
+        pivot_longer(cols=!c(sample)
+            , names_to='taxon'
+            , values_to='val'
+            , values_drop_na=TRUE
+        ) |>
+        left_join(loadings, by='taxon') |>
+        mutate(
+            across(starts_with('PC')
+                , \(d) val * d)
+        ) |>
+        ungroup() |>
+        group_by(sample) |>
+        summarise(
+            across(starts_with('PC'), sum)
+        )
+}
diff --git a/vignettes/classifier.Rmd b/vignettes/classifier.Rmd
new file mode 100644
index 0000000..54dd8dc
--- /dev/null
+++ b/vignettes/classifier.Rmd
@@ -0,0 +1,287 @@
+---
+title: "Projection Recipes"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Recipes}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+## Introduction
+
+This page collects example analyses ("recipes") that use microbiome data projected with the Compendium Explorer web app and the R helpers in this package.
+
+More *tk
+
+### Setup
+
+See the Projection vignette for the process to generate a data table in which each row is a sample and each column is a principal component, as defined by your selected projection:
+
+```r
+my_results <- project_it(taxtable, plib('full'))
+```
+
+Loading libraries and setting up helper functions:
+
+```r
+library(data.table)
+library(ranger)
+library(tibble)
+
+library(dplyr)
+library(tidyr)
+
+fetchmeta.curried <- function(meta) {
+    # generates a function that always appends the same metadata to a
+    # taxonomic table
+
+    toreturn <- function(ids, append=NA) {
+        # Helper function for adding sample-level metadata to a table
+        # of read counts
+        if(!is.na(append)) ids$sample <- append
+
+        toreturn <- ids |>
+            inner_join(meta, by='sample') |>
+            mutate(
+                Community_Group = as.factor(Community_Group)
+                , Lifestyle = as.factor(Lifestyle)
+            )
+
+        toreturn
+    }
+
+    toreturn
+}
+
+dropmeta <- function(df, save = c()) {
+    # Cleave the metadata off a data table, leaving only the read counts
+    toreturn <- df |>
+        select(all_of(save), !any_of(colnames(my_meta)))
+
+    toreturn
+}
+
+scoremodel <- function(calls, modelname) {
+    # Fill in the confusion matrix with diagonal values;
+    # expects a data frame with two columns:
+    #    baseline - the true classes for a given test set
+    #    pred - the predicted calls
+
+    half <- calls |>
+        mutate(
+            correct = pred==baseline
+        ) |>
+        group_by(baseline) |>
+        summarise(
+            TP = sum(correct)
+            , total = n()
+        ) |>
+        mutate(
+            FN = total - TP
+        ) |>
+        arrange(baseline) |>
+        mutate(matchup = rev(baseline)) # this just adds a column to show which values we want to look up
+
+    half |> # Fill in the other half of the confusion matrix
+        inner_join(half
+            , by=c('baseline'='matchup')
+            , suffix = c('', 'new')
+        ) |>
+        select(baseline, TP, FN
+            , TN=TPnew
+            , FP=FNnew) |>
+        mutate(model=modelname
+            , Precision = TP / (TP + FP) # How many of the positive calls are true?
+            , Recall = TP / (TP + FN) # How many group members were ID'd?
+            , PPV = Precision
+            , FDR = 1 - PPV
+            , F1 = (2 * Precision * Recall) / (Precision + Recall)
+            , Sensitivity = Recall # How many group members were ID'd?
+            , Specificity = TN / (TN + FP) # How many non-members were excluded?
+            , Accuracy = (TP + TN) / (TP + TN + FP + FN) # How many of the classifications were right?
+        )
+}
+```
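+
+The recipes below call `fetchmeta()` directly, so we first bind the curried helper to the sample
+metadata (here assumed to be the `my_meta` table loaded in the Projection vignette):
+
+```r
+fetchmeta <- fetchmeta.curried(my_meta)
+```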
+
+### Classifier
+
+This trains a random forest classifier on a subset of your samples (`trainer`) and uses it to generate predictions of a single variable (`dependent`) for a different subset of samples (`tester`).
+
+```r
+classify_it <- function(trainer, tester, dependent) {
+    # trainer - training set
+    # tester - test set
+    # dependent - string indicating the name of the dependent variable, i.e. the
+    #    one to predict
+    # Note: the true classes for the test set are kept separately and attached
+    # to the predictions afterward for scoring.
+
+    print(paste0('dimensions=', ncol(trainer)))
+
+    rf <- ranger(dependent.variable.name=dependent, data = trainer
+        , num.trees = 800
+        , sample.fraction = 0.5
+    )
+
+    test_prediction <- predict(rf, tester)$predictions |>
+        as.data.frame() |>
+        rename(pred = "predict(rf, tester)$predictions")
+
+    test_prediction
+}
+```
+
+Set up all our data frames. First, we select the first 25 (projected) principal components and separate out one community group as the test set.
+
+```r
+dataset <- my_results |>
+    select(sample, PC1:PC25)
+
+train <- dataset |>
+    fetchmeta() |>
+    filter(!Community_Group == 'Gond') |>
+    arrange(sample) |>
+    dropmeta(save=c('Lifestyle'))
+
+testset <- dataset |>
+    fetchmeta() |>
+    filter(Community_Group == 'Gond') |>
+    arrange(sample)
+```
+
+All our helpers make the actual classification statement look mundane:
+
+```r
+projected_model <- classify_it(train, testset, 'Lifestyle') |>
+    mutate(baseline = testset$Lifestyle) # attach the true labels for scoring
+
+projected <- scoremodel(projected_model, "projected")
+```
+
+### Contextualizing model
+
+These scores are difficult to interpret on their own, especially because of the class imbalance in the test set. To provide some additional context, we'll also generate predictions using a zero-information model that always guesses the majority class.
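+
+The majority class is hardcoded as `'Urban'` below; if you would rather derive it, one option is to
+take the most common label in the training data (a small sketch using `train` from above):
+
+```r
+majority_class <- names(which.max(table(train$Lifestyle)))
+```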
+
+```r
+naive_model <- data.frame(
+    pred = rep('Urban', nrow(testset))
+    , baseline = testset$Lifestyle
+)
+
+allurbs <- scoremodel(naive_model, "naive")
+```
+
+We can also train a new model on 25 principal components, but derived from conventional PCA:
+
+```r
+my_asvs <- read.delim('~/code/pcalab/data/examples/toy/userdata.txt')
+my_taxa <- read.delim('~/code/pcalab/data/examples/toy/usertaxa.txt')
+my_meta <- read.delim('~/code/pcalab/data/examples/toy/usermeta.txt')
+
+taxtable <- loadUserData(my_asvs, my_taxa)
+
+ord <- taxtable |>
+    column_to_rownames('sample') |> # drop the sample column so prcomp sees only counts
+    prcomp(
+        tol=0,
+        center=FALSE
+    )
+```
+
+As above, we can split the resulting points into train and test sets and see how well the Gond classification task works:
+
+```r
+points <- as.data.frame(ord$x) |>
+    select(PC1:PC25) |>
+    rownames_to_column('sample') |>
+    fetchmeta()
+
+data_train = points |>
+    filter(!Community_Group == 'Gond') |>
+    arrange(sample) |>
+    dropmeta(save=c('Lifestyle'))
+
+data_test = points |>
+    filter(Community_Group == 'Gond') |>
+    arrange(sample)
+
+key <- data_test$Lifestyle
+
+data_test <- data_test |>
+    dropmeta()
+
+solo_calls <- classify_it(data_train, data_test, 'Lifestyle') |>
+    mutate(baseline = key) # attach the true labels for scoring
+
+solo_pca <- scoremodel(solo_calls, 'solo_pca')
+```
+
+```r
+scores <- rbind(projected, solo_pca, allurbs) |>
+    mutate(
+        model = factor(model, levels=c('projected','solo_pca', 'naive'))
+    ) |>
+    filter(baseline == 'Urban') |>
+    select(!baseline) |>
+    pivot_longer(
+        !model, names_to='measure', values_to = 'val'
+    )
+```
+
+McNemar's test asks whether two paired classifiers differ, by comparing the counts of samples that only one of them gets right: first with `mcnemar.test()` on hand-filled confusion matrices, then with a small helper that works from the paired per-sample calls.
+
+```r
+# PROJECTED
+matdefine <- list("Toeval" = c("correct", "wrong"),
+                  "Naive" = c("correct", "wrong"))
+
+perf.base <- matrix(c(33, 11, 0, 32), # TP, FN, 0, FP
+                    nrow = 2, dimnames = matdefine)
+
+perf.projected <- matrix(c(36, 8, 0, 22),
+                         nrow = 2, dimnames = matdefine)
+mcnemar.test(perf.base)
+mcnemar.test(perf.projected)
+```
+
+```r
+# https://machinelearningmastery.com/mcnemars-test-for-machine-learning/
+
+mcnemar <- function(classes) {
+    yesno <- classes |>
+        filter(test == baseline) |>
+        filter(naive != baseline) |>
+        nrow()
+
+    noyes <- classes |>
+        filter(test != baseline) |>
+        filter(naive == baseline) |>
+        nrow()
+
+    print(paste0('yesno = ', yesno))
+    print(paste0('noyes = ', noyes))
+
+    #mcnstat <- ((yesno - noyes)^2) / (yesno + noyes)
+    mcnstat <- ((abs(yesno - noyes) - 1)^2) / (yesno + noyes) # continuity-corrected statistic
+
+    print(paste0('stat = ', mcnstat))
+
+    print(paste0('p = '
+        , pchisq(mcnstat
+            , df=1, lower.tail = FALSE)
+    ))
+}
+
+# Paired per-sample calls for the three models; this assumes the prediction
+# frames built above are aligned by sample (all were arranged by sample
+# before classification).
+compare <- data.frame(
+    baseline = as.character(key)
+    , naive = 'Urban'
+    , original = as.character(solo_calls$pred)
+    , projected = as.character(projected_model$pred)
+)
+
+# ORIGINAL to naive
+compare |>
+    select(baseline, naive
+        , test=original
+    ) |>
+    mcnemar()
+
+# PROJECTED to naive
+compare |>
+    select(baseline, naive
+        , test=projected
+    ) |>
+    mcnemar()
+
+# DIRECT COMPARE
+compare |>
+    select(baseline, naive=original
+        , test=projected
+    ) |>
+    mcnemar()
+```
+
+All done here *tk*
\ No newline at end of file
diff --git a/vignettes/projection.Rmd b/vignettes/projection.Rmd
new file mode 100644
index 0000000..6390e41
--- /dev/null
+++ b/vignettes/projection.Rmd
@@ -0,0 +1,51 @@
+---
+title: "Compendium Explorer"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Projection}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+## Overview
+
+Intro *tk
+
+## Getting started
+
+### Installation
+
+```{r eval=FALSE}
+library(BiocManager)
+BiocManager::install("blekhmanlab/MicroBioMap")
+```
+
+## Basic usage
+
+```{r message=FALSE}
+library(MicroBioMap)
+
+my_asvs <- read.delim('~/code/pcalab/data/examples/toy/userdata.txt')
+my_taxa <- read.delim('~/code/pcalab/data/examples/toy/usertaxa.txt')
+my_meta <- read.delim('~/code/pcalab/data/examples/toy/usermeta.txt')
+
+taxtable <- loadUserData(my_asvs, my_taxa)
+```
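+
+Depending on how the loadings for your chosen projection were computed, you may also want to rarefy
+and rCLR-transform the counts before projecting them; the package helpers cover both steps (a sketch --
+`taxtable_prepped` is a placeholder name, and `level` should match the read depth used for the original
+ordination):
+
+```{r eval=FALSE}
+taxtable_prepped <- taxtable |>
+    rarefy_dataset(level = 3000) |>
+    transform()
+```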
+
+*tk
+
+```{r}
+# this operation requires about 4GB of RAM
+plib <- projection_library()
+```
+
+*tk
+
+```{r}
+my_results <- project_it(taxtable, plib('full'))
+```
+
+## sessionInfo
+
+```{r}
+sessionInfo()
+```