Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MicroBioMap
Type: Package
Title: Access the microbiome compendium from R
Version: 0.99.13
Version: 0.99.14
Description: The MicroBioMap offers access to a dataset including more than
168,000 samples of publicly available
16S rRNA amplicon sequencing data, all processed using the same pipeline
Expand All @@ -16,7 +16,10 @@ Imports:
Matrix,
data.table,
BiocFileCache,
R.utils
R.utils,
dplyr,
vegan,
tibble
Encoding: UTF-8
Authors@R:
c(
Expand Down
2 changes: 1 addition & 1 deletion LICENSE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License

Copyright (c) 2023 MicroBioMap authors
Copyright (c) 2026 The University of Chicago

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
5 changes: 4 additions & 1 deletion R/constants.R
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
canonical_doi <- 'https://doi.org/10.5281/zenodo.8186993'
canonical_doi <- c(
'compendium' = 'https://doi.org/10.5281/zenodo.8186993'
, 'projection' = 'https://doi.org/10.5281/zenodo.19631960'
)
159 changes: 78 additions & 81 deletions R/loader.R
Original file line number Diff line number Diff line change
@@ -1,93 +1,90 @@
.getVersions <- function(bfc, verbose=FALSE) {
# Determines the most recent version of the compendium
# and retrieves the manifest that describes all available releases.
# Returns a data.table listing all versions and the necessary URLs
# This requires the canonical_doi configuration value stored in
# constants.R, which always resolves to the most recent version.

# Check if we've already cached the manifest.
# If not, make an HTTP call to get the URL we need
rpath <- BiocFileCache::bfcquery(bfc, 'manifest')$rpath
if(length(rpath) == 0) {
if(verbose) {
print('Retrieving version information.')
}
resolve <- curl::curl_fetch_memory(canonical_doi)
if(resolve$status_code != 200) {
stop(paste0(
'Could not resolve canonical DOI. Status code: ',
resolve$status_code
))
}
#' Compendium metadata retrieval
#'
#' Determines the most recent version of the compendium
#' and retrieves the manifest that describes all available releases.
#'
#' This requires the canonical_doi configuration value stored in
#' constants.R, which always resolves to the most recent version.
#' @param bfc BiocFileCache object to use
#' @param entry A string from ['compendium','projection'] indicating which manifest to return.
#' @param verbose If TRUE, print progress messages while the manifest is located.
#' @returns a data.table listing all versions and the necessary URLs
#' @importFrom data.table fread setkey
#' @importClassesFrom Matrix TsparseMatrix
#' @import TreeSummarizedExperiment
#' @import R.utils
#' @import ape
#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
.getVersions <- function(bfc, entry, verbose=FALSE) {
rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest
if(length(rpath) == 0) {
if(verbose) {
print('Retrieving version information.')
}
resolve <- curl::curl_fetch_memory(canonical_doi[entry])
if(resolve$status_code != 200) {
stop(paste0(
'Could not resolve canonical DOI. Status code: ',
resolve$status_code
))
}

if(verbose) {
print('Determined data address:')
print(resolve$url)
}
manifest <- paste0(resolve$url, '/files/manifest.csv')

rpath <- tryCatch(
{
bfcrpath(bfc, manifest)
},
error = function(msg){
print('Could not retrieve manifest file. Falling back to manifest as of v1.1.0')
if(verbose) {
print('Determined data address:')
print(resolve$url)
}
manifest <- paste0(resolve$url, '/files/manifest.csv')

rpath <- tryCatch(
{
bfcrpath(bfc, manifest)
},
error = function(msg){
print('Could not retrieve manifest file. Falling back to manifest as of April 2026.')
towrite <- data.table::data.table(
version = c('1.1.0', '1.0.1'),
zenodo_id = c('13733642', '10452633')
)
if(entry == 'projection') {
print('Falling back to projection manifest.')
towrite <- data.table::data.table(
version = c('1.1.0', '1.0.1'),
zenodo_id = c('13733642', '10452633'),
default = c(TRUE, FALSE)
version = c('0.3.0','0.2.0', '0.1.0'),
zenodo_id = c('20040560','19633215', '19631961'),
default = c(TRUE, FALSE, FALSE)
)
# we save this to the cache so the app remembers not to keep looking online
# for a manifest every time the version information is needed
savepath <- BiocFileCache::bfcnew(bfc, 'manifest', ext='.csv')
data.table::fwrite(towrite, file=savepath)
savepath
}
)
}
else {
if(verbose) {
print('Cached version information found.')

# we save this to the cache so the app remembers not to keep looking online
# for a manifest every time the version information is needed
savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv')
data.table::fwrite(towrite, file=savepath)
savepath
}
)
}
else {
if(verbose) {
print('Cached version information found.')
}
results <- data.table::fread(rpath)

colnames(results) <- c('version','zenodo_id','default')
results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
}
results <- data.table::fread(rpath)
# TODO: This is a silly bandage
colnames(results) <- c('version','zenodo_id','default')
results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
if(entry == 'projection') {
results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt')
results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
data.table::setkey(results, version)
results
}
data.table::setkey(results, version)
results
}

# Fetch the data file for one compendium release and read it into memory.
#
# version: release identifier used to select a row of the versions table
#          (the table is keyed on its `version` column).
# bfc:     BiocFileCache object used to resolve/cache the manifest and the
#          data file.
#
# Returns whatever data.table::fread() parses from the cached file at the
# release's `data_url`.
.getCompendiumData <- function(version, bfc) {
  # .getVersions() requires the manifest name as its second argument;
  # this helper always works on the compendium manifest.
  versions <- .getVersions(bfc, 'compendium')
  rpath <- bfcrpath(bfc, versions[version]$data_url)
  data.table::fread(rpath)
}

# Fetch the sample metadata for one compendium release as a data.frame.
#
# version: release identifier used to select a row of the versions table
#          (the table is keyed on its `version` column).
# bfc:     BiocFileCache object used to resolve/cache the manifest and the
#          metadata file.
#
# Returns a data.frame read from the release's `coldata_url`, with row names
# built by joining the second and third columns with "_".
.getCompendiumColdata <- function(version, bfc) {
  # .getVersions() requires the manifest name as its second argument;
  # this helper always works on the compendium manifest.
  versions <- .getVersions(bfc, 'compendium')
  rpath <- bfcrpath(bfc, versions[version]$coldata_url)
  sampdat <- as.data.frame(data.table::fread(rpath))
  # Row names come from columns 2 and 3 of the metadata table.
  rownames(sampdat) <- paste(sampdat[[2]], sampdat[[3]], sep = "_")
  sampdat
}

#' load all compendium data into a TreeSummarizedExperiment
#' Compendium download
#'
#' Load all compendium data into a TreeSummarizedExperiment
#' @param version an optional parameter indicating which compendium version to retrieve
#' @param bfc BiocFileCache object to use
#'
#' @returns a `TreeSummarizedExperiment`
#'
#' @importFrom data.table fread setkey
#' @importClassesFrom Matrix TsparseMatrix
#' @import TreeSummarizedExperiment
#' @import R.utils
#' @import ape
#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
#'
#' @export
#'
#' @examples
#' cpd <- getCompendium()
#'
Expand All @@ -96,17 +93,17 @@
#' assayNames(cpd)
#' head(colData(cpd))
#'

#' @export
getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
versions <- .getVersions(bfc)
versions <- .getVersions(bfc, 'compendium')

if(is.na(version)) {
# If the user has not specified a version, grab whichever
# is indicated in the manifest as the default (i.e. most recent)
version <- versions[versions$default,]$version[1]
}
print(paste('Retrieving compendium version',version))
dat <-.getCompendiumData(version, bfc)
dat <-.getProjectData(version, 'compendium', bfc)
coldat <- .getCompendiumColdata(version, bfc)

sampnames <- dat[[2]]
Expand Down
Loading