diff --git a/data/mapping-specifications.json b/data/mapping-specifications.json index 1aa3e12..752c449 100644 --- a/data/mapping-specifications.json +++ b/data/mapping-specifications.json @@ -343,10 +343,10 @@ "metadata_completeness_score": 0.2727 }, { - "id": "https://w3id.org/sssom/mappings/67df215d-fffd-4d37-b42f-268fa6cea80c", - "license": "https://w3id.org/sssom/license/unspecified", - "type": "sssom", + "id": "http://w3id.org/sssom/commons/monarch/mappings/ncit-hp.sssom.tsv", "content_url": "http://w3id.org/sssom/commons/monarch/mappings/ncit-hp.sssom.tsv", + "type": "sssom", + "status": "fetch_error", "registries": [ { "id": "https://w3id.org/sssom/commons/monarch", @@ -354,8 +354,7 @@ "url": "https://github.com/monarch-initiative/monarch-mapping-commons" } ], - "status": "ok", - "metadata_completeness_score": 0.2727 + "metadata_completeness_score": 0.0 }, { "id": "https://w3id.org/sssom/mappings/467f578f-3319-4fb4-92fb-bddaf41b0c1f", @@ -557,11 +556,10 @@ "metadata_completeness_score": 0.3636 }, { - "id": "https://doi.org/10.5281/zenodo.6949696", - "version": "V1.1", - "license": "https://creativecommons.org/publicdomain/zero/1.0/", - "type": "sssom", + "id": "https://w3id.org/cpont/mappings/omop2obo_drug_exposure.sssom.tsv", "content_url": "https://w3id.org/cpont/mappings/omop2obo_drug_exposure.sssom.tsv", + "type": "sssom", + "status": "fetch_error", "registries": [ { "id": "https://w3id.org/cpont/mappings", @@ -569,8 +567,7 @@ "url": "https://gitlab.c-path.org/c-pathontology/mapping-commons" } ], - "status": "ok", - "metadata_completeness_score": 0.3636 + "metadata_completeness_score": 0.0 }, { "id": "https://doi.org/10.5281/zenodo.6949858", @@ -618,7 +615,8 @@ "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "status": "ok", @@ -633,7 +631,8 @@ "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "status": "ok", @@ -643,11 +642,12 @@ "id": "https://zenodo.org/records/15504009/files/mappings.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15504009/files/mappings.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -656,11 +656,12 @@ "id": "https://zenodo.org/records/15826794/files/priority.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826794/files/priority.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -669,11 +670,12 @@ "id": "https://zenodo.org/records/15826794/files/processed.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826794/files/processed.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -682,11 +684,12 @@ "id": "https://zenodo.org/records/15826794/files/raw.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826794/files/raw.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -695,11 +698,12 @@ "id": "https://zenodo.org/records/15826779/files/priority.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826779/files/priority.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -708,11 +712,12 @@ "id": "https://zenodo.org/records/15826779/files/processed.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826779/files/processed.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -721,11 +726,12 @@ "id": "https://zenodo.org/records/15826779/files/raw.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826779/files/raw.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -734,11 +740,12 @@ "id": "https://zenodo.org/records/15826768/files/priority.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826768/files/priority.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -747,11 +754,12 @@ "id": "https://zenodo.org/records/15826768/files/processed.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826768/files/processed.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -760,11 +768,12 @@ "id": "https://zenodo.org/records/15826768/files/raw.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826768/files/raw.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -773,11 +782,12 @@ "id": "https://zenodo.org/records/15826754/files/priority.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826754/files/priority.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -786,11 +796,12 @@ "id": "https://zenodo.org/records/15826754/files/processed.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826754/files/processed.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -799,11 +810,12 @@ "id": "https://zenodo.org/records/15826754/files/raw.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826754/files/raw.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -812,11 +824,12 @@ "id": "https://zenodo.org/records/15826693/files/priority.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826693/files/priority.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -825,11 +838,12 @@ "id": "https://zenodo.org/records/15826693/files/processed.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826693/files/processed.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 @@ -838,11 +852,12 @@ "id": "https://zenodo.org/records/15826693/files/raw.sssom.tsv.gz", "content_url": "https://zenodo.org/records/15826693/files/raw.sssom.tsv.gz", "type": "sssom", - "status": "nonstandard_format", + "status": "no_metadata", "registries": [ { "id": "https://github.com/biopragmatics/mapping-registry", - "name": "Biopragmatics Mapping Registry" + "name": "Biopragmatics Mapping Registry", + "url": "https://github.com/biopragmatics" } ], "metadata_completeness_score": 0.0 diff --git a/scripts/cli.py b/scripts/cli.py index 26ef11f..471389b 100644 --- a/scripts/cli.py +++ b/scripts/cli.py @@ -2,6 +2,8 @@ import yaml import requests import json +import gzip +import io import os import logging import traceback @@ -33,10 +35,18 @@ def _registry_info(registry_data: dict) -> dict: } -def _classify_error(mapping_set_uri: str, error: Exception) -> str: +def _iter_gz_lines(response: requests.Response): + """Yield decompressed text lines from a gzipped HTTP response, streaming.""" + decompressor = gzip.GzipFile(fileobj=response.raw) + reader = io.TextIOWrapper(decompressor, encoding="utf-8") + for line in reader: + yield line.rstrip("\n") + + +def _classify_error(error: Exception) -> str: """Classify a mapping set processing error into a status string.""" error_msg = str(error).lower() - if "bytes-like object" in error_msg or mapping_set_uri.endswith(".gz"): + if "bytes-like object" in error_msg: return "nonstandard_format" if "no #-commented header" in error_msg: return "no_metadata" @@ -67,7 +77,11 @@ def _process_sssom_mapping_set(mapping_set_id: str, mapping_set_uri: str) -> dic raise RuntimeError(f"HTTP {response.status_code} fetching {mapping_set_uri}") try: - meta = parse_sssom_tsv(response.iter_lines(decode_unicode=True)) + if mapping_set_uri.endswith(".gz"): + lines = _iter_gz_lines(response) + else: + lines = response.iter_lines(decode_unicode=True) + meta = parse_sssom_tsv(lines) finally: response.close() @@ -184,7 +198,7 @@ def prepare_mapping_registry(registry_file, output_file, log_file): mapping_set_id, registry_id, registry_title, ) except Exception as e: - status = _classify_error(mapping_set_uri, e) + status = _classify_error(e) stub = _stub_spec(mapping_set_id, status) stub["registries"] = [info] specifications.append(stub)