diff --git a/Dockerfile b/Dockerfile index 931f0ca..7ef1d61 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,9 +13,4 @@ RUN uv sync COPY --chmod=755 . . # Container start script -CMD ["uv", "run", "gunicorn", "main:app", \ - "--timeout", "300", \ - "--graceful-timeout", "30", \ - "-k", "uvicorn.workers.UvicornWorker", \ - "-w", "1", \ - "-b", "0.0.0.0:5000"] \ No newline at end of file +CMD ["uv", "run", "gunicorn", "main:app", "-k", "uvicorn.workers.UvicornWorker", "-w", "2", "--bind", "0.0.0.0:5000", "--timeout", "30", "--graceful-timeout", "15", "--max-requests", "1000", "--max-requests-jitter", "100", "--keep-alive", "5", "--access-logfile", "-", "--error-logfile", "-"] \ No newline at end of file diff --git a/README.md b/README.md index 7cd85ed..68a60ab 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,18 @@ # Wikidata Textifier -**Wikidata Textifier** is an API that transforms Wikidata entities into compact outputs for LLM and GenAI use cases. -It resolves missing labels for properties and claim values using the Wikidata Action API and caches labels to reduce repeated lookups. +**Wikidata Textifier** is an API that transforms entities into compact outputs for LLM and GenAI use cases. +It resolves missing labels for properties and claim values using the Wikidata/Wikibase Action API and caches labels to reduce repeated lookups. Live API: [wd-textify.wmcloud.org](https://wd-textify.wmcloud.org/) \ API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs) ## Features -- Textify Wikidata entities as `json`, `text`, or `triplet`. +- Textify entities as `json`, `text`, or `triplet`. - Resolve labels for linked entities and properties. - Cache labels in MariaDB for faster repeated requests. - Support multilingual output with fallback language support. -- Avoid SPARQL and use Wikidata Action API / EntityData endpoints. +- Avoid SPARQL and use Wikibase Action API / EntityData endpoints. ## Output Formats @@ -20,6 +20,11 @@ API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs) - `text`: Readable summary including label, description, aliases, and attributes. - `triplet`: Triplet-style lines with labels and IDs for graph-style traversal. +## Future Plan + +- Replace Action API with GraphQL once Wikibase GraphQL is available for Wikibases: + [Wikidata: Wikibase GraphQL](https://www.wikidata.org/wiki/Wikidata:Wikibase_GraphQL) + ## API ### `GET /` @@ -28,7 +33,7 @@ API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs) | Name | Type | Required | Description | |---|---|---|---| -| `id` | string | Yes | Comma-separated Wikidata IDs (for example: `Q42` or `Q42,Q2`). | +| `id` | string | Yes | Comma-separated entity IDs (for example: `Q42` or `Q42,Q2`). | | `pid` | string | No | Comma-separated property IDs to filter claims (for example: `P31,P279`). | | `lang` | string | No | Preferred language code (default: `en`). | | `fallback_lang` | string | No | Fallback language code (default: `en`). | @@ -37,6 +42,7 @@ API Docs: [wd-textify.wmcloud.org/docs](https://wd-textify.wmcloud.org/docs) | `all_ranks` | bool | No | Include all statement ranks instead of preferred/normal filtering (default: `false`). | | `qualifiers` | bool | No | Include qualifiers in claim values (default: `true`). | | `references` | bool | No | Include references in claim values (default: `false`). | +| `action_api_url` | string | No | Action API URL (default: `https://www.wikidata.org/w/api.php`). | #### Example requests diff --git a/docker-compose.yml b/docker-compose.yml index 594dae4..c697cc7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,8 +13,6 @@ services: volumes: - ./data/mysql:/var/lib/mysql - ./docker-entrypoint-initdb:/docker-entrypoint-initdb.d - ports: - - "3306:3306" healthcheck: test: ["CMD-SHELL", "mariadb-admin ping -h 127.0.0.1 -u root -p$${MARIADB_ROOT_PASSWORD} --silent"] interval: 5s diff --git a/main.py b/main.py index 9ffe3f2..db154ab 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -"""FastAPI application that exposes Wikidata textification endpoints.""" +"""FastAPI application that exposes Wikidata/Wikibase textification endpoints.""" import os import time @@ -9,7 +9,7 @@ from fastapi.middleware.cors import CORSMiddleware from src import utils -from src.Normalizer import JSONNormalizer, TTLNormalizer +from src.Normalizer import JSONNormalizer from src.WikidataLabel import LazyLabelFactory, WikidataLabel # Start Fastapi app @@ -45,7 +45,7 @@ async def startup(): "/", responses={ 200: { - "description": "Returns a list of relevant Wikidata property PIDs with similarity scores", + "description": "Returns textified entities keyed by requested IDs", "content": { "application/json": { "example": [ @@ -57,8 +57,21 @@ async def startup(): }, }, 422: { - "description": "Missing or invalid query parameter", - "content": {"application/json": {"example": {"detail": "Invalid format specified"}}}, + "description": "Validation error for missing or invalid query parameters", + "content": { + "application/json": { + "example": { + "detail": [ + { + "type": "missing", + "loc": ["query", "id"], + "msg": "Field required", + "input": None, + } + ] + } + } + }, }, }, ) @@ -74,15 +87,16 @@ async def get_textified_wd( all_ranks: bool = False, qualifiers: bool = True, fallback_lang: str = "en", + action_api_url: str = "https://www.wikidata.org/w/api.php", ): - """Retrieve Wikidata entities as structured JSON, natural text, or triplet lines. + """Retrieve entities as structured JSON, natural text, or triplet lines. This endpoint fetches one or more entities, resolves missing labels, and normalizes claims into a compact representation suitable for downstream LLM use. **Args:** - - **id** (str): Comma-separated Wikidata IDs to fetch (for example: `"Q42"` or `"Q42,Q2"`). + - **id** (str): Comma-separated entity IDs to fetch (for example: `"Q42"` or `"Q42,Q2"`). - **pid** (str, optional): Comma-separated property IDs used to filter returned claims (for example: `"P31,P279"`). - **lang** (str): Preferred language code for labels and formatted values. - **format** (str): Output format. One of `"json"`, `"text"`, or `"triplet"`. @@ -91,6 +105,8 @@ async def get_textified_wd( - **all_ranks** (bool): If `true`, include preferred, normal, and deprecated statement ranks. - **qualifiers** (bool): If `true`, include qualifiers for claim values. - **fallback_lang** (str): Fallback language used when `lang` is unavailable. + - **action_api_url** (str): Action API URL + (default: `https://www.wikidata.org/w/api.php`). **Returns:** @@ -107,74 +123,44 @@ async def get_textified_wd( filter_pids = [p.strip() for p in pid.split(",")] qids = [q.strip() for q in id.split(",")] - label_factory = LazyLabelFactory(lang=lang, fallback_lang=fallback_lang) + label_factory = LazyLabelFactory(lang=lang, fallback_lang=fallback_lang, wb_url=action_api_url) + # JSON is used with Action API for bulk retrieval entities = {} - if len(qids) == 1: - # When one QID is requested, TTL is used - try: - entity_data = utils.get_wikidata_ttl_by_id(qids[0], lang=lang) - except requests.HTTPError: - entity_data = None - - if not entity_data: - response = "ID not found" - raise HTTPException(status_code=404, detail=response) - - entity_data = TTLNormalizer( - entity_id=qids[0], - ttl_text=entity_data, + try: + entity_data = utils.get_wikidata_json_by_ids(qids, action_api_url=action_api_url) + except requests.HTTPError: + entity_data = None + if not entity_data: + response = "IDs not found" + raise HTTPException(status_code=404, detail=response) + + entity_data = { + qid: JSONNormalizer( + entity_id=qid, + entity_json=entity_data[qid], lang=lang, fallback_lang=fallback_lang, label_factory=label_factory, debug=False, ) - - entities = { - qids[0]: entity_data.normalize( - external_ids=external_ids, - all_ranks=all_ranks, - references=references, - filter_pids=filter_pids, - qualifiers=qualifiers, - ) - } - else: - # JSON is used with Action API for bulk retrieval - try: - entity_data = utils.get_wikidata_json_by_ids(qids) - except requests.HTTPError: - entity_data = None - if not entity_data: - response = "IDs not found" - raise HTTPException(status_code=404, detail=response) - - entity_data = { - qid: JSONNormalizer( - entity_id=qid, - entity_json=entity_data[qid], - lang=lang, - fallback_lang=fallback_lang, - label_factory=label_factory, - debug=False, - ) - if entity_data.get(qid) - else None - for qid in qids - } - - entities = { - qid: entity.normalize( - external_ids=external_ids, - all_ranks=all_ranks, - references=references, - filter_pids=filter_pids, - qualifiers=qualifiers, - ) - if entity - else None - for qid, entity in entity_data.items() - } + if entity_data.get(qid) + else None + for qid in qids + } + + entities = { + qid: entity.normalize( + external_ids=external_ids, + all_ranks=all_ranks, + references=references, + filter_pids=filter_pids, + qualifiers=qualifiers, + ) + if entity + else None + for qid, entity in entity_data.items() + } return_data = {} for qid, entity in entities.items(): diff --git a/src/Normalizer/JSONNormalizer.py b/src/Normalizer/JSONNormalizer.py index 9742d89..fa5b0f9 100644 --- a/src/Normalizer/JSONNormalizer.py +++ b/src/Normalizer/JSONNormalizer.py @@ -1,4 +1,4 @@ -"""Normalize Wikidata Action API JSON into internal textifier objects.""" +"""Normalize Wikidata/Wikibase Action API JSON into internal textifier objects.""" from __future__ import annotations diff --git a/src/Textifier/WikidataTextifier.py b/src/Textifier/WikidataTextifier.py index b74c26d..1b42a4e 100644 --- a/src/Textifier/WikidataTextifier.py +++ b/src/Textifier/WikidataTextifier.py @@ -52,10 +52,7 @@ def __bool__(self) -> bool: def to_json(self) -> Optional[str]: """Serialize to a JSON object.""" - return { - "text": self.text, - "lang": self.lang - } + return {"text": self.text, "lang": self.lang} @dataclass(slots=True) @@ -73,10 +70,7 @@ def __str__(self) -> str: def __bool__(self) -> bool: """Return whether both latitude and longitude are present.""" # coordinates are meaningful if we have both lat/lon - return ( - self.latitude is not None - and self.longitude is not None - ) + return self.latitude is not None and self.longitude is not None def to_json(self) -> Dict[str, Any]: """Serialize coordinates to a JSON object.""" @@ -164,11 +158,7 @@ class WikidataEntity: def __bool__(self) -> bool: """Return whether this entity has a usable id and label.""" - return ( - bool(self.id) - and self.label is not None - and str(self.label) != "" - ) + return bool(self.id) and self.label is not None and str(self.label) != "" def to_text(self, lang="en") -> str: """Render the entity into a readable text.""" @@ -184,7 +174,7 @@ def to_text(self, lang="en") -> str: string += f" {lang_var[', '].join(map(str, self.aliases))}" attributes = [c.to_text(lang) for c in self.claims] - attributes= [a for a in attributes if a] # filter out empty attributes + attributes = [a for a in attributes if a] # filter out empty attributes if len(attributes) > 0: attributes = "\n- ".join(attributes) @@ -236,10 +226,7 @@ class WikidataClaim: def __bool__(self) -> bool: """Return whether this claim contains a value.""" - return ( - bool(self.property) - and any(bool(v) for v in self.values) - ) + return bool(self.property) and any(bool(v) for v in self.values) def to_text(self, lang="en") -> str: """Render the claim into a readable text.""" @@ -302,8 +289,8 @@ class WikidataClaimValue: value: Optional[ Union[ WikidataEntity, WikidataQuantity, WikidataTime, WikidataCoordinates, WikidataText, WikidataMonolingualText - ] - ] = None + ] + ] = None qualifiers: List[WikidataClaim] = field(default_factory=list) references: List[List[WikidataClaim]] = field(default_factory=list) rank: Optional[str] = None # preferred|normal|deprecated diff --git a/src/WikidataLabel.py b/src/WikidataLabel.py index 3ed8f2b..4ed738a 100644 --- a/src/WikidataLabel.py +++ b/src/WikidataLabel.py @@ -1,4 +1,4 @@ -"""Label cache and lazy label resolution for Wikidata entities.""" +"""Label cache and lazy label resolution for Wikibase entities.""" import json import os @@ -20,6 +20,7 @@ LABEL_TTL_DAYS = int(os.environ.get("LABEL_TTL_DAYS", "90")) LABEL_MAX_ROWS = int(os.environ.get("LABEL_MAX_ROWS", "10000000")) REQUEST_TIMEOUT_SECONDS = float(os.environ.get("REQUEST_TIMEOUT_SECONDS", "15")) +DEFAULT_WIKIBASE_URL = "https://www.wikidata.org" DATABASE_URL = f"mariadb+pymysql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}?charset=utf8mb4" @@ -36,10 +37,11 @@ class WikidataLabel(Base): - """Database cache for multilingual Wikidata labels.""" + """Database cache for multilingual Wikibase labels.""" __tablename__ = "labels" id = Column(String(64), primary_key=True) + wikibase_url = Column(String(255), primary_key=True, default=DEFAULT_WIKIBASE_URL) labels = Column(JSON, default=dict) date_added = Column(DateTime, default=datetime.now, index=True) @@ -48,17 +50,74 @@ def initialize_database(): """Create tables if they do not already exist.""" try: Base.metadata.create_all(engine) + WikidataLabel._migrate_labels_table_for_wikibase() return True except Exception as e: print(f"Error while initializing labels database: {e}") return False @staticmethod - def add_bulk_labels(data): + def _migrate_labels_table_for_wikibase(): + """Ensure the labels table supports cache partitioning per Wikibase.""" + with engine.begin() as connection: + has_wikibase_url = connection.execute( + text( + """ + SELECT COUNT(*) + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = :schema_name + AND TABLE_NAME = 'labels' + AND COLUMN_NAME = 'wikibase_url' + """ + ), + {"schema_name": DB_NAME}, + ).scalar() + + if not has_wikibase_url: + connection.execute( + text( + f""" + ALTER TABLE labels + ADD COLUMN wikibase_url VARCHAR(255) NOT NULL DEFAULT '{DEFAULT_WIKIBASE_URL}' + """ + ) + ) + + primary_key_cols = [ + row[0] + for row in connection.execute( + text( + """ + SELECT COLUMN_NAME + FROM information_schema.KEY_COLUMN_USAGE + WHERE TABLE_SCHEMA = :schema_name + AND TABLE_NAME = 'labels' + AND CONSTRAINT_NAME = 'PRIMARY' + ORDER BY ORDINAL_POSITION + """ + ), + {"schema_name": DB_NAME}, + ).fetchall() + ] + + if primary_key_cols == ["id"]: + connection.execute( + text( + """ + ALTER TABLE labels + DROP PRIMARY KEY, + ADD PRIMARY KEY (id, wikibase_url) + """ + ) + ) + + @staticmethod + def add_bulk_labels(data, wb_url: str = DEFAULT_WIKIBASE_URL): """Insert or update multiple label records. Args: data (list[dict]): Records containing at least ``id`` and ``labels`` keys. + wb_url (str): Wikibase URL used as cache partition key. Returns: bool: ``True`` when the operation succeeds, otherwise ``False``. @@ -66,22 +125,34 @@ def add_bulk_labels(data): if not data: return True - for i in range(len(data)): - data[i]["date_added"] = datetime.now() - if isinstance(data[i].get("labels"), dict): - data[i]["labels"] = json.dumps(data[i]["labels"], ensure_ascii=False, separators=(",", ":")) + normalized_wb_url = WikidataLabel._normalize_wb_url(wb_url) + rows = [] + for row in data: + normalized_row = { + "id": row["id"], + "wikibase_url": WikidataLabel._normalize_wb_url(row.get("wikibase_url", normalized_wb_url)), + "labels": row.get("labels", {}), + "date_added": datetime.now(), + } + if isinstance(normalized_row["labels"], dict): + normalized_row["labels"] = json.dumps( + normalized_row["labels"], + ensure_ascii=False, + separators=(",", ":"), + ) + rows.append(normalized_row) with Session() as session: try: session.execute( text(""" - INSERT INTO labels (id, labels, date_added) - VALUES (:id, :labels, :date_added) + INSERT INTO labels (id, wikibase_url, labels, date_added) + VALUES (:id, :wikibase_url, :labels, :date_added) ON DUPLICATE KEY UPDATE labels = VALUES(labels), date_added = VALUES(date_added) """), - data, + rows, ) session.commit() @@ -92,19 +163,24 @@ def add_bulk_labels(data): return False @staticmethod - def add_label(id, labels): + def add_label(id, labels, wb_url: str = DEFAULT_WIKIBASE_URL): """Insert or update labels for a single entity. Args: id (str): Entity ID. labels (dict): Mapping of language code to label text. + wb_url (str): Wikibase URL used as cache partition key. Returns: bool: ``True`` when the operation succeeds, otherwise ``False``. """ with Session() as session: try: - new_entry = WikidataLabel(id=id, labels=labels) + new_entry = WikidataLabel( + id=id, + wikibase_url=WikidataLabel._normalize_wb_url(wb_url), + labels=labels, + ) session.add(new_entry) session.commit() return True @@ -114,22 +190,28 @@ def add_label(id, labels): return False @staticmethod - def get_labels(id): + def get_labels(id, wb_url: str = DEFAULT_WIKIBASE_URL): """Retrieve cached labels for one entity, with API fallback. Args: id (str): Entity ID. + wb_url (str): Wikibase URL used as cache partition key. Returns: dict | None: Cached or fetched labels for the entity, if available. """ + normalized_wb_url = WikidataLabel._normalize_wb_url(wb_url) try: with Session() as session: # Get labels that are less than LABEL_TTL_DAYS old date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) item = ( session.query(WikidataLabel) - .filter(WikidataLabel.id == id, WikidataLabel.date_added >= date_limit) + .filter( + WikidataLabel.id == id, + WikidataLabel.wikibase_url == normalized_wb_url, + WikidataLabel.date_added >= date_limit, + ) .first() ) @@ -138,18 +220,19 @@ def get_labels(id): except Exception as e: print(f"Error while fetching cached label {id}: {e}") - labels = WikidataLabel._get_labels_wdapi(id).get(id) + labels = WikidataLabel._get_labels_wdapi(id, wb_url=normalized_wb_url).get(id) if labels: - WikidataLabel.add_label(id, labels) + WikidataLabel.add_label(id, labels, wb_url=normalized_wb_url) return labels @staticmethod - def get_bulk_labels(ids): + def get_bulk_labels(ids, wb_url: str = DEFAULT_WIKIBASE_URL): """Retrieve cached labels for multiple entities, with API fallback. Args: ids (list[str]): Entity IDs to fetch. + wb_url (str): Wikibase URL used as cache partition key. Returns: dict[str, dict]: Mapping of each requested ID to its labels. @@ -157,6 +240,7 @@ def get_bulk_labels(ids): if not ids: return {} + normalized_wb_url = WikidataLabel._normalize_wb_url(wb_url) labels = {} try: with Session() as session: @@ -164,7 +248,11 @@ def get_bulk_labels(ids): date_limit = datetime.now() - timedelta(days=LABEL_TTL_DAYS) rows = ( session.query(WikidataLabel.id, WikidataLabel.labels) - .filter(WikidataLabel.id.in_(ids), WikidataLabel.date_added >= date_limit) + .filter( + WikidataLabel.id.in_(ids), + WikidataLabel.wikibase_url == normalized_wb_url, + WikidataLabel.date_added >= date_limit, + ) .all() ) labels = {id: labels for id, labels in rows} @@ -174,12 +262,13 @@ def get_bulk_labels(ids): # Fallback when labels are missing from the database missing_ids = set(ids) - set(labels.keys()) if missing_ids: - missing_labels = WikidataLabel._get_labels_wdapi(missing_ids) + missing_labels = WikidataLabel._get_labels_wdapi(missing_ids, wb_url=normalized_wb_url) labels.update(missing_labels) # Cache labels WikidataLabel.add_bulk_labels( - [{"id": entity_id, "labels": entity_labels} for entity_id, entity_labels in missing_labels.items()] + [{"id": entity_id, "labels": entity_labels} for entity_id, entity_labels in missing_labels.items()], + wb_url=normalized_wb_url, ) return labels @@ -214,11 +303,13 @@ def delete_old_labels(): DELETE l FROM labels AS l JOIN ( - SELECT id + SELECT id, wikibase_url FROM labels ORDER BY date_added ASC LIMIT :rows_to_delete - ) AS old_labels ON l.id = old_labels.id + ) AS old_labels + ON l.id = old_labels.id + AND l.wikibase_url = old_labels.wikibase_url """), {"rows_to_delete": rows_to_delete}, ) @@ -232,16 +323,17 @@ def delete_old_labels(): return False @staticmethod - def _get_labels_wdapi(ids): - """Retrieve labels from the Wikidata API. + def _get_labels_wdapi(ids, wb_url: str = DEFAULT_WIKIBASE_URL): + """Retrieve labels from the Wikibase Action API. Args: ids (list[str] | str): IDs as a list or ``|``-separated string. + wb_url (str): Wikibase URL to query. Returns: dict[str, dict]: Mapping of each ID to compressed labels. """ - entities_data = get_wikidata_json_by_ids(ids, props="labels") + entities_data = get_wikidata_json_by_ids(ids, action_api_url=wb_url, props="labels") entities_data = WikidataLabel._compress_labels(entities_data) return entities_data @@ -250,7 +342,7 @@ def _compress_labels(data): """Compress API labels by extracting each language's ``value`` field. Args: - data (dict): Raw entities payload from the Wikidata API. + data (dict): Raw entities payload from the Wikibase Action API. Returns: dict[str, dict]: Mapping of entity ID to ``{lang: label}``. @@ -263,6 +355,12 @@ def _compress_labels(data): new_labels[qid] = {} return new_labels + @staticmethod + def _normalize_wb_url(wb_url: str) -> str: + """Normalize a Wikibase URL for stable cache keys.""" + normalized = (wb_url or DEFAULT_WIKIBASE_URL).strip().rstrip("/") + return normalized or DEFAULT_WIKIBASE_URL + @staticmethod def get_lang_val(data, lang="en", fallback_lang=None): """Return the best label text from a labels dictionary. @@ -339,17 +437,19 @@ def __str__(self): class LazyLabelFactory: - """Create and batch-resolve lazy Wikidata labels.""" + """Create and batch-resolve lazy Wikibase labels.""" - def __init__(self, lang="en", fallback_lang="en"): + def __init__(self, lang="en", fallback_lang="en", wb_url: str = DEFAULT_WIKIBASE_URL): """Initialize a lazy label factory. Args: lang (str): Preferred language code. fallback_lang (str): Fallback language code. + wb_url (str): Wikibase URL used for label lookups. """ self.lang = lang self.fallback_lang = fallback_lang + self.wb_url = WikidataLabel._normalize_wb_url(wb_url) self._pending_ids = set() self._resolved_labels = {} @@ -371,7 +471,7 @@ def resolve_all(self): return self._pending_ids = self._pending_ids - set(self._resolved_labels.keys()) - label_data = WikidataLabel.get_bulk_labels(list(self._pending_ids)) + label_data = WikidataLabel.get_bulk_labels(list(self._pending_ids), wb_url=self.wb_url) self._resolved_labels.update(label_data) self._pending_ids.clear() diff --git a/src/__init__.py b/src/__init__.py index acaf1ea..aae4061 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,4 +1,4 @@ -"""Public package exports for Wikidata textification primitives.""" +"""Public package exports for textification primitives.""" from .Normalizer import JSONNormalizer, TTLNormalizer from .Textifier import ( diff --git a/src/utils.py b/src/utils.py index 5e125e5..d3714c5 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,4 +1,4 @@ -"""HTTP helpers and value-formatting utilities for Wikidata APIs.""" +"""HTTP helpers and value-formatting utilities for Wikidata/Wikibase APIs.""" import html import json @@ -8,6 +8,7 @@ from requests.adapters import HTTPAdapter REQUEST_TIMEOUT_SECONDS = float(os.environ.get("REQUEST_TIMEOUT_SECONDS", "15")) +USER_AGENT = os.environ.get("USER_AGENT", "Wikidata Textifier (embeddings@wikimedia.de)") SESSION = requests.Session() adapter = HTTPAdapter(pool_connections=20, pool_maxsize=20) @@ -17,27 +18,29 @@ def get_wikidata_ttl_by_id( id, + wb_url="https://www.wikidata.org", lang="en", ): - """Fetch a Wikidata entity as TTL from ``Special:EntityData``. + """Fetch an entity as TTL from ``Special:EntityData``. Args: - id (str): Wikidata entity ID, for example ``"Q42"`` or ``"P31"``. + id (str): Entity ID, for example ``"Q42"`` or ``"P31"``. + wb_url (str): Wikibase base URL (default is Wikidata ``https://www.wikidata.org``). lang (str, optional): Language code for server-side label rendering. Returns: str: TTL document for the requested entity. Raises: - requests.HTTPError: If Wikidata returns an error response. + requests.HTTPError: If the upstream Wikibase returns an error response. """ params = { "uselang": lang, } - headers = {"User-Agent": "Wikidata Textifier (embeddings@wikimedia.de)"} + headers = {"User-Agent": USER_AGENT} response = SESSION.get( - f"https://www.wikidata.org/wiki/Special:EntityData/{id}.ttl", + f"{wb_url}/wiki/Special:EntityData/{id}.ttl", params=params, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS, @@ -46,18 +49,24 @@ def get_wikidata_ttl_by_id( return response.text -def get_wikidata_json_by_ids(ids, props="labels|descriptions|aliases|claims"): - """Fetch one or more Wikidata entities from ``wbgetentities``. +def get_wikidata_json_by_ids( + ids, + action_api_url="https://www.wikidata.org/w/api.php", + props="labels|descriptions|aliases|claims", +): + """Fetch one or more entities from ``wbgetentities``. Args: ids (list[str] | str): Entity IDs as a list or ``|``-separated string. + action_api_url (str): Full Action API URL (default is + ``https://www.wikidata.org/w/api.php``). props (str): Pipe-delimited properties requested from the API. Returns: dict[str, dict]: Mapping of entity IDs to API entity payloads. Raises: - requests.HTTPError: If Wikidata returns an error response. + requests.HTTPError: If the upstream Wikibase returns an error response. """ if isinstance(ids, str): ids = ids.split("|") @@ -65,8 +74,7 @@ def get_wikidata_json_by_ids(ids, props="labels|descriptions|aliases|claims"): entities_data = {} - # Wikidata API has a limit on the number of IDs per request, - # typically 50 for wbgetentities. + # wbgetentities has a limit on number of IDs per request (typically 50). for chunk_idx in range(0, len(ids), 50): ids_chunk = ids[chunk_idx : chunk_idx + 50] params = { @@ -76,10 +84,10 @@ def get_wikidata_json_by_ids(ids, props="labels|descriptions|aliases|claims"): "format": "json", "origin": "*", } - headers = {"User-Agent": "Wikidata Textifier (embeddings@wikimedia.de)"} + headers = {"User-Agent": USER_AGENT} response = SESSION.get( - "https://www.wikidata.org/w/api.php?", + action_api_url, params=params, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS, @@ -129,7 +137,7 @@ def wikidata_time_to_text(value: dict, lang: str = "en"): "before": value.get("before", 0), "after": value.get("after", 0), "precision": value.get("precision", 10), - "calendarmodel": value.get("calendarmodel", "Q1985786"), + "calendarmodel": value.get("calendarmodel", "Q12138"), }, } diff --git a/tests/README.md b/tests/README.md index 963ec4d..b0ee2cd 100644 --- a/tests/README.md +++ b/tests/README.md @@ -46,7 +46,7 @@ uv run pytest -q tests/unit Run integration tests only: ```bash -uv run pytest -q tests/integration -m integration +uv run pytest -q tests/integration ``` Run all tests: diff --git a/tests/integration/test_live_routes.py b/tests/integration/test_live_routes.py index 912f774..6fd169a 100644 --- a/tests/integration/test_live_routes.py +++ b/tests/integration/test_live_routes.py @@ -10,7 +10,6 @@ import pymysql import pytest -pytestmark = pytest.mark.integration LOCAL_BASE_URL = "http://127.0.0.1:5000" diff --git a/tests/unit/test_routes.py b/tests/unit/test_routes.py index 3592f5c..98c0b59 100644 --- a/tests/unit/test_routes.py +++ b/tests/unit/test_routes.py @@ -9,16 +9,24 @@ from src.Textifier.WikidataTextifier import WikidataEntity -def test_get_textified_wd_uses_ttl_normalizer_for_single_qid(monkeypatch, run_async, make_request): - """Validate ``TTLNormalizer`` is used when one QID is requested.""" +def test_get_textified_wd_uses_json_normalizer_for_single_qid(monkeypatch, run_async, make_request): + """Validate ``JSONNormalizer`` is used for single-QID requests.""" calls = {} - def fake_get_ttl(qid, lang="en"): - calls["requested_qid"] = qid - return "ttl-data" + def fake_get_json( + ids, + action_api_url="https://www.wikidata.org/w/api.php", + props="labels|descriptions|aliases|claims", + ): + calls["requested_ids"] = ids + calls["action_api_url"] = action_api_url + calls["props"] = props + return { + "Q42": {"labels": {"en": {"value": "Douglas Adams"}}, "descriptions": {}, "aliases": {}, "claims": {}}, + } - class DummyTTLNormalizer: - """Minimal TTL normalizer stand-in for unit testing.""" + class DummyJSONNormalizer: + """Minimal JSON normalizer stand-in for unit testing.""" def __init__(self, **kwargs): self.entity_id = kwargs["entity_id"] @@ -27,8 +35,8 @@ def __init__(self, **kwargs): def normalize(self, **kwargs): return WikidataEntity(id=self.entity_id, label="Douglas Adams", claims=[]) - monkeypatch.setattr(main.utils, "get_wikidata_ttl_by_id", fake_get_ttl) - monkeypatch.setattr(main, "TTLNormalizer", DummyTTLNormalizer) + monkeypatch.setattr(main.utils, "get_wikidata_json_by_ids", fake_get_json) + monkeypatch.setattr(main, "JSONNormalizer", DummyJSONNormalizer) result = run_async( main.get_textified_wd( @@ -37,10 +45,12 @@ def normalize(self, **kwargs): id="Q42", pid=None, format="json", + action_api_url="https://example.wikibase.local/w/api.php", ) ) - assert calls["requested_qid"] == "Q42" + assert calls["requested_ids"] == ["Q42"] + assert calls["action_api_url"] == "https://example.wikibase.local/w/api.php" assert calls["normalizer_entity_id"] == "Q42" assert result["Q42"]["QID"] == "Q42" assert result["Q42"]["label"] == "Douglas Adams" @@ -50,7 +60,12 @@ def test_get_textified_wd_uses_json_normalizer_for_multiple_qids(monkeypatch, ru """Validate ``JSONNormalizer`` is used for multi-QID requests.""" init_calls = [] - def fake_get_json(ids): + def fake_get_json( + ids, + action_api_url="https://www.wikidata.org/w/api.php", + props="labels|descriptions|aliases|claims", + ): + del action_api_url, props return { "Q1": {"labels": {"en": {"value": "One"}}, "descriptions": {}, "aliases": {}, "claims": {}}, "Q2": {"labels": {"en": {"value": "Two"}}, "descriptions": {}, "aliases": {}, "claims": {}}, diff --git a/tests/unit/test_wikidatalabel.py b/tests/unit/test_wikidatalabel.py index feefa94..9d9e013 100644 --- a/tests/unit/test_wikidatalabel.py +++ b/tests/unit/test_wikidatalabel.py @@ -49,7 +49,8 @@ def test_get_all_missing_labels_ids_collects_nested_ids(): def test_lazy_label_factory_resolves_pending_labels_in_bulk(monkeypatch): """It should resolve pending IDs via a single bulk lookup when cast to ``str``.""" - def fake_get_bulk_labels(ids): + def fake_get_bulk_labels(ids, wb_url="https://www.wikidata.org"): + del wb_url return {"Q42": {"en": "Douglas Adams"}} monkeypatch.setattr(WikidataLabel, "get_bulk_labels", staticmethod(fake_get_bulk_labels)) @@ -58,3 +59,20 @@ def fake_get_bulk_labels(ids): lazy_label = factory.create("Q42") assert str(lazy_label) == "Douglas Adams" + + +def test_lazy_label_factory_forwards_wikibase_url(monkeypatch): + """It should forward the configured Wikibase URL to bulk label lookups.""" + calls = [] + + def fake_get_bulk_labels(ids, wb_url="https://www.wikidata.org"): + calls.append((list(ids), wb_url)) + return {"Q42": {"en": "Douglas Adams"}} + + monkeypatch.setattr(WikidataLabel, "get_bulk_labels", staticmethod(fake_get_bulk_labels)) + + factory = LazyLabelFactory(lang="en", fallback_lang="en", wb_url="https://example.wikibase.local/") + lazy_label = factory.create("Q42") + + assert str(lazy_label) == "Douglas Adams" + assert calls == [(["Q42"], "https://example.wikibase.local")]