Skip to content

Commit b14deb6

Browse files
lilyz-ai and claude committed
feat: make HF weights sync non-blocking with K8s init container
ensure_model_weights_available is now synchronous — it returns the expected checkpoint path immediately and fires a background asyncio task to sync weights from HuggingFace Hub. An init container is injected into the K8s deployment to poll storage until the weights are present before the main container starts. LLMMetadata gains an hf_weights_syncing flag to signal this flow downstream. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 3e7a2a4 commit b14deb6

5 files changed

Lines changed: 114 additions & 34 deletions

File tree

model-engine/model_engine_server/domain/entities/llm_entity.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,4 @@ class LLMMetadata:
3131
quantize: Optional[Quantization] = None
3232
checkpoint_path: Optional[str] = None
3333
chat_template_override: Optional[str] = None
34+
hf_weights_syncing: bool = False

model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,8 @@ def load_model_weights_sub_commands_s3(
652652
s5cmd = "./s5cmd"
653653

654654
checkpoint_files = self.llm_artifact_gateway.list_files(checkpoint_path)
655-
validate_checkpoint_files(checkpoint_files)
655+
if checkpoint_files:
656+
validate_checkpoint_files(checkpoint_files)
656657

657658
# filter to configs ('*.model' and '*.json') and weights ('*.safetensors')
658659
# For models that are not supported by transformers directly, we need to include '*.py' and '*.bin'
@@ -1389,18 +1390,20 @@ async def execute(
13891390
"Multinode endpoints are only supported for VLLM models."
13901391
)
13911392

1392-
# Resolve checkpoint path: auto-download from HF Hub to remote storage if not cached
1393+
# Resolve checkpoint path: fires background sync and returns expected path immediately
13931394
checkpoint_path = request.checkpoint_path
1395+
hf_weights_syncing = False
13941396
if (
13951397
checkpoint_path is None
13961398
and request.source == LLMSource.HUGGING_FACE
13971399
and self.model_weights_manager is not None
13981400
):
13991401
models_info = SUPPORTED_MODELS_INFO.get(request.model_name)
14001402
if models_info and models_info.hf_repo:
1401-
checkpoint_path = await self.model_weights_manager.ensure_model_weights_available(
1402-
hf_repo=models_info.hf_repo
1403+
checkpoint_path = self.model_weights_manager.ensure_model_weights_available(
1404+
models_info.hf_repo
14031405
)
1406+
hf_weights_syncing = True
14041407

14051408
bundle = await self.create_llm_model_bundle_use_case.execute(
14061409
user,
@@ -1447,6 +1450,7 @@ async def execute(
14471450
quantize=request.quantize,
14481451
checkpoint_path=checkpoint_path,
14491452
chat_template_override=request.chat_template_override,
1453+
hf_weights_syncing=hf_weights_syncing,
14501454
)
14511455
)
14521456

model-engine/model_engine_server/domain/use_cases/model_weights_manager.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,28 +25,35 @@ class ModelWeightsManager:
2525
def __init__(self, llm_artifact_gateway: LLMArtifactGateway):
2626
self.llm_artifact_gateway = llm_artifact_gateway
2727

28-
def _get_remote_path(self, hf_repo: str) -> str:
28+
def get_remote_path(self, hf_repo: str) -> str:
2929
prefix = hmi_config.hf_user_fine_tuned_weights_prefix.rstrip("/")
3030
return f"{prefix}/{hf_repo}"
3131

32-
async def ensure_model_weights_available(self, hf_repo: str) -> str:
32+
def ensure_model_weights_available(self, hf_repo: str) -> str:
3333
"""
34-
Ensures model weights for ``hf_repo`` are available at the configured remote path.
34+
Returns the expected remote path for ``hf_repo`` immediately and starts
35+
syncing weights from HuggingFace Hub to that path in the background.
3536
36-
If the weights are already cached (remote path is non-empty), returns immediately.
37-
Otherwise downloads from HuggingFace Hub and uploads to the remote path.
37+
If the weights are already cached the background task exits early.
38+
Callers receive the checkpoint path right away and can proceed with
39+
any following actions (e.g. endpoint creation) without blocking.
3840
3941
Args:
4042
hf_repo: HuggingFace repository ID, e.g. ``"meta-llama/Meta-Llama-3-8B"``.
4143
4244
Returns:
43-
The remote path (s3://, gs://, or https://) where the weights are stored.
45+
The remote path (s3://, gs://, or https://) where the weights will be stored.
4446
"""
45-
remote_path = self._get_remote_path(hf_repo)
47+
remote_path = self.get_remote_path(hf_repo)
48+
asyncio.create_task(self._sync_weights(hf_repo, remote_path))
49+
return remote_path
50+
51+
async def _sync_weights(self, hf_repo: str, remote_path: str) -> None:
52+
"""Downloads weights from HuggingFace Hub and uploads to remote storage if not cached."""
4653
files = self.llm_artifact_gateway.list_files(remote_path)
4754
if files:
4855
logger.info(f"Cache hit: {len(files)} files at {remote_path}")
49-
return remote_path
56+
return
5057

5158
logger.info(f"Cache miss for {hf_repo}. Downloading from HuggingFace Hub...")
5259
loop = asyncio.get_event_loop()
@@ -70,4 +77,3 @@ async def ensure_model_weights_available(self, hf_repo: str) -> str:
7077
)
7178

7279
logger.info(f"Weights for {hf_repo} uploaded to {remote_path}")
73-
return remote_path

model-engine/model_engine_server/infra/gateways/resources/k8s_endpoint_resource_delegate.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,28 @@
6262
BASE_PATH_IN_ENDPOINT = "/app"
6363

6464
DATADOG_ENV_VAR = {"DD_TRACE_ENABLED", "DD_SERVICE", "DD_ENV", "DD_VERSION", "DD_AGENT_HOST"}
65+
66+
# Key under which LLM metadata is stored in model_endpoint_record.metadata
67+
_LLM_METADATA_KEY = "_llm"
68+
69+
# Python script run by the init container to poll storage until HF weights are present.
70+
_HF_WEIGHTS_POLL_SCRIPT = """\
71+
import boto3, os, sys, time
72+
from urllib.parse import urlparse
73+
74+
cp = os.environ["CHECKPOINT_PATH"]
75+
url = urlparse(cp)
76+
bucket = url.netloc
77+
prefix = url.path.lstrip("/")
78+
s3 = boto3.client("s3")
79+
while True:
80+
resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
81+
if resp.get("Contents"):
82+
print(f"Model weights ready at {cp}", flush=True)
83+
sys.exit(0)
84+
print(f"Waiting for model weights at {cp}...", flush=True)
85+
time.sleep(30)
86+
"""
6587
LWS_DEFAULT_ENV_VAR = {
6688
"K8S_OWN_POD_NAME",
6789
"K8S_OWN_NAMESPACE",
@@ -339,6 +361,42 @@ def add_pod_metadata_env_to_container(container: Dict[str, Any]) -> None:
339361
)
340362

341363

364+
def add_hf_weights_init_container(
365+
deployment_template: Dict[str, Any],
366+
checkpoint_path: str,
367+
) -> None:
368+
"""Prepend an init container that polls storage until HF weights are present.
369+
370+
Uses the forwarder image (model-engine gateway image, which has Python and
371+
boto3) so no additional image pull is required. Authentication relies on
372+
the pod's service account (IRSA / workload-identity).
373+
"""
374+
containers = deployment_template["spec"]["template"]["spec"]["containers"]
375+
# Prefer the forwarder container image; fall back to the first container.
376+
forwarder_image = next(
377+
(c["image"] for c in containers if c["name"] in ("http-forwarder", "celery-forwarder")),
378+
containers[0]["image"],
379+
)
380+
381+
init_container: Dict[str, Any] = {
382+
"name": "wait-for-model-weights",
383+
"image": forwarder_image,
384+
"env": [{"name": "CHECKPOINT_PATH", "value": checkpoint_path}],
385+
"command": ["python3", "-c", _HF_WEIGHTS_POLL_SCRIPT],
386+
}
387+
388+
# Reuse the AWS config volume mount if the volume is present in the pod spec
389+
volumes = deployment_template["spec"]["template"]["spec"].get("volumes", [])
390+
if any(v["name"] == "config-volume" for v in volumes):
391+
init_container["volumeMounts"] = [
392+
{"name": "config-volume", "mountPath": "/opt/.aws/config", "subPath": "config"}
393+
]
394+
395+
deployment_template["spec"]["template"]["spec"].setdefault("initContainers", []).append(
396+
init_container
397+
)
398+
399+
342400
def add_lws_default_env_vars_to_container(container: Dict[str, Any]) -> None:
343401
container_envs = []
344402
container_envs.extend(
@@ -1657,6 +1715,9 @@ async def _create_or_update_resources(
16571715
user_container = get_main_container_from_deployment_template(deployment_template)
16581716
add_datadog_env_to_container(deployment_template, user_container)
16591717
add_pod_metadata_env_to_container(user_container)
1718+
llm_metadata = (model_endpoint_record.metadata or {}).get(_LLM_METADATA_KEY, {})
1719+
if llm_metadata.get("hf_weights_syncing") and llm_metadata.get("checkpoint_path"):
1720+
add_hf_weights_init_container(deployment_template, llm_metadata["checkpoint_path"])
16601721
await self._create_deployment(
16611722
model_endpoint_record=request.build_endpoint_request.model_endpoint_record,
16621723
deployment=deployment_template,

model-engine/tests/unit/domain/test_model_weights_manager.py

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,14 @@ async def test_cache_hit_skips_download():
4040
gateway = FakeArtifactGateway(existing_files=["model.safetensors"])
4141
manager = ModelWeightsManager(llm_artifact_gateway=gateway)
4242

43-
with patch(
44-
"model_engine_server.domain.use_cases.model_weights_manager.snapshot_download"
45-
) as mock_download:
46-
result = await manager.ensure_model_weights_available("meta-llama/Meta-Llama-3-8B")
43+
mwm_base = "model_engine_server.domain.use_cases.model_weights_manager"
44+
with (
45+
patch(f"{mwm_base}.snapshot_download") as mock_download,
46+
patch(f"{mwm_base}.asyncio.create_task") as mock_create_task,
47+
):
48+
result = manager.ensure_model_weights_available("meta-llama/Meta-Llama-3-8B")
49+
# Run the background sync task to assert on side-effects
50+
await mock_create_task.call_args[0][0]
4751

4852
mock_download.assert_not_called()
4953
assert len(gateway.uploaded) == 0
@@ -60,8 +64,10 @@ async def test_cache_hit_returns_correct_s3_path(monkeypatch):
6064
gateway = FakeArtifactGateway(existing_files=["file.bin"])
6165
manager = ModelWeightsManager(llm_artifact_gateway=gateway)
6266

63-
with patch("model_engine_server.domain.use_cases.model_weights_manager.snapshot_download"):
64-
result = await manager.ensure_model_weights_available("org/model")
67+
mwm_base = "model_engine_server.domain.use_cases.model_weights_manager"
68+
with patch(f"{mwm_base}.asyncio.create_task") as mock_create_task:
69+
result = manager.ensure_model_weights_available("org/model")
70+
await mock_create_task.call_args[0][0]
6571

6672
assert result == "s3://my-bucket/weights/org/model"
6773

@@ -77,10 +83,14 @@ async def test_cache_miss_calls_snapshot_download_and_upload(tmp_path, monkeypat
7783
gateway = FakeArtifactGateway(existing_files=[])
7884
manager = ModelWeightsManager(llm_artifact_gateway=gateway)
7985

80-
with patch(
81-
"model_engine_server.domain.use_cases.model_weights_manager.snapshot_download"
82-
) as mock_download:
83-
result = await manager.ensure_model_weights_available("org/model")
86+
mwm_base = "model_engine_server.domain.use_cases.model_weights_manager"
87+
with (
88+
patch(f"{mwm_base}.snapshot_download") as mock_download,
89+
patch(f"{mwm_base}.asyncio.create_task") as mock_create_task,
90+
):
91+
result = manager.ensure_model_weights_available("org/model")
92+
# Run the background sync task so we can assert on its side-effects
93+
await mock_create_task.call_args[0][0]
8494

8595
mock_download.assert_called_once()
8696
call_kwargs = mock_download.call_args
@@ -93,8 +103,7 @@ async def test_cache_miss_calls_snapshot_download_and_upload(tmp_path, monkeypat
93103
assert result == "s3://my-bucket/weights/org/model"
94104

95105

96-
@pytest.mark.asyncio
97-
async def test_s3_path_construction(monkeypatch):
106+
def test_s3_path_construction(monkeypatch):
98107
"""Remote path should be {prefix}/{hf_repo} with correct stripping of trailing slash."""
99108
monkeypatch.setattr(
100109
"model_engine_server.domain.use_cases.model_weights_manager.hmi_config",
@@ -103,27 +112,27 @@ async def test_s3_path_construction(monkeypatch):
103112
gateway = FakeArtifactGateway(existing_files=[])
104113
manager = ModelWeightsManager(llm_artifact_gateway=gateway)
105114

106-
path = manager._get_remote_path("myorg/mymodel")
115+
path = manager.get_remote_path("myorg/mymodel")
107116
assert path == "s3://bucket/prefix/myorg/mymodel"
108117

109118

110119
@pytest.mark.asyncio
111120
async def test_create_llm_model_endpoint_calls_weights_manager_on_hf_source():
112-
"""CreateLLMModelEndpointV1UseCase should call model_weights_manager when source is HF and checkpoint_path is None."""
121+
"""CreateLLMModelEndpointV1UseCase should call ensure_model_weights_available (sync),
122+
which returns the expected checkpoint path immediately and fires weight sync in the
123+
background. All following actions (bundle, endpoint creation) proceed without blocking."""
113124
from model_engine_server.domain.entities import LLMSource
114125
from model_engine_server.domain.use_cases.model_weights_manager import ModelWeightsManager
115126

116127
mock_manager = MagicMock(spec=ModelWeightsManager)
117-
mock_manager.ensure_model_weights_available = AsyncMock(
118-
return_value="s3://bucket/weights/huggyllama/llama-7b"
128+
mock_manager.ensure_model_weights_available.return_value = (
129+
"s3://bucket/weights/huggyllama/llama-7b"
119130
)
120131

121132
# Use a real SUPPORTED_MODELS_INFO entry: "llama-2-7b" -> "huggyllama/llama-7b"
122133
from tests.unit.conftest import FakeLLMArtifactGateway
123134

124135
fake_gateway = FakeLLMArtifactGateway()
125-
# Ensure the resolved checkpoint path is found in the fake bucket
126-
fake_gateway.s3_bucket["s3://bucket/weights/huggyllama/llama-7b"] = ["model.safetensors"]
127136

128137
from model_engine_server.domain.use_cases.llm_model_endpoint_use_cases import (
129138
CreateLLMModelEndpointV1UseCase,
@@ -204,9 +213,8 @@ async def test_create_llm_model_endpoint_calls_weights_manager_on_hf_source():
204213
mock_authz.return_value.get_s3_bucket_for_user = MagicMock(return_value="test-bucket")
205214
await use_case.execute(user=user, request=request)
206215

207-
mock_manager.ensure_model_weights_available.assert_called_once_with(
208-
hf_repo="huggyllama/llama-7b"
209-
)
216+
# ensure_model_weights_available is called synchronously — no await, no blocking
217+
mock_manager.ensure_model_weights_available.assert_called_once_with("huggyllama/llama-7b")
210218
# Verify that the resolved checkpoint path was forwarded to the bundle use case
211219
bundle_call_kwargs = mock_bundle_use_case.execute.call_args.kwargs
212220
assert bundle_call_kwargs["checkpoint_path"] == "s3://bucket/weights/huggyllama/llama-7b"

0 commit comments

Comments (0)