modelscope · Yunnglin · Jun 26, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/cookbook/client/server/megatron/entrypoint.sh b/cookbook/client/server/megatron/entrypoint.sh
@@ -11,10 +11,12 @@ TWINKLE_WORK_DIR="${TWINKLE_WORK_DIR:-/dashscope/caches/application/twinkle}"
 TEMP_DIR="${TWINKLE_TEMP_DIR:-/dashscope/caches/application/ray_logs}"
 LOG_FILE="$TWINKLE_WORK_DIR/run.log"
 TWINKLE_HEALTH_URL="${TWINKLE_HEALTH_URL:-http://127.0.0.1:9000/api/v1/healthz}"
+TWINKLE_DEEP_HEALTH_URL="${TWINKLE_DEEP_HEALTH_URL:-http://127.0.0.1:9000/api/v1/twinkle/healthz/deep}"
 TWINKLE_WATCHDOG_INTERVAL_SECONDS="${TWINKLE_WATCHDOG_INTERVAL_SECONDS:-10}"
 TWINKLE_WATCHDOG_FAILURE_THRESHOLD="${TWINKLE_WATCHDOG_FAILURE_THRESHOLD:-3}"
 TWINKLE_RAY_GRACE_SECONDS="${TWINKLE_RAY_GRACE_SECONDS:-30}"
 TWINKLE_HEALTH_GRACE_SECONDS="${TWINKLE_HEALTH_GRACE_SECONDS:-${TWINKLE_WATCHDOG_STARTUP_GRACE_SECONDS:-300}}"
+TWINKLE_DEEP_HEALTH_GRACE_SECONDS="${TWINKLE_DEEP_HEALTH_GRACE_SECONDS:-${TWINKLE_HEALTH_GRACE_SECONDS:-300}}"
 RESTART_BACKOFF_SECONDS="${TWINKLE_ENTRYPOINT_RESTART_BACKOFF_SECONDS:-10}"
 
 CHILD_PID=""
@@ -58,6 +60,7 @@ validate_entrypoint_config() {
     require_positive_int "TWINKLE_WATCHDOG_FAILURE_THRESHOLD" "$TWINKLE_WATCHDOG_FAILURE_THRESHOLD"
     require_non_negative_int "TWINKLE_RAY_GRACE_SECONDS" "$TWINKLE_RAY_GRACE_SECONDS"
     require_non_negative_int "TWINKLE_HEALTH_GRACE_SECONDS" "$TWINKLE_HEALTH_GRACE_SECONDS"
+    require_non_negative_int "TWINKLE_DEEP_HEALTH_GRACE_SECONDS" "$TWINKLE_DEEP_HEALTH_GRACE_SECONDS"
     require_non_negative_int "TWINKLE_ENTRYPOINT_RESTART_BACKOFF_SECONDS" "$RESTART_BACKOFF_SECONDS"
 
     require_command timeout
@@ -72,21 +75,22 @@ validate_entrypoint_config() {
 }
 
 check_http_health() {
+    local url="${1:-$TWINKLE_HEALTH_URL}"
     if command -v curl &> /dev/null; then
-        curl -fsS --max-time 10 "$TWINKLE_HEALTH_URL" >/dev/null
+        curl -fsS --max-time 10 "$url" >/dev/null
         return
     fi
 
     if command -v wget &> /dev/null; then
-        wget -q --spider --timeout=10 "$TWINKLE_HEALTH_URL"
+        wget -q -O /dev/null --timeout=10 "$url"
         return
     fi
 
     local python_bin="python3"
     if ! command -v "$python_bin" &> /dev/null; then
         python_bin="python"
     fi
-    "$python_bin" - "$TWINKLE_HEALTH_URL" <<'PY'
+    "$python_bin" - "$url" <<'PY'
 import sys
 import urllib.request
 
@@ -102,6 +106,7 @@ PY
 print_watchdog_diagnostics() {
     print_warning "EntryPoint watchdog 诊断信息："
     echo "  - health url: $TWINKLE_HEALTH_URL"
+    echo "  - deep health url: $TWINKLE_DEEP_HEALTH_URL"
     echo "  - run.sh pid: ${CHILD_PID:-unset}"
     echo "  - Ray logs: $TEMP_DIR/session_latest/logs"
 
@@ -161,6 +166,9 @@ while true; do
         elif ! check_http_health; then
             WATCHDOG_FAILURE_REASON="http health check failed: $TWINKLE_HEALTH_URL"
             WATCHDOG_GRACE_SECONDS="$TWINKLE_HEALTH_GRACE_SECONDS"
+        elif ! check_http_health "$TWINKLE_DEEP_HEALTH_URL"; then
+            WATCHDOG_FAILURE_REASON="deep health check failed (model actors may be dead): $TWINKLE_DEEP_HEALTH_URL"
+            WATCHDOG_GRACE_SECONDS="$TWINKLE_DEEP_HEALTH_GRACE_SECONDS"
         fi
 
         if [ -z "$WATCHDOG_FAILURE_REASON" ]; then

diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml
@@ -54,6 +54,7 @@ applications:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
               TWINKLE_LONG_POLL_TIMEOUT: "120"
+              TWINKLE_FAIL_FAST: "0"
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   #    Used for generating text from the model (e.g., evaluating LoRA results).
@@ -66,7 +67,7 @@ applications:
       nproc_per_node: 4               # Number of GPU processes per node
       sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
       engine_args:                    # vLLM engine-specific settings
-        max_model_len: 65536           # Maximum sequence length the engine supports
+        max_model_len: 32768           # Maximum sequence length the engine supports
         gpu_memory_utilization: 0.75   # 75% of GPU memory (~60GB on an 80GB GPU; TODO confirm GPU size), leaves buffer for safety
         enable_lora: true             # Allow loading LoRA adapters during inference
         max_loras: 5                  # Max number of LoRA adapters active in vLLM at the same time
@@ -84,7 +85,7 @@ applications:
       queue_config:
         rps_limit: 20                               # Max requests per second
         tps_limit: 131072                            # Max tokens per second
-        max_input_tokens: 65536
+        max_input_tokens: 32768
     deployments:
       - name: SamplerManagement
         autoscaling_config:
@@ -97,6 +98,7 @@ applications:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
               TWINKLE_LONG_POLL_TIMEOUT: "120"
+              TWINKLE_FAIL_FAST: "0"
 
   # 2. Model Service - Hosts the base model for training.
   #    Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training
@@ -105,8 +107,8 @@ applications:
     import_path: model
     args:
       backend: megatron                          # Use Megatron-LM backend
-      model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
-      max_length: 65536                           # model max length
+      model_id: "ms://Qwen/Qwen3.6-27B"          # ModelScope model identifier
+      max_length: 32768                           # model max length
       max_loras: 3                                # Max number of LoRA adapters for the model
       nproc_per_node: 4                           # Number of GPU processes per node
       device_group:
@@ -121,7 +123,7 @@ applications:
       queue_config:
         rps_limit: 20                               # Max requests per second
         tps_limit: 131072                            # Max tokens per second
-        max_input_tokens: 65536
+        max_input_tokens: 32768
       adapter_config:
         adapter_timeout: 120                       # Seconds before idle adapter unload
         adapter_max_lifetime: 36000               # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
@@ -137,6 +139,7 @@ applications:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
               TWINKLE_LONG_POLL_TIMEOUT: "120"
+              TWINKLE_FAIL_FAST: "0"
 
   # 4. Processor Service
   - name: processor
@@ -159,3 +162,6 @@ applications:
           target_ongoing_requests: 128
         ray_actor_options:
           num_cpus: 0.1
+          runtime_env:
+            env_vars:
+              TWINKLE_FAIL_FAST: "0"
diff --git a/cookbook/client/server/megatron/server_config_4b.yaml b/cookbook/client/server/megatron/server_config_4b.yaml
@@ -31,6 +31,9 @@ applications:
           target_ongoing_requests: 128   # Target concurrent requests per replica
         ray_actor_options:
           num_cpus: 0.1                  # CPU resources allocated to this actor
+          runtime_env:
+            env_vars:
+              TWINKLE_FAIL_FAST: "0"
 
   # 2. Model Service (commented out) - Would host the base model for training.
   #    Uncomment and configure if you need a training model worker.
@@ -68,6 +71,7 @@ applications:
           runtime_env:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
+              TWINKLE_FAIL_FAST: "0"
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   #    Used for generating text from the model (e.g., evaluating LoRA results).
@@ -105,6 +109,7 @@ applications:
           runtime_env:
             env_vars:
               TWINKLE_TRUST_REMOTE_CODE: "0"
+              TWINKLE_FAIL_FAST: "0"
 
   # 4. Processor Service
   - name: processor
@@ -127,3 +132,6 @@ applications:
           target_ongoing_requests: 128
         ray_actor_options:
           num_cpus: 0.1
+          runtime_env:
+            env_vars:
+              TWINKLE_FAIL_FAST: "0"
diff --git a/cookbook/client/server/transformer/server_config.yaml b/cookbook/client/server/transformer/server_config.yaml
@@ -11,7 +11,7 @@ http_options:
 
 # Telemetry: push traces/metrics/logs to LGTM's OTel Collector via OTLP
 telemetry:
-  enabled: true
+  enabled: false
   otlp_endpoint: http://localhost:4317
 
 # Persistence configuration for ServerState (sessions, models, futures, ...).
@@ -49,6 +49,9 @@ applications:
           target_ongoing_requests: 128   # Target concurrent requests per replica
         ray_actor_options:
           num_cpus: 0.1                  # CPU resources allocated to this actor
+          runtime_env:
+            env_vars:
+              TWINKLE_FAIL_FAST: "0"
 
   # 2. Model Service - Hosts the base model for training.
   - name: models-Qwen3.5-4B
@@ -81,43 +84,45 @@ applications:
           num_cpus: 0.1
           runtime_env:
             env_vars:
-              TWINKLE_TRUST_REMOTE_CODE: "0"
+              TWINKLE_TRUST_REMOTE_CODE: "1"
+              TWINKLE_FAIL_FAST: "0"
 
   # 3. Sampler Service - Runs inference / sampling using vLLM engine
   #    Used for generating text from the model (e.g., evaluating LoRA results).
-  # - name: sampler-Qwen3.5-4B
-  #   route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
-  #   import_path: sampler
-  #   args:
-  #     model_id: "ms://Qwen/Qwen3.5-4B"   # ModelScope model identifier
-  #     nproc_per_node: 2               # Number of GPU processes per node
-  #     sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
-  #     engine_args:                    # vLLM engine-specific settings
-  #       max_model_len: 4096           # Maximum sequence length the engine supports
-  #       gpu_memory_utilization: 0.5   # Fraction of GPU memory to use (0.0-1.0)
-  #       enable_lora: true             # Allow loading LoRA adapters during inference
-  #       logprobs_mode: processed_logprobs # Logprobs mode for sampling results
-  #     device_group:                   # Logical device group for the sampler
-  #       name: sampler
-  #       ranks: 1                    # Number of GPUs to use
-  #       device_type: cuda
-  #     device_mesh:
-  #       device_type: cuda
-  #       dp_size: 1
-  #     queue_config:
-  #       rps_limit: 100                             # Max requests per second
-  #       tps_limit: 100000                           # Max tokens per second
-  #   deployments:
-  #     - name: SamplerManagement
-  #       autoscaling_config:
-  #         min_replicas: 1
-  #         max_replicas: 1
-  #         target_ongoing_requests: 16
-  #       ray_actor_options:
-  #         num_cpus: 0.1
-  #         runtime_env:
-  #           env_vars:
-  #             TWINKLE_TRUST_REMOTE_CODE: "0"
+  - name: sampler-Qwen3.5-4B
+    route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
+    import_path: sampler
+    args:
+      model_id: "ms://Qwen/Qwen3.5-4B"   # ModelScope model identifier
+      nproc_per_node: 1               # Number of GPU processes per node
+      sampler_type: vllm              # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
+      engine_args:                    # vLLM engine-specific settings
+        max_model_len: 4096           # Maximum sequence length the engine supports
+        gpu_memory_utilization: 0.5   # Fraction of GPU memory to use (0.0-1.0)
+        enable_lora: true             # Allow loading LoRA adapters during inference
+        logprobs_mode: processed_logprobs # Logprobs mode for sampling results
+      device_group:                   # Logical device group for the sampler
+        name: sampler
+        ranks: 1                    # Number of GPUs to use
+        device_type: cuda
+      device_mesh:
+        device_type: cuda
+        dp_size: 1
+      queue_config:
+        rps_limit: 100                             # Max requests per second
+        tps_limit: 100000                           # Max tokens per second
+    deployments:
+      - name: SamplerManagement
+        autoscaling_config:
+          min_replicas: 1
+          max_replicas: 1
+          target_ongoing_requests: 16
+        ray_actor_options:
+          num_cpus: 0.1
+          runtime_env:
+            env_vars:
+              TWINKLE_TRUST_REMOTE_CODE: "1"
+              TWINKLE_FAIL_FAST: "0"
 
   # 4. Processor Service
   - name: processor
@@ -140,3 +145,6 @@ applications:
           target_ongoing_requests: 128
         ray_actor_options:
           num_cpus: 0.1
+          runtime_env:
+            env_vars:
+              TWINKLE_FAIL_FAST: "0"
diff --git a/cookbook/client/server/transformer/server_e2e.py b/cookbook/client/server/transformer/server_e2e.py
diff --git a/cookbook/client/tinker/modelscope/dpo.py b/cookbook/client/tinker/modelscope/dpo.py
@@ -51,7 +51,7 @@
 max_length = 2048
 lora_rank = 8
 system_prompt = 'You are a helpful assistant.'
-use_swanlab = True
+use_swanlab = False
 
 
 # ---------------------------------------------------------------------------

diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py
@@ -165,7 +165,7 @@ def main():
         if step % SYNC_INTERVAL == 0:
             logger.info(f'Step {step}: Saving weights for sampler...')
 
-            sampling_client = (training_client.save_weights_and_get_sampling_client(name=f'GSM8K-step-{step}'))
+            sampling_client = training_client.save_weights_and_get_sampling_client()
             logger.info(f'Step {step}: Sampling client ready')
 
         if sampling_client is None:

diff --git a/cookbook/client/tinker/self_host/short_math_grpo.py b/cookbook/client/tinker/self_host/short_math_grpo.py
@@ -165,7 +165,7 @@ def main():
         if step % SYNC_INTERVAL == 0:
             logger.info(f'Step {step}: Saving weights for sampler...')
 
-            sampling_client = (training_client.save_weights_and_get_sampling_client(name=f'GSM8K-step-{step}'))
+            sampling_client = training_client.save_weights_and_get_sampling_client()
             logger.info(f'Step {step}: Sampling client ready')
 
         if sampling_client is None: