Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions cookbook/client/server/megatron/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ TWINKLE_WORK_DIR="${TWINKLE_WORK_DIR:-/dashscope/caches/application/twinkle}"
TEMP_DIR="${TWINKLE_TEMP_DIR:-/dashscope/caches/application/ray_logs}"
LOG_FILE="$TWINKLE_WORK_DIR/run.log"
TWINKLE_HEALTH_URL="${TWINKLE_HEALTH_URL:-http://127.0.0.1:9000/api/v1/healthz}"
TWINKLE_DEEP_HEALTH_URL="${TWINKLE_DEEP_HEALTH_URL:-http://127.0.0.1:9000/api/v1/twinkle/healthz/deep}"
TWINKLE_WATCHDOG_INTERVAL_SECONDS="${TWINKLE_WATCHDOG_INTERVAL_SECONDS:-10}"
TWINKLE_WATCHDOG_FAILURE_THRESHOLD="${TWINKLE_WATCHDOG_FAILURE_THRESHOLD:-3}"
TWINKLE_RAY_GRACE_SECONDS="${TWINKLE_RAY_GRACE_SECONDS:-30}"
TWINKLE_HEALTH_GRACE_SECONDS="${TWINKLE_HEALTH_GRACE_SECONDS:-${TWINKLE_WATCHDOG_STARTUP_GRACE_SECONDS:-300}}"
TWINKLE_DEEP_HEALTH_GRACE_SECONDS="${TWINKLE_DEEP_HEALTH_GRACE_SECONDS:-${TWINKLE_HEALTH_GRACE_SECONDS:-300}}"
RESTART_BACKOFF_SECONDS="${TWINKLE_ENTRYPOINT_RESTART_BACKOFF_SECONDS:-10}"

CHILD_PID=""
Expand Down Expand Up @@ -58,6 +60,7 @@ validate_entrypoint_config() {
require_positive_int "TWINKLE_WATCHDOG_FAILURE_THRESHOLD" "$TWINKLE_WATCHDOG_FAILURE_THRESHOLD"
require_non_negative_int "TWINKLE_RAY_GRACE_SECONDS" "$TWINKLE_RAY_GRACE_SECONDS"
require_non_negative_int "TWINKLE_HEALTH_GRACE_SECONDS" "$TWINKLE_HEALTH_GRACE_SECONDS"
require_non_negative_int "TWINKLE_DEEP_HEALTH_GRACE_SECONDS" "$TWINKLE_DEEP_HEALTH_GRACE_SECONDS"
require_non_negative_int "TWINKLE_ENTRYPOINT_RESTART_BACKOFF_SECONDS" "$RESTART_BACKOFF_SECONDS"

require_command timeout
Expand All @@ -72,21 +75,22 @@ validate_entrypoint_config() {
}

check_http_health() {
local url="${1:-$TWINKLE_HEALTH_URL}"
if command -v curl &> /dev/null; then
curl -fsS --max-time 10 "$TWINKLE_HEALTH_URL" >/dev/null
curl -fsS --max-time 10 "$url" >/dev/null
return
fi

if command -v wget &> /dev/null; then
wget -q --spider --timeout=10 "$TWINKLE_HEALTH_URL"
wget -q -O /dev/null --timeout=10 "$url"
return
fi

local python_bin="python3"
if ! command -v "$python_bin" &> /dev/null; then
python_bin="python"
fi
"$python_bin" - "$TWINKLE_HEALTH_URL" <<'PY'
"$python_bin" - "$url" <<'PY'
import sys
import urllib.request

Expand All @@ -102,6 +106,7 @@ PY
print_watchdog_diagnostics() {
print_warning "EntryPoint watchdog 诊断信息:"
echo " - health url: $TWINKLE_HEALTH_URL"
echo " - deep health url: $TWINKLE_DEEP_HEALTH_URL"
echo " - run.sh pid: ${CHILD_PID:-unset}"
echo " - Ray logs: $TEMP_DIR/session_latest/logs"

Expand Down Expand Up @@ -161,6 +166,9 @@ while true; do
elif ! check_http_health; then
WATCHDOG_FAILURE_REASON="http health check failed: $TWINKLE_HEALTH_URL"
WATCHDOG_GRACE_SECONDS="$TWINKLE_HEALTH_GRACE_SECONDS"
elif ! check_http_health "$TWINKLE_DEEP_HEALTH_URL"; then
WATCHDOG_FAILURE_REASON="deep health check failed (model actors may be dead): $TWINKLE_DEEP_HEALTH_URL"
WATCHDOG_GRACE_SECONDS="$TWINKLE_DEEP_HEALTH_GRACE_SECONDS"
fi

if [ -z "$WATCHDOG_FAILURE_REASON" ]; then
Expand Down
16 changes: 11 additions & 5 deletions cookbook/client/server/megatron/server_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ applications:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
TWINKLE_LONG_POLL_TIMEOUT: "120"
TWINKLE_FAIL_FAST: "0"

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
Expand All @@ -66,7 +67,7 @@ applications:
nproc_per_node: 4 # Number of GPU processes per node
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
engine_args: # vLLM engine-specific settings
max_model_len: 65536 # Maximum sequence length the engine supports
max_model_len: 32768 # Maximum sequence length the engine supports
gpu_memory_utilization: 0.75 # 75% utilization (~60GB/GPU assuming 80GB cards), leaves buffer for safety
enable_lora: true # Allow loading LoRA adapters during inference
max_loras: 5 # Max allowed loras working on vLLM at the same time
Expand All @@ -84,7 +85,7 @@ applications:
queue_config:
rps_limit: 20 # Max requests per second
tps_limit: 131072 # Max tokens per second
max_input_tokens: 65536
max_input_tokens: 32768
deployments:
- name: SamplerManagement
autoscaling_config:
Expand All @@ -97,6 +98,7 @@ applications:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
TWINKLE_LONG_POLL_TIMEOUT: "120"
TWINKLE_FAIL_FAST: "0"

# 2. Model Service - Hosts the base model for training.
# Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training
Expand All @@ -105,8 +107,8 @@ applications:
import_path: model
args:
backend: megatron # Use Megatron-LM backend
model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
max_length: 65536 # model max length
model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
max_length: 32768 # model max length
max_loras: 3 # model max loras
nproc_per_node: 4 # Number of GPU processes per node
device_group:
Expand All @@ -121,7 +123,7 @@ applications:
queue_config:
rps_limit: 20 # Max requests per second
tps_limit: 131072 # Max tokens per second
max_input_tokens: 65536
max_input_tokens: 32768
adapter_config:
adapter_timeout: 120 # Seconds before idle adapter unload
adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
Expand All @@ -137,6 +139,7 @@ applications:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
TWINKLE_LONG_POLL_TIMEOUT: "120"
TWINKLE_FAIL_FAST: "0"

# 4. Processor Service
- name: processor
Expand All @@ -159,3 +162,6 @@ applications:
target_ongoing_requests: 128
ray_actor_options:
num_cpus: 0.1
runtime_env:
env_vars:
TWINKLE_FAIL_FAST: "0"
8 changes: 8 additions & 0 deletions cookbook/client/server/megatron/server_config_4b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ applications:
target_ongoing_requests: 128 # Target concurrent requests per replica
ray_actor_options:
num_cpus: 0.1 # CPU resources allocated to this actor
runtime_env:
env_vars:
TWINKLE_FAIL_FAST: "0"

# 2. Model Service (commented out) - Would host the base model for training.
# Uncomment and configure if you need a training model worker.
Expand Down Expand Up @@ -68,6 +71,7 @@ applications:
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
TWINKLE_FAIL_FAST: "0"

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
Expand Down Expand Up @@ -105,6 +109,7 @@ applications:
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
TWINKLE_FAIL_FAST: "0"

# 4. Processor Service
- name: processor
Expand All @@ -127,3 +132,6 @@ applications:
target_ongoing_requests: 128
ray_actor_options:
num_cpus: 0.1
runtime_env:
env_vars:
TWINKLE_FAIL_FAST: "0"
78 changes: 43 additions & 35 deletions cookbook/client/server/transformer/server_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ http_options:

# Telemetry: push traces/metrics/logs to LGTM's OTel Collector via OTLP
telemetry:
enabled: true
enabled: false
otlp_endpoint: http://localhost:4317

# Persistence configuration for ServerState (sessions, models, futures, ...).
Expand Down Expand Up @@ -49,6 +49,9 @@ applications:
target_ongoing_requests: 128 # Target concurrent requests per replica
ray_actor_options:
num_cpus: 0.1 # CPU resources allocated to this actor
runtime_env:
env_vars:
TWINKLE_FAIL_FAST: "0"

# 2. Model Service - Hosts the base model for training.
- name: models-Qwen3.5-4B
Expand Down Expand Up @@ -81,43 +84,45 @@ applications:
num_cpus: 0.1
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "0"
TWINKLE_TRUST_REMOTE_CODE: "1"
TWINKLE_FAIL_FAST: "0"
Comment thread
Yunnglin marked this conversation as resolved.

# 3. Sampler Service - Runs inference / sampling using vLLM engine
# Used for generating text from the model (e.g., evaluating LoRA results).
# - name: sampler-Qwen3.5-4B
# route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
# import_path: sampler
# args:
# model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
# nproc_per_node: 2 # Number of GPU processes per node
# sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
# engine_args: # vLLM engine-specific settings
# max_model_len: 4096 # Maximum sequence length the engine supports
# gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
# enable_lora: true # Allow loading LoRA adapters during inference
# logprobs_mode: processed_logprobs # Logprobs mode for sampling results
# device_group: # Logical device group for the sampler
# name: sampler
# ranks: 1 # Number of GPUs to use
# device_type: cuda
# device_mesh:
# device_type: cuda
# dp_size: 1
# queue_config:
# rps_limit: 100 # Max requests per second
# tps_limit: 100000 # Max tokens per second
# deployments:
# - name: SamplerManagement
# autoscaling_config:
# min_replicas: 1
# max_replicas: 1
# target_ongoing_requests: 16
# ray_actor_options:
# num_cpus: 0.1
# runtime_env:
# env_vars:
# TWINKLE_TRUST_REMOTE_CODE: "0"
- name: sampler-Qwen3.5-4B
route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B
import_path: sampler
args:
model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier
nproc_per_node: 1 # Number of GPU processes per node
sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
engine_args: # vLLM engine-specific settings
max_model_len: 4096 # Maximum sequence length the engine supports
gpu_memory_utilization: 0.5 # Fraction of GPU memory to use (0.0-1.0)
enable_lora: true # Allow loading LoRA adapters during inference
logprobs_mode: processed_logprobs # Logprobs mode for sampling results
device_group: # Logical device group for the sampler
name: sampler
ranks: 1 # Number of GPUs to use
device_type: cuda
device_mesh:
device_type: cuda
dp_size: 1
queue_config:
rps_limit: 100 # Max requests per second
tps_limit: 100000 # Max tokens per second
deployments:
- name: SamplerManagement
autoscaling_config:
min_replicas: 1
max_replicas: 1
target_ongoing_requests: 16
ray_actor_options:
num_cpus: 0.1
runtime_env:
env_vars:
TWINKLE_TRUST_REMOTE_CODE: "1"
TWINKLE_FAIL_FAST: "0"

# 4. Processor Service
- name: processor
Expand All @@ -140,3 +145,6 @@ applications:
target_ongoing_requests: 128
ray_actor_options:
num_cpus: 0.1
runtime_env:
env_vars:
TWINKLE_FAIL_FAST: "0"
11 changes: 0 additions & 11 deletions cookbook/client/server/transformer/server_e2e.py

This file was deleted.

2 changes: 1 addition & 1 deletion cookbook/client/tinker/modelscope/dpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
max_length = 2048
lora_rank = 8
system_prompt = 'You are a helpful assistant.'
use_swanlab = True
use_swanlab = False


# ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion cookbook/client/tinker/modelscope/short_math_grpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def main():
if step % SYNC_INTERVAL == 0:
logger.info(f'Step {step}: Saving weights for sampler...')

sampling_client = (training_client.save_weights_and_get_sampling_client(name=f'GSM8K-step-{step}'))
sampling_client = training_client.save_weights_and_get_sampling_client()
logger.info(f'Step {step}: Sampling client ready')

if sampling_client is None:
Expand Down
2 changes: 1 addition & 1 deletion cookbook/client/tinker/self_host/short_math_grpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def main():
if step % SYNC_INTERVAL == 0:
logger.info(f'Step {step}: Saving weights for sampler...')

sampling_client = (training_client.save_weights_and_get_sampling_client(name=f'GSM8K-step-{step}'))
sampling_client = training_client.save_weights_and_get_sampling_client()
logger.info(f'Step {step}: Sampling client ready')

if sampling_client is None:
Expand Down
Loading
Loading