Skip to content

Commit 3e50ca0

Browse files
authored
feat: Upgrade to transformers v5 (#629)
1 parent 621e82b commit 3e50ca0

10 files changed

Lines changed: 747 additions & 117 deletions
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
"""Launch a multi-step Qwen3.5 LocalBackend yes-no-maybe run on SkyPilot."""
2+
3+
import argparse
4+
import os
5+
import textwrap
6+
7+
from dotenv import load_dotenv
8+
import sky
9+
from sky import ClusterStatus
10+
11+
load_dotenv()
12+
13+
DEFAULT_IMAGE_ID = "docker:nvidia/cuda:12.8.1-devel-ubuntu22.04"
14+
15+
16+
def _format_env_bool(value: bool) -> str:
17+
return "true" if value else "false"
18+
19+
20+
def _format_int_list(values: list[int]) -> str:
21+
return ",".join(str(value) for value in values)
22+
23+
24+
# ---------------------------------------------------------------------------
# Command-line interface for the launcher.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(
    description="Launch a Qwen3.5 LocalBackend yes-no-maybe convergence run."
)
parser.add_argument("--fast", action="store_true")
parser.add_argument("--base-model", type=str, default="Qwen/Qwen3.5-4B")
parser.add_argument("--accelerator", type=str, default="H200:1")
parser.add_argument(
    "--cluster-name", type=str, default="art-qwen35-localbackend-yes-no-maybe"
)
parser.add_argument("--image-id", type=str, default=DEFAULT_IMAGE_ID)
parser.add_argument("--project", type=str, default="qwen35-localbackend-yes-no-maybe")
parser.add_argument("--gpu-memory-utilization", type=float, default=0.35)
parser.add_argument("--max-model-len", type=int, default=1024)
parser.add_argument("--max-seq-length", type=int, default=1024)
parser.add_argument("--max-num-seqs", type=int, default=8)
parser.add_argument("--num-steps", type=int, default=10)
parser.add_argument("--rollouts-per-prompt", type=int, default=8)
parser.add_argument("--eval-prompts", type=int, default=24)
parser.add_argument("--eval-every-n-steps", type=int, default=1)
parser.add_argument("--max-tokens", type=int, default=5)
parser.add_argument("--learning-rate", type=float, default=5e-5)
parser.add_argument(
    "--load-in-4bit", action=argparse.BooleanOptionalAction, default=False
)
parser.add_argument(
    "--load-in-16bit", action=argparse.BooleanOptionalAction, default=True
)
parser.add_argument(
    "--enable-thinking", action=argparse.BooleanOptionalAction, default=False
)
# Optional explicit GPU placement; both lists must be provided together
# (validated below).
parser.add_argument("--trainer-gpu-ids", type=int, nargs="+")
parser.add_argument("--inference-gpu-ids", type=int, nargs="+")
args = parser.parse_args()

# Validate via parser.error() instead of `assert`: asserts are stripped under
# `python -O`, while parser.error() prints usage and exits with status 2.
if (args.trainer_gpu_ids is None) != (args.inference_gpu_ids is None):
    parser.error(
        "--trainer-gpu-ids and --inference-gpu-ids must both be set or both unset"
    )
61+
62+
# Optionally namespace the cluster name with CLUSTER_PREFIX from the environment.
prefix = os.environ.get("CLUSTER_PREFIX")
if prefix:
    cluster_name = f"{prefix}-{args.cluster_name}"
else:
    cluster_name = args.cluster_name
66+
67+
setup_script = textwrap.dedent("""\
68+
echo 'Setting up environment...'
69+
apt-get update
70+
apt-get install -y python3 python3-pip python-is-python3 git curl ninja-build
71+
curl -LsSf https://astral.sh/uv/install.sh | sh
72+
source $HOME/.local/bin/env
73+
""")
74+
75+
# KEY=VALUE pairs forwarded to the remote training script; rendered below
# into backslash-continued shell variable prefixes for the run command.
env_entries = [
    f"PROJECT={args.project}",
    "MODEL_NAME=qwen35-localbackend-ynm-$(date +%Y%m%d-%H%M%S)",
    f"BASE_MODEL={args.base_model}",
    f"GPU_MEMORY_UTILIZATION={args.gpu_memory_utilization}",
    f"MAX_MODEL_LEN={args.max_model_len}",
    f"MAX_SEQ_LENGTH={args.max_seq_length}",
    f"MAX_NUM_SEQS={args.max_num_seqs}",
    "ENFORCE_EAGER=true",
    f"LOAD_IN_4BIT={_format_env_bool(args.load_in_4bit)}",
    f"LOAD_IN_16BIT={_format_env_bool(args.load_in_16bit)}",
    f"ENABLE_THINKING={_format_env_bool(args.enable_thinking)}",
    f"NUM_STEPS={args.num_steps}",
    f"ROLLOUTS_PER_PROMPT={args.rollouts_per_prompt}",
    f"EVAL_PROMPTS={args.eval_prompts}",
    f"EVAL_EVERY_N_STEPS={args.eval_every_n_steps}",
    f"MAX_TOKENS={args.max_tokens}",
    f"LEARNING_RATE={args.learning_rate}",
]
# The two GPU-id lists are only ever set together (enforced at parse time),
# so checking one of them is sufficient.
if args.trainer_gpu_ids is not None:
    env_entries += [
        f"TRAINER_GPU_IDS={_format_int_list(args.trainer_gpu_ids)}",
        f"INFERENCE_GPU_IDS={_format_int_list(args.inference_gpu_ids)}",
    ]
env_block = " \\\n    ".join(env_entries)
102+
103+
# Per-job run command: activate uv, sync the backend extra, then execute the
# training script with the env_block KEY=VALUE prefixes applied to the
# invocation (the trailing backslash continues onto the uv run line).
run_script = textwrap.dedent(
    f"""\
source $HOME/.local/bin/env
cd ~/sky_workdir
~/.local/bin/uv sync --extra backend
{env_block} \
~/.local/bin/uv run dev/yes-no-maybe-metrics.py
"""
)
112+
113+
# Assemble the SkyPilot task: setup + run scripts with the local workdir
# synced to the node, pinned to a Kubernetes-backed accelerator.
resources = sky.Resources(
    accelerators=args.accelerator,
    cloud=sky.clouds.Kubernetes(),
    image_id=args.image_id,
)
task = sky.Task(
    name="qwen3.5-localbackend-yes-no-maybe",
    setup=setup_script,
    run=run_script,
    workdir=".",
)
task.set_resources(resources)
# Mount the local .env onto the node (presumably read by the remote script
# via load_dotenv — confirm against dev/yes-no-maybe-metrics.py).
task.set_file_mounts({"~/sky_workdir/.env": ".env"})
127+
128+
print(f"Launching on cluster: {cluster_name}")
129+
print(f" base_model: {args.base_model}")
130+
print(f" project: {args.project}")
131+
print(f" accelerator: {args.accelerator}")
132+
print(f" image_id: {args.image_id}")
133+
print(f" gpu_memory_utilization: {args.gpu_memory_utilization}")
134+
print(f" max_model_len: {args.max_model_len}")
135+
print(f" max_seq_length: {args.max_seq_length}")
136+
print(f" max_num_seqs: {args.max_num_seqs}")
137+
print(f" num_steps: {args.num_steps}")
138+
print(f" rollouts_per_prompt: {args.rollouts_per_prompt}")
139+
print(f" eval_prompts: {args.eval_prompts}")
140+
print(f" eval_every_n_steps: {args.eval_every_n_steps}")
141+
print(f" max_tokens: {args.max_tokens}")
142+
print(f" learning_rate: {args.learning_rate}")
143+
print(f" load_in_4bit: {args.load_in_4bit}")
144+
print(f" load_in_16bit: {args.load_in_16bit}")
145+
print(f" enable_thinking: {args.enable_thinking}")
146+
print(f" trainer_gpu_ids: {args.trainer_gpu_ids}")
147+
print(f" inference_gpu_ids: {args.inference_gpu_ids}")
148+
149+
# If the target cluster is already UP, cancel any active jobs on it first so
# the new launch does not queue behind a stale run.
cluster_status = sky.stream_and_get(sky.status(cluster_names=[cluster_name]))
if cluster_status and cluster_status[0]["status"] == ClusterStatus.UP:
    print(f"Cluster {cluster_name} is UP. Canceling any active jobs...")
    sky.stream_and_get(sky.cancel(cluster_name, all=True))

# Launch the task (retrying provisioning until the cluster comes up); the
# cluster autostops after 60 idle minutes and is torn down (down=True).
# --fast reuses an existing cluster/setup when possible.
job_id, _ = sky.stream_and_get(
    sky.launch(
        task,
        cluster_name=cluster_name,
        retry_until_up=True,
        idle_minutes_to_autostop=60,
        down=True,
        fast=args.fast,
    )
)

# Follow the job's logs to completion and report its exit code.
print(f"Job submitted (ID: {job_id}). Streaming logs...")
exit_code = sky.tail_logs(cluster_name=cluster_name, job_id=job_id, follow=True)
print(f"Job {job_id} finished with exit code {exit_code}.")

0 commit comments

Comments
 (0)