Skip to content

Commit a1e0c6e

Browse files
committed
feat: Add dedicated Megatron lora mode
1 parent: dec6b3a · commit: a1e0c6e

File tree

3 files changed

+452
-27
lines changed

3 files changed

+452
-27
lines changed

src/art/megatron/backend.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
import os
2+
13
from mp_actors import move_to_child_process
24

35
from ..local.backend import LocalBackend
@@ -19,6 +21,7 @@ def __init__(
1921

2022
async def _get_service(self, model: TrainableModel) -> ModelService:
2123
from ..dev.get_model_config import get_model_config
24+
from ..dev.validate import is_dedicated_mode, validate_dedicated_config
2225
from .service import MegatronService
2326

2427
if model.name not in self._services:
@@ -27,13 +30,19 @@ async def _get_service(self, model: TrainableModel) -> ModelService:
2730
output_dir=get_model_dir(model=model, art_path=self._path),
2831
config=model._internal_config,
2932
)
33+
validate_dedicated_config(config)
34+
dedicated = is_dedicated_mode(config)
35+
if dedicated:
36+
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
37+
str(gpu_id) for gpu_id in config["trainer_gpu_ids"]
38+
)
3039
self._services[model.name] = MegatronService(
3140
model_name=model.name,
3241
base_model=model.base_model,
3342
config=config,
3443
output_dir=get_model_dir(model=model, art_path=self._path),
3544
)
36-
if not self._in_process:
45+
if not dedicated and not self._in_process:
3746
self._services[model.name] = move_to_child_process(
3847
self._services[model.name],
3948
process_name="megatron-service",

0 commit comments

Comments
 (0)