1+ import os
2+
13from mp_actors import move_to_child_process
24
35from ..local .backend import LocalBackend
@@ -19,6 +21,7 @@ def __init__(
1921
2022 async def _get_service (self , model : TrainableModel ) -> ModelService :
2123 from ..dev .get_model_config import get_model_config
24+ from ..dev .validate import is_dedicated_mode , validate_dedicated_config
2225 from .service import MegatronService
2326
2427 if model .name not in self ._services :
@@ -27,13 +30,19 @@ async def _get_service(self, model: TrainableModel) -> ModelService:
2730 output_dir = get_model_dir (model = model , art_path = self ._path ),
2831 config = model ._internal_config ,
2932 )
33+ validate_dedicated_config (config )
34+ dedicated = is_dedicated_mode (config )
35+ if dedicated :
36+ os .environ ["CUDA_VISIBLE_DEVICES" ] = "," .join (
37+ str (gpu_id ) for gpu_id in config ["trainer_gpu_ids" ]
38+ )
3039 self ._services [model .name ] = MegatronService (
3140 model_name = model .name ,
3241 base_model = model .base_model ,
3342 config = config ,
3443 output_dir = get_model_dir (model = model , art_path = self ._path ),
3544 )
36- if not self ._in_process :
45+ if not dedicated and not self ._in_process :
3746 self ._services [model .name ] = move_to_child_process (
3847 self ._services [model .name ],
3948 process_name = "megatron-service" ,
0 commit comments