GenerativeAIExamples/industries/predictive_maintenance_agent/deploy_llama_nemotron.sh at fadcd300df24e7b781b47888956d54462c9b3741 · NVIDIA/GenerativeAIExamples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/bin/bash
#
# Deploy Llama 3.3 Nemotron Super 49B V1 with FP8 precision on 2 GPUs
# Port: 9000
#
# Environment:
#   NGC_API_KEY      (required) NGC API key. Exported below so that
#                    `docker run -e NGC_API_KEY` can forward it by name
#                    without the secret appearing on the command line.
#   LOCAL_NIM_CACHE  (optional) host directory mounted as the container's
#                    cache; defaults to $HOME/.cache/nim/llama-nemotron.

# Stop on command failure, on use of an unset variable, and on a failed
# pipeline stage, instead of silently carrying on.
set -euo pipefail

echo "Deploying Llama 3.3 Nemotron Super 49B V1..."
echo "Configuration: FP8 precision, 2 GPUs (0,1), Port 9000"

# Set environment variables
# An unset or empty NGC_API_KEY falls back to the placeholder, which the
# check below treats as "not configured".
export NGC_API_KEY="${NGC_API_KEY:-<PASTE_API_KEY_HERE>}"
export LOCAL_NIM_CACHE="${LOCAL_NIM_CACHE:-$HOME/.cache/nim/llama-nemotron}"

# Check if NGC_API_KEY is set
# Validate before touching the filesystem so an aborted run leaves nothing
# behind; diagnostics go to stderr.
if [[ "$NGC_API_KEY" == "<PASTE_API_KEY_HERE>" ]]; then
    echo "ERROR: Please set your NGC_API_KEY environment variable" >&2
    echo "Run: export NGC_API_KEY=your_actual_api_key" >&2
    exit 1
fi

# Create cache directory
if ! mkdir -p -- "$LOCAL_NIM_CACHE"; then
    echo "ERROR: Could not create cache directory: $LOCAL_NIM_CACHE" >&2
    exit 1
fi

echo "Using cache directory: $LOCAL_NIM_CACHE"
echo "Starting deployment on port 9000..."

# Deploy with FP8 precision on 2 GPUs (0,1)
#
# GPU selection has to go through --gpus: a CUDA_VISIBLE_DEVICES assignment in
# front of `docker run` only affects the docker client process and is not
# passed into the container, so combined with `--gpus all` it would expose
# every GPU on the host. The inner double quotes are part of the value because
# Docker parses it as CSV and the device list itself contains a comma.
docker_args=(
    --rm
    --gpus '"device=0,1"'
    --shm-size=16GB
    -e NGC_API_KEY                          # forwarded by name from this shell's environment
    -v "$LOCAL_NIM_CACHE:/opt/nim/.cache"
    -u "$(id -u)"                           # run as the invoking user's UID
    -p 9000:8000                            # host port 9000 -> container port 8000
    -e NIM_TENSOR_PARALLEL_SIZE=2
    -e NIM_PRECISION=fp8
)

# Only request an interactive TTY when one is attached; `docker run -t` fails
# outright when stdin is not a terminal (cron, CI, nohup, ...).
if [ -t 0 ] && [ -t 1 ]; then
    docker_args+=(-it)
fi

# `docker run` stays in the foreground until the container exits. Capture its
# status explicitly so a failed launch is reported and propagated rather than
# being followed by a success message.
rc=0
docker run "${docker_args[@]}" \
    nvcr.io/nim/nvidia/llama-3.3-nemotron-super-49b-v1:latest || rc=$?

if [ "$rc" -ne 0 ]; then
    echo "ERROR: NIM container exited with status $rc" >&2
    exit "$rc"
fi

echo "Llama 3.3 Nemotron deployment completed."