diff --git a/base/comps/kmod-nvidia-open/kmod-nvidia-open-modprobe.conf b/base/comps/kmod-nvidia-open/kmod-nvidia-open-modprobe.conf new file mode 100644 index 00000000000..43be4bf4efa --- /dev/null +++ b/base/comps/kmod-nvidia-open/kmod-nvidia-open-modprobe.conf @@ -0,0 +1,8 @@ +# Block conflicting kernel modules +blacklist nouveau +blacklist nvidiafb +blacklist rivafb + +# Load NVIDIA modules with recommended options +options nvidia NVreg_OpenRmEnableUnsupportedGpus=1 +options nvidia-drm modeset=1 diff --git a/base/comps/kmod-nvidia-open/kmod-nvidia-open.comp.toml b/base/comps/kmod-nvidia-open/kmod-nvidia-open.comp.toml new file mode 100644 index 00000000000..d7c689e013d --- /dev/null +++ b/base/comps/kmod-nvidia-open/kmod-nvidia-open.comp.toml @@ -0,0 +1,16 @@ +[component-templates.kmod-nvidia-open] +description = "Out-of-tree driver built against multiple kernel versions and toolchains" + +[component-templates.kmod-nvidia-open.default-component-config] +spec = { type = "local", path = "kmod-nvidia-open.spec" } + +[[component-templates.kmod-nvidia-open.default-component-config.source-files]] +filename = "open-gpu-kernel-modules-595.58.03.tar.gz" +hash-type = "SHA256" +hash = "e0c4659ddf15e4f4e19cee05b49f88c9ba08ef3add0dfe08249798f58d0fe75e" +origin = { type = "download", uri = "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/595.58.03.tar.gz" } + +[[component-templates.kmod-nvidia-open.matrix]] +axis = "kernel" + +[component-templates.kmod-nvidia-open.matrix.values.6-18] diff --git a/base/comps/kmod-nvidia-open/kmod-nvidia-open.spec b/base/comps/kmod-nvidia-open/kmod-nvidia-open.spec new file mode 100644 index 00000000000..e9c71278467 --- /dev/null +++ b/base/comps/kmod-nvidia-open/kmod-nvidia-open.spec @@ -0,0 +1,134 @@ +# Azure Linux NVIDIA open GPU kernel modules +# Built from: https://github.com/NVIDIA/open-gpu-kernel-modules + +# Kernel modules are built via kbuild outside the normal RPM build tree, +# so automatic debuginfo extraction 
fails with empty debugsourcefiles.list. +%global debug_package %{nil} + +# Auto-detect the kernel version from the installed kernel-devel package. +# The kernel-devel package installs headers to /usr/src/kernels/. +# This is resolved at RPM build time (inside mock), after BuildRequires +# are installed — so the directory will exist. +# +# To override: rpmbuild --define 'kernel_uname_r 6.18.5-1.4.azl4.x86_64' +%{!?kernel_uname_r: %global kernel_uname_r %(ls -1 /usr/src/kernels/ 2>/dev/null | sort -V | tail -1)} + +%global kmod_install_dir /lib/modules/%{kernel_uname_r}/extra/nvidia + +Name: kmod-nvidia-open +Version: 595.58.03 +Release: 3_%{kernel_uname_r}%{?dist} +Summary: NVIDIA open GPU kernel modules for CUDA workloads +License: MIT AND GPLv2 +URL: https://github.com/NVIDIA/open-gpu-kernel-modules +Vendor: Microsoft Corporation +Distribution: Azure Linux +ExclusiveArch: x86_64 aarch64 + +Source0: https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/%{version}.tar.gz#/open-gpu-kernel-modules-%{version}.tar.gz +Source1: kmod-nvidia-open-modprobe.conf + +BuildRequires: kernel-devel +BuildRequires: gcc +BuildRequires: gcc-c++ +BuildRequires: make +BuildRequires: elfutils-libelf-devel +BuildRequires: binutils + +Requires: kernel-uname-r = %{kernel_uname_r} +Requires(post): kmod +Requires(postun): kmod + +Provides: nvidia-open-kmod = %{version}-%{release} +Provides: kmod-nvidia-open = %{version}-%{release} +Provides: kmod-nvidia-open-%{kernel_uname_r} = %{version}-%{release} + +# Prevent conflicting NVIDIA driver packages from being installed +Conflicts: nvidia-driver-cuda +Conflicts: kmod-nvidia-open-dkms + +%description +Open-source NVIDIA GPU kernel modules built from the official +NVIDIA/open-gpu-kernel-modules repository for kernel %{kernel_uname_r}. + +These modules support CUDA workloads on NVIDIA GPUs with compute +capability 5.0 and later. The modules are built using the open-source +kernel module variant (nvidia-open). 
+ +Modules included: + - nvidia.ko (core driver) + - nvidia-modeset.ko (modesetting support) + - nvidia-drm.ko (DRM/KMS support) + - nvidia-uvm.ko (unified virtual memory) + - nvidia-peermem.ko (GPU peer memory for RDMA) + +%prep +%autosetup -n open-gpu-kernel-modules-%{version} + +%build +# Unset LDFLAGS — NVIDIA's kbuild invokes ld directly (not via gcc), +# so it doesn't understand the -Wl, prefix in RPM's default hardening flags. +unset LDFLAGS + +# Build the open kernel modules +# KERNEL_UNAME must match the target kernel exactly +make %{?_smp_mflags} modules -j$(nproc) \ + KERNEL_UNAME="%{kernel_uname_r}" \ + SYSSRC="/usr/src/kernels/%{kernel_uname_r}" \ + SYSOUT="/usr/src/kernels/%{kernel_uname_r}" \ + IGNORE_CC_MISMATCH=1 \ + IGNORE_XEN_PRESENCE=1 \ + IGNORE_PREEMPT_RT_PRESENCE=1 \ + NV_EXCLUDE_BUILD_MODULES="" \ + INSTALL_MOD_DIR="extra/nvidia" + +%install +install -d %{buildroot}%{kmod_install_dir} + +# Install the built kernel modules +for mod in nvidia nvidia-modeset nvidia-drm nvidia-uvm nvidia-peermem; do + ko="kernel-open/${mod}.ko" + if [ -f "${ko}" ]; then + install -m 0644 "${ko}" %{buildroot}%{kmod_install_dir}/ + fi +done + +# Install modprobe configuration to blacklist conflicting modules +install -D -m 0644 %{SOURCE1} %{buildroot}%{_sysconfdir}/modprobe.d/kmod-nvidia-open.conf + +# Generate modules.dep metadata at build time (weak-modules support) +install -d %{buildroot}%{_sysconfdir}/depmod.d +cat > %{buildroot}%{_sysconfdir}/depmod.d/kmod-nvidia-open.conf << 'EOF' +# Ensure NVIDIA modules in extra/ override any in-tree modules +override nvidia %{kernel_uname_r} extra/nvidia +override nvidia-modeset %{kernel_uname_r} extra/nvidia +override nvidia-drm %{kernel_uname_r} extra/nvidia +override nvidia-uvm %{kernel_uname_r} extra/nvidia +override nvidia-peermem %{kernel_uname_r} extra/nvidia +EOF + +%post +/usr/sbin/depmod -a %{kernel_uname_r} || : + +%postun +/usr/sbin/depmod -a %{kernel_uname_r} || : + +%files +%license COPYING 
+%{kmod_install_dir}/nvidia.ko +%{kmod_install_dir}/nvidia-modeset.ko +%{kmod_install_dir}/nvidia-drm.ko +%{kmod_install_dir}/nvidia-uvm.ko +%{kmod_install_dir}/nvidia-peermem.ko +%config(noreplace) %{_sysconfdir}/modprobe.d/kmod-nvidia-open.conf +%{_sysconfdir}/depmod.d/kmod-nvidia-open.conf + +%changelog +* Thu Apr 10 2026 Elaheh Dehghani - 595.58.03-3 +- Auto-detect kernel version from installed kernel-devel package +- Remove hardcoded kernel_version macro dependency + +* Thu Apr 09 2026 Elaheh Dehghani - 595.58.03-1 +- Initial Azure Linux 4.0 package +- Built from NVIDIA/open-gpu-kernel-modules upstream source +- Open-source kernel modules for CUDA workloads diff --git a/base/comps/nvidia-cuda-driver/nvidia-cuda-driver.comp.toml b/base/comps/nvidia-cuda-driver/nvidia-cuda-driver.comp.toml new file mode 100644 index 00000000000..46fef8e3f24 --- /dev/null +++ b/base/comps/nvidia-cuda-driver/nvidia-cuda-driver.comp.toml @@ -0,0 +1,15 @@ +[components.nvidia-cuda-driver] +# Headless compute-only: no graphics/display components.
+spec = { type = "local", path = "nvidia-cuda-driver.spec" } + +[[components.nvidia-cuda-driver.source-files]] +filename = "NVIDIA-Linux-x86_64-595.58.03-no-compat32.run" +hash-type = "SHA256" +hash = "b66c59b9bebc191b9c21bad7476da1486dd390198223a548892110bca5fb91f9" +origin = { type = "download", uri = "https://us.download.nvidia.com/XFree86/Linux-x86_64/595.58.03/NVIDIA-Linux-x86_64-595.58.03-no-compat32.run" } + +[[components.nvidia-cuda-driver.source-files]] +filename = "NVIDIA-Linux-aarch64-595.58.03.run" +hash-type = "SHA256" +hash = "873cc8298d537bc424081591f87e64d4507f1cad5480685a8ba725df011a3d3f" +origin = { type = "download", uri = "https://us.download.nvidia.com/XFree86/Linux-aarch64/595.58.03/NVIDIA-Linux-aarch64-595.58.03.run" } diff --git a/base/comps/nvidia-cuda-driver/nvidia-cuda-driver.spec b/base/comps/nvidia-cuda-driver/nvidia-cuda-driver.spec new file mode 100644 index 00000000000..afe392381b1 --- /dev/null +++ b/base/comps/nvidia-cuda-driver/nvidia-cuda-driver.spec @@ -0,0 +1,311 @@ +%global debug_package %{nil} +# Prebuilt proprietary binaries — do not strip or modify +%global __strip /bin/true +%global __brp_ldconfig %{nil} + +%global nvidia_driver_version 595.58.03 +%global nvidia_libdir %{_libdir} +%global nvidia_bindir %{_bindir} +%global nvidia_fwdir /lib/firmware/nvidia/%{nvidia_driver_version} +%global nvidia_datadir %{_datadir}/nvidia + +Name: nvidia-cuda-driver +Version: %{nvidia_driver_version} +Release: 1%{?dist} +Summary: NVIDIA user-space GPU driver components +License: NVIDIA Proprietary +URL: https://www.nvidia.com/en-us/drivers/ +Vendor: Microsoft Corporation +Distribution: Azure Linux +ExclusiveArch: x86_64 aarch64 + +# Architecture-specific .run installer +# x86_64: use no-compat32 variant (64-bit only, smaller download) +# aarch64: single variant (no 32-bit compat layer on ARM) +%ifarch x86_64 +Source0: https://us.download.nvidia.com/XFree86/Linux-x86_64/%{version}/NVIDIA-Linux-x86_64-%{version}-no-compat32.run +%endif 
+%ifarch aarch64 +Source0: https://us.download.nvidia.com/XFree86/Linux-aarch64/%{version}/NVIDIA-Linux-aarch64-%{version}.run +%endif + +# nvidia-persistenced systemd unit +Source1: nvidia-persistenced.service + +# Persistent software state directory for the NVIDIA driver +Source2: tmpfiles-nvidia.conf + +# Configuration file to register the NVIDIA OpenCL ICD with the ICD loader +Source3: nvidia.icd + +Requires: (kmod-nvidia-open = %{version} or kmod-nvidia-closed = %{version}) +Requires: nvidia-cuda-driver-libs%{?_isa} = %{version}-%{release} +Requires: nvidia-cuda-driver-firmware = %{version}-%{release} + +%description +User-space NVIDIA GPU driver components for headless CUDA workloads on +Azure Linux. + +This package is the companion to kmod-nvidia-open or kmod-nvidia-closed (which +provide the NVIDIA kernel modules). It installs the proprietary user-space libraries, +management tools, and firmware needed to run CUDA applications on NVIDIA +GPUs (Turing and later). + +This is a compute-only package — no graphics, X11, Vulkan, or display +components are included. Azure Linux is a headless distro. + +# --------------------------------------------------------------------------- +# Sub-packages +# --------------------------------------------------------------------------- + +%package libs +Summary: NVIDIA CUDA driver shared libraries (compute-only) +Requires: libelf + +%description libs +Core NVIDIA shared libraries for headless GPU compute, including the CUDA +runtime, NVML management library, PTX JIT compiler, NVVM compiler, OpenCL +implementation, and hardware video encode/decode (NVENC/NVCUVID). + +No OpenGL, EGL, Vulkan, or display libraries are included. + +%package firmware +Summary: NVIDIA GSP firmware +# Firmware is architecture-independent content but packaged per-arch +# because it is extracted from an arch-specific installer. + +%description firmware +GPU System Processor (GSP) firmware images for NVIDIA GPUs.
These are +required by the NVIDIA kernel modules to offload tasks to the GPU's +on-board processor. + +%package tools +Summary: NVIDIA GPU management and diagnostic tools +Requires: nvidia-cuda-driver-libs%{?_isa} = %{version}-%{release} + +%description tools +Command-line tools for managing and monitoring NVIDIA GPUs: + - nvidia-smi (system management interface) + - nvidia-persistenced (persistence daemon) + - nvidia-cuda-mps-control (multi-process service control) + - nvidia-cuda-mps-server (multi-process service server) + - nvidia-debugdump (GPU state dump for bug reports) + - nvidia-bug-report.sh (automated bug report generator) + - nvidia-modprobe (kernel module loader / device node creator) + +%package devel +Summary: NVIDIA CUDA driver development symlinks +Requires: nvidia-cuda-driver-libs%{?_isa} = %{version}-%{release} + +%description devel +Unversioned .so symlinks for linking against NVIDIA CUDA driver libraries +at build time (compute-only — no graphics libraries). + +# --------------------------------------------------------------------------- +# Prep — extract the self-extracting .run archive +# --------------------------------------------------------------------------- + +%prep +# The .run file is a shell-based self-extracting archive. +# --extract-only: extract without running the installer +# --target: directory name for the extracted contents +sh %{SOURCE0} --extract-only --target nvidia-installer +cd nvidia-installer + +# --------------------------------------------------------------------------- +# Build — nothing to build, all binaries are prebuilt +# --------------------------------------------------------------------------- + +%build +# Prebuilt proprietary binaries — nothing to compile. 
+ +# --------------------------------------------------------------------------- +# Install — compute-only components (no graphics/display) +# --------------------------------------------------------------------------- + +%install +cd nvidia-installer + +install -d %{buildroot}%{nvidia_libdir} +install -d %{buildroot}%{nvidia_bindir} +install -d %{buildroot}%{nvidia_fwdir} +install -d %{buildroot}%{nvidia_datadir} +install -d %{buildroot}%{_sysconfdir}/OpenCL/vendors +install -d %{buildroot}%{_unitdir} +install -d %{buildroot}%{_tmpfilesdir} + +# -- Core CUDA and compute libraries -- +for lib in \ + libcuda.so.%{version} \ + libnvidia-ml.so.%{version} \ + libnvidia-ptxjitcompiler.so.%{version} \ + libnvidia-nvvm.so.4.0.0 \ + libnvidia-opencl.so.%{version} \ + libnvidia-cfg.so.%{version} \ + libnvidia-tls.so.%{version} \ + libnvidia-gpucomp.so.%{version} \ + libnvidia-pkcs11.so.%{version} \ + libnvidia-pkcs11-openssl3.so.%{version} \ + libcudadebugger.so.%{version} \ +; do + if [ -f "$lib" ]; then + install -m 0755 "$lib" %{buildroot}%{nvidia_libdir}/ + fi +done + +# -- Video encode/decode libraries (headless compute pipelines) -- +for lib in \ + libnvcuvid.so.%{version} \ + libnvidia-encode.so.%{version} \ + libnvidia-opticalflow.so.%{version} \ +; do + if [ -f "$lib" ]; then + install -m 0755 "$lib" %{buildroot}%{nvidia_libdir}/ + fi +done + +# -- Versioned soname symlinks -- +ln -sf libcuda.so.%{version} %{buildroot}%{nvidia_libdir}/libcuda.so.1 +ln -sf libnvidia-ml.so.%{version} %{buildroot}%{nvidia_libdir}/libnvidia-ml.so.1 +ln -sf libnvidia-ptxjitcompiler.so.%{version} %{buildroot}%{nvidia_libdir}/libnvidia-ptxjitcompiler.so.1 +ln -sf libnvidia-nvvm.so.4.0.0 %{buildroot}%{nvidia_libdir}/libnvidia-nvvm.so.4 +ln -sf libnvidia-opencl.so.%{version} %{buildroot}%{nvidia_libdir}/libnvidia-opencl.so.1 +ln -sf libnvidia-cfg.so.%{version} %{buildroot}%{nvidia_libdir}/libnvidia-cfg.so.1 +ln -sf libcudadebugger.so.%{version} 
%{buildroot}%{nvidia_libdir}/libcudadebugger.so.1 +ln -sf libnvcuvid.so.%{version} %{buildroot}%{nvidia_libdir}/libnvcuvid.so.1 +ln -sf libnvidia-encode.so.%{version} %{buildroot}%{nvidia_libdir}/libnvidia-encode.so.1 +ln -sf libnvidia-opticalflow.so.%{version} %{buildroot}%{nvidia_libdir}/libnvidia-opticalflow.so.1 + +# -- Unversioned development symlinks -- +ln -sf libcuda.so.1 %{buildroot}%{nvidia_libdir}/libcuda.so +ln -sf libnvidia-ml.so.1 %{buildroot}%{nvidia_libdir}/libnvidia-ml.so +ln -sf libnvidia-ptxjitcompiler.so.1 %{buildroot}%{nvidia_libdir}/libnvidia-ptxjitcompiler.so +ln -sf libnvidia-nvvm.so.4 %{buildroot}%{nvidia_libdir}/libnvidia-nvvm.so +ln -sf libnvidia-opencl.so.1 %{buildroot}%{nvidia_libdir}/libnvidia-opencl.so +ln -sf libcudadebugger.so.1 %{buildroot}%{nvidia_libdir}/libcudadebugger.so +ln -sf libnvcuvid.so.1 %{buildroot}%{nvidia_libdir}/libnvcuvid.so +ln -sf libnvidia-encode.so.1 %{buildroot}%{nvidia_libdir}/libnvidia-encode.so +ln -sf libnvidia-opticalflow.so.1 %{buildroot}%{nvidia_libdir}/libnvidia-opticalflow.so + +# -- Tools / binaries (compute & management only) -- +for bin in \ + nvidia-smi \ + nvidia-persistenced \ + nvidia-cuda-mps-control \ + nvidia-cuda-mps-server \ + nvidia-debugdump \ + nvidia-bug-report.sh \ + nvidia-modprobe \ +; do + if [ -f "$bin" ]; then + install -m 0755 "$bin" %{buildroot}%{nvidia_bindir}/ + fi +done + +# -- GSP firmware -- +if [ -d "firmware" ]; then + install -d %{buildroot}%{nvidia_fwdir} + install -m 0644 firmware/gsp_*.bin %{buildroot}%{nvidia_fwdir}/ +fi + +# -- Data files (container runtime file list) -- +if [ -f sandboxutils-filelist.json ]; then + install -d %{buildroot}%{_datadir}/nvidia/files.d + install -m 0644 sandboxutils-filelist.json %{buildroot}%{_datadir}/nvidia/files.d/ +fi + +# -- OpenCL ICD registration -- +install -m 0644 %{SOURCE3} %{buildroot}%{_sysconfdir}/OpenCL/vendors/nvidia.icd + +# -- systemd service for nvidia-persistenced -- +install -m 0644 %{SOURCE1} 
%{buildroot}%{_unitdir}/nvidia-persistenced.service + +# -- tmpfiles.d for /var/run/nvidia-persistenced -- +install -m 0644 %{SOURCE2} %{buildroot}%{_tmpfilesdir}/nvidia.conf + +# --------------------------------------------------------------------------- +# Post-install / uninstall scriptlets +# --------------------------------------------------------------------------- + +%post libs -p /sbin/ldconfig +%postun libs -p /sbin/ldconfig + +%post tools +%systemd_post nvidia-persistenced.service + +%preun tools +%systemd_preun nvidia-persistenced.service + +%postun tools +%systemd_postun_with_restart nvidia-persistenced.service + +# --------------------------------------------------------------------------- +# Files +# --------------------------------------------------------------------------- + +%files +%license nvidia-installer/LICENSE +%doc nvidia-installer/README.txt +%{_sysconfdir}/OpenCL/vendors/nvidia.icd + +%files libs +# Core CUDA / compute libraries +%{nvidia_libdir}/libcuda.so.%{version} +%{nvidia_libdir}/libcuda.so.1 +%{nvidia_libdir}/libnvidia-ml.so.%{version} +%{nvidia_libdir}/libnvidia-ml.so.1 +%{nvidia_libdir}/libnvidia-ptxjitcompiler.so.%{version} +%{nvidia_libdir}/libnvidia-ptxjitcompiler.so.1 +%{nvidia_libdir}/libnvidia-nvvm.so.4.0.0 +%{nvidia_libdir}/libnvidia-nvvm.so.4 +%{nvidia_libdir}/libnvidia-opencl.so.%{version} +%{nvidia_libdir}/libnvidia-opencl.so.1 +%{nvidia_libdir}/libnvidia-cfg.so.%{version} +%{nvidia_libdir}/libnvidia-cfg.so.1 +%{nvidia_libdir}/libnvidia-tls.so.%{version} +%{nvidia_libdir}/libnvidia-gpucomp.so.%{version} +%{nvidia_libdir}/libnvidia-pkcs11.so.%{version} +%{nvidia_libdir}/libnvidia-pkcs11-openssl3.so.%{version} +%{nvidia_libdir}/libcudadebugger.so.%{version} +%{nvidia_libdir}/libcudadebugger.so.1 +# Video encode/decode (headless compute pipelines) +%{nvidia_libdir}/libnvcuvid.so.%{version} +%{nvidia_libdir}/libnvcuvid.so.1 +%{nvidia_libdir}/libnvidia-encode.so.%{version} +%{nvidia_libdir}/libnvidia-encode.so.1 
+%{nvidia_libdir}/libnvidia-opticalflow.so.%{version} +%{nvidia_libdir}/libnvidia-opticalflow.so.1 + +%files firmware +%dir %{nvidia_fwdir} +%{nvidia_fwdir}/gsp_*.bin + +%files tools +%{nvidia_bindir}/nvidia-smi +%{nvidia_bindir}/nvidia-persistenced +%{nvidia_bindir}/nvidia-cuda-mps-control +%{nvidia_bindir}/nvidia-cuda-mps-server +%{nvidia_bindir}/nvidia-debugdump +%{nvidia_bindir}/nvidia-bug-report.sh +%{nvidia_bindir}/nvidia-modprobe +%{_unitdir}/nvidia-persistenced.service +%{_tmpfilesdir}/nvidia.conf + +%files devel +%{nvidia_libdir}/libcuda.so +%{nvidia_libdir}/libnvidia-ml.so +%{nvidia_libdir}/libnvidia-ptxjitcompiler.so +%{nvidia_libdir}/libnvidia-nvvm.so +%{nvidia_libdir}/libnvidia-opencl.so +%{nvidia_libdir}/libcudadebugger.so +%{nvidia_libdir}/libnvcuvid.so +%{nvidia_libdir}/libnvidia-encode.so +%{nvidia_libdir}/libnvidia-opticalflow.so + +%changelog +* Thu Apr 10 2026 Elaheh Dehghani - 595.58.03-1 +- Initial Azure Linux 4.0 package (headless / compute-only) +- User-space NVIDIA GPU driver components for CUDA workloads +- Companion to kmod-nvidia-open (open-source kernel modules) +- Based on NVIDIA-Linux-x86_64-595.58.03 driver release +- No graphics/display components (X11, GLX, EGL, Vulkan, OptiX, NGX) diff --git a/base/comps/nvidia-cuda-driver/nvidia-persistenced.service b/base/comps/nvidia-cuda-driver/nvidia-persistenced.service new file mode 100644 index 00000000000..178fccc030b --- /dev/null +++ b/base/comps/nvidia-cuda-driver/nvidia-persistenced.service @@ -0,0 +1,14 @@ +[Unit] +Description=NVIDIA Persistence Daemon +# Keep the NVIDIA kernel module loaded and GPU state initialized +# even when no user-space clients are running. This reduces latency +# for the first CUDA application launch. 
+Wants=syslog.target + +[Service] +Type=forking +ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --persistence-mode +ExecStopPost=/bin/rm -rf /var/run/nvidia-persistenced + +[Install] +WantedBy=multi-user.target diff --git a/base/comps/nvidia-cuda-driver/nvidia.icd b/base/comps/nvidia-cuda-driver/nvidia.icd new file mode 100644 index 00000000000..9712af911b8 --- /dev/null +++ b/base/comps/nvidia-cuda-driver/nvidia.icd @@ -0,0 +1 @@ +libnvidia-opencl.so.1 diff --git a/base/comps/nvidia-cuda-driver/tmpfiles-nvidia.conf b/base/comps/nvidia-cuda-driver/tmpfiles-nvidia.conf new file mode 100644 index 00000000000..39c52702ca2 --- /dev/null +++ b/base/comps/nvidia-cuda-driver/tmpfiles-nvidia.conf @@ -0,0 +1,2 @@ +# Runtime directory for nvidia-persistenced +d /var/run/nvidia-persistenced 0755 nvidia-persistenced nvidia-persistenced - diff --git a/docs/nvidia-driver-packaging.md b/docs/nvidia-driver-packaging.md new file mode 100644 index 00000000000..d871c9b14a2 --- /dev/null +++ b/docs/nvidia-driver-packaging.md @@ -0,0 +1,296 @@ +# NVIDIA Driver Packaging Guidelines for Azure Linux + +This document describes the packaging conventions for NVIDIA GPU driver +components on Azure Linux, covering both kernel modules and user-space +libraries/tools. + +## Design Principles + +1. **Kernel vs. user-space separation** — Kernel modules and user-space + components are packaged independently. They share the same NVIDIA driver + version but have no build-time dependency on each other. +2. **Headless / compute-only** — Azure Linux is a headless distro. Only + compute, monitoring, and video encode/decode components are included. + No X11, GLX, EGL, Vulkan, OptiX, or display-server libraries. +3. **Multi-kernel support** — Kernel modules can be built against multiple + kernel versions using `component-templates` with a matrix axis. The + user-space package works with any matching kernel module build. +4. 
**Three kernel module variants** — Kernel modules are available through + three distinct sources, packaged as separate kmod variants: + - **`kmod-nvidia-open`** — Open-source modules from NVIDIA's + [open-gpu-kernel-modules](https://github.com/NVIDIA/open-gpu-kernel-modules) + repository (MIT+GPL). Default for CUDA compute on supported GPUs. + - **`kmod-nvidia-closed`** — Proprietary modules from the NVIDIA + datacenter `.run` installer. Same CUDA/datacenter driver branch as + `open`, but required for GPUs not supported by the open-source modules. + - **`kmod-nvidia-grid`** — Proprietary modules from the GRID/vGPU + `-grid-azure.run` installer (published at `download.microsoft.com`). + A separate driver branch with vGPU mediation support and embedded + Azure licensing. + + All three variants conflict — only one can be installed at a time. +5. **Two user-space packages** — Each driver branch has a matching + user-space package: + - **`nvidia-cuda-driver`** — User-space libraries and tools for the + CUDA/datacenter branch. Pairs with either `kmod-nvidia-open` or + `kmod-nvidia-closed` (same branch, different kernel module source). + Extracted from the NVIDIA datacenter `.run` installer. + - **`nvidia-grid-driver`** — User-space libraries and tools for the + GRID/vGPU branch. Pairs only with `kmod-nvidia-grid`. Extracted from + the same GRID `-grid-azure.run` file as the kmod, includes Azure + licensing and vGPU manager components. + + The two user-space packages conflict — a system runs either dedicated + GPU compute or GRID/vGPU, not both. + +## Package Naming Conventions + +### Kernel module packages: `kmod-nvidia-` + +NVIDIA publishes kernel modules through three distinct channels. 
The `` +suffix identifies which **driver branch and source** the modules come from: + +| Variant | Package name | Source | Description | +|---------|-------------|--------|-------------| +| `open` | `kmod-nvidia-open` | [open-gpu-kernel-modules](https://github.com/NVIDIA/open-gpu-kernel-modules) (GitHub) | Open-source kernel modules (MIT+GPL). Default for CUDA compute on supported GPUs. | +| `closed` | `kmod-nvidia-closed` | NVIDIA datacenter `.run` installer | Proprietary kernel modules from the **CUDA/datacenter driver branch**. Required for GPUs not supported by the open-source modules, or when proprietary-only features are needed. | +| `grid` | `kmod-nvidia-grid` | GRID `-grid-azure.run` from `download.microsoft.com` | Proprietary kernel modules from the **GRID/vGPU driver branch**. Includes vGPU mediation support (`nvidia-vgpu-vfio.ko`). Azure licensing embedded. | + +The variant names reflect the **driver branch**, not just the license: +- `open` vs `closed` distinguishes the source license within the same CUDA/datacenter branch +- `grid` identifies the entirely separate GRID/vGPU branch (different versions, different feature set) + +All three **conflict** with each other — only one kmod variant can be installed at a time. 
+ +**Naming rules:** +- Prefix: `kmod-` (standard Linux convention for out-of-tree kernel modules) +- Driver name: `nvidia-` (identifies the source/branch) +- The kernel `uname -r` string is embedded in the **Release** tag, not the Name, + so the same spec can build for any kernel version +- The component template name (e.g., `kmod-nvidia-open`) is expanded with a matrix + suffix by azldev: `kmod-nvidia-open-6-18` is the component name for the + 6.18 kernel build + +## Version and Release Conventions + +### Shared version + +Both `kmod-nvidia-open` and `nvidia-cuda-driver` use the **NVIDIA driver +version** as their RPM `Version:` tag: + +``` +Version: 595.58.03 +``` + +This ensures `Requires: kmod-nvidia-open = %{version}` in the user-space +package pins to the correct driver release without caring about the kernel. + +## Dependency Graph + +``` +nvidia-cuda-driver +├── Requires: (kmod-nvidia-open = 595.58.03 OR kmod-nvidia-closed = 595.58.03) +├── Requires: nvidia-cuda-driver-libs +└── Requires: nvidia-cuda-driver-firmware + +kmod-nvidia-open (per kernel version) kmod-nvidia-closed (per kernel version) +├── BuildRequires: kernel-devel ├── BuildRequires: kernel-devel +├── Requires: kernel-uname-r = ├── Requires: kernel-uname-r = +├── Conflicts: kmod-nvidia-closed ├── Conflicts: kmod-nvidia-open +├── Conflicts: kmod-nvidia-grid ├── Conflicts: kmod-nvidia-grid +├── Provides: kmod-nvidia-open = 595.58.03 ├── Provides: kmod-nvidia-closed = 595.58.03 +└── Provides: kmod-nvidia-open- └── Provides: kmod-nvidia-closed- +``` + +Key points: +- `nvidia-cuda-driver` works with **either** `kmod-nvidia-open` or + `kmod-nvidia-closed` — both are the same CUDA/datacenter driver branch, + just different kernel module sources (open-source vs proprietary) +- The dependency is **version-only** (matches on NVIDIA driver version, + not kernel version) +- `kmod-nvidia-*` → `kernel-uname-r` is an **exact** dependency + (must match the running kernel) +- All three kmod variants conflict 
with each other — only one can be installed +- Users install the kmod variant that matches their GPU and kernel; the + user-space package works with any compatible kmod + +## Component Definition Patterns + +### Kernel modules: `component-templates` with matrix + +The kmod uses `component-templates` with a `kernel` matrix axis. This causes +azldev to generate one component per kernel flavour: + +```toml +[component-templates.kmod-nvidia-open] +description = "Out-of-tree driver built against multiple kernel versions" + +[component-templates.kmod-nvidia-open.default-component-config] +spec = { type = "local", path = "kmod-nvidia-open.spec" } + +[[component-templates.kmod-nvidia-open.matrix]] +axis = "kernel" + +[component-templates.kmod-nvidia-open.matrix.values.default] +# No overlays needed — spec auto-detects kernel_uname_r from +# the installed kernel-devel headers at build time. +``` + +Resulting component name: `kmod-nvidia-open` + +To add a new kernel version, add a new matrix value: + +```toml +[component-templates.kmod-nvidia-open.matrix.values.64k] +``` +Resulting component name: `kmod-nvidia-open-64k` + +### User-space: standard `components` + +The user-space driver is a normal (non-templated) component since it's +kernel-independent: + +```toml +[components.nvidia-cuda-driver] +spec = { type = "local", path = "nvidia-cuda-driver.spec" } + +[[components.nvidia-cuda-driver.source-files]] +filename = "NVIDIA-Linux-x86_64-595.58.03-no-compat32.run" +# ... + +[[components.nvidia-cuda-driver.source-files]] +filename = "NVIDIA-Linux-aarch64-595.58.03.run" +# ... +``` + +Both x86_64 and aarch64 source files are listed; the spec uses `%ifarch` +to select the correct one at build time. + +## Kernel Version Auto-Detection + +The `component-templates` matrix in the kmod TOML file defines which kernel +flavours to build against.
When azldev expands the template, each matrix +value produces a distinct component (e.g., `kmod-nvidia-open-hwe`) whose +build configuration can specify the matching `kernel--devel` +package as a `BuildRequires`. This ensures the correct kernel headers are +pulled into the mock chroot for each flavour. + +Once inside the chroot, the kmod spec auto-detects the exact kernel version +from the installed headers: + +```rpm +%{!?kernel_uname_r: %global kernel_uname_r %(ls -1 /usr/src/kernels/ | sort -V | tail -1)} +``` + +This resolves to the `uname -r` string of whichever `kernel-devel` package +is installed in the mock chroot (e.g., `6.18.5-1.4.azl4.x86_64`). + +To override manually: + +```bash +rpmbuild --define 'kernel_uname_r 6.18.5-1.4.azl4.x86_64' ... +``` + +## Build Commands + +```bash +# Build kernel (prerequisite — produces kernel-devel RPM) +azldev comp build -p kernel --local-repo-with-publish ./base/out -q + +# Build kernel modules (uses kernel-devel from local repo) +azldev comp build -p kmod-nvidia-open --local-repo-with-publish ./base/out -q + +# Build user-space driver +azldev comp build -p nvidia-cuda-driver --local-repo-with-publish ./base/out -q +``` + +Build order: `kernel` → `kmod-nvidia-open-*` → `nvidia-cuda-driver` +(the user-space package has no build-time dependency on the kmod, but +listing it after ensures the kmod RPM is available for integration testing). + +## GRID / vGPU Packages + +The open-source packages (`kmod-nvidia-open` + `nvidia-cuda-driver`) are for +**dedicated GPU compute** — the VM (or host) has exclusive access to the full +physical GPU, whether on bare metal or via GPU passthrough. They are **not +compatible** with NVIDIA GRID/vGPU workloads where a GPU is shared across VMs. + +GRID/vGPU requires a separate set of packages built from proprietary `.run` +files redistributed by Microsoft. 
These `.run` files include Azure-specific +licensing for GRID Virtual GPU Software — **no separate NVIDIA vGPU license +server is required**. The drivers are published at `download.microsoft.com` +and are specific to Azure N-series VM families. + +### GRID package split + +| Package | Source | Contents | +|---------|--------|----------| +| `kmod-nvidia-grid` | GRID `-grid-azure.run` from `download.microsoft.com` | Proprietary kernel modules with vGPU mediation support (`nvidia.ko`, `nvidia-vgpu-vfio.ko`, etc.) | +| `nvidia-grid-driver` | GRID `-grid-azure.run` from `download.microsoft.com` | User-space GRID libraries, vGPU manager tools, and Azure licensing components | + +### Naming conventions + +- **`kmod-nvidia-grid`** — follows the `kmod-nvidia-` pattern with + `grid` identifying the GRID/vGPU **driver branch**. This is distinct from + `kmod-nvidia-closed` (proprietary CUDA/datacenter branch) and + `kmod-nvidia-open` (open-source). The name reflects the branch, not just + the license, because the CUDA and GRID branches carry different versions, + different kernel modules (GRID includes `nvidia-vgpu-vfio.ko`), and target + different workloads. All three kmod variants conflict with each other. +- **`nvidia-grid-driver`** — the user-space companion. Named `grid-driver` + (not `cuda-driver`) to clearly indicate the GRID/vGPU purpose and avoid + confusion with the dedicated-GPU `nvidia-cuda-driver` package. These two + user-space packages also conflict — a system runs either dedicated GPU + compute or GRID/vGPU, not both. 
+ +### Dependency graph (GRID) + +``` +nvidia-grid-driver +├── Requires: kmod-nvidia-grid = +├── Requires: nvidia-grid-driver-libs +└── Requires: nvidia-grid-driver-firmware + └── (GSP firmware + vGPU manager firmware) + +kmod-nvidia-grid (per kernel version) +├── BuildRequires: kernel-devel +├── Requires: kernel-uname-r = +├── Conflicts: kmod-nvidia-open, kmod-nvidia-closed +└── Provides: kmod-nvidia-grid- +``` + +### Conflict matrix + +| Installed | `kmod-nvidia-open` | `kmod-nvidia-closed` | `kmod-nvidia-grid` | `nvidia-cuda-driver` | `nvidia-grid-driver` | +|-----------|--------------------|-----------------------|--------------------|----------------------|----------------------| +| `kmod-nvidia-open` | — | ❌ Conflicts | ❌ Conflicts | ✅ | ❌ Conflicts | +| `kmod-nvidia-closed` | ❌ Conflicts | — | ❌ Conflicts | ✅ | ❌ Conflicts | +| `kmod-nvidia-grid` | ❌ Conflicts | ❌ Conflicts | — | ❌ Conflicts | ✅ | +| `nvidia-cuda-driver` | ✅ | ✅ | ❌ Conflicts | — | ❌ Conflicts | +| `nvidia-grid-driver` | ❌ Conflicts | ❌ Conflicts | ✅ | ❌ Conflicts | — | + +A system installs **one** of three kmod variants and its matching user-space package: +- **`kmod-nvidia-open`** + `nvidia-cuda-driver` — open-source, dedicated GPU compute +- **`kmod-nvidia-closed`** + `nvidia-cuda-driver` — proprietary, dedicated GPU compute +- **`kmod-nvidia-grid`** + `nvidia-grid-driver` — proprietary, GRID/vGPU + +Note that `nvidia-cuda-driver` pairs with **either** `kmod-nvidia-open` or +`kmod-nvidia-closed` (same CUDA/datacenter branch, different source license), +but `nvidia-grid-driver` **only** pairs with `kmod-nvidia-grid` (different +driver branch entirely). 
+ +### Build commands (GRID) + +```bash +# Build GRID kernel modules +azldev comp build -p kmod-nvidia-grid --local-repo-with-publish ./base/out -q + +# Build GRID user-space driver +azldev comp build -p nvidia-grid-driver --local-repo-with-publish ./base/out -q +``` + +> **Note:** Both `kmod-nvidia-grid` and `nvidia-grid-driver` use the +> same `.run` file as their source. The kmod spec extracts and builds only +> the kernel modules, while the user-space spec extracts only the libraries +> and tools (using `--no-kernel-modules`). The `-grid-azure` suffix in the +> filename distinguishes it from the public NVIDIA consumer/datacenter drivers.