Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
From: Bottlerocket Kernel Kit <noreply@amazon.com>

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand where you are coming from with this patch, but I'd prefer that we take a different approach so that we don't have to carry a patch.

I think we can extend ghostdog to create the device and call it after we load the kernel module, as a subsequent ExecStart call. It will basically do what the insmod script does in the GDRCopy repo.

Date: Mon, 02 Jun 2026 22:00:00 +0000
Subject: [PATCH] gdrdrv: register as a misc device

Upstream gdrdrv uses register_chrdev() and relies on a user-space helper
to create /dev/gdrdrv via mknod after modprobe. On Bottlerocket the host
shell is minimal (no awk/grep/etc.) and the kit convention is to ship no
helper scripts. Switch the driver to misc_register() with a fixed mode of
0666 so devtmpfs creates /dev/gdrdrv automatically when the module loads,
matching the IMEX "just provide it" pattern used elsewhere in the kit.

Signed-off-by: Bottlerocket Kernel Kit <noreply@amazon.com>
---
src/gdrdrv/gdrdrv.c | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/gdrdrv/gdrdrv.c b/src/gdrdrv/gdrdrv.c
--- a/src/gdrdrv/gdrdrv.c
+++ b/src/gdrdrv/gdrdrv.c
@@ -30,6 +30,7 @@
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
+#include <linux/miscdevice.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/io.h>
@@ -95,7 +96,6 @@

//-----------------------------------------------------------------------------

-static int gdrdrv_major = 0;
static int gdrdrv_cpu_could_cache_gpu_mappings = 0;
static int gdrdrv_cpu_must_use_device_mapping = 0;

@@ -1880,21 +1880,27 @@ struct file_operations gdrdrv_fops = {
.mmap = gdrdrv_mmap
};

+static struct miscdevice gdrdrv_misc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = DEVNAME,
+ .fops = &gdrdrv_fops,
+ .mode = 0666,
+};
+
//-----------------------------------------------------------------------------

static int __init gdrdrv_init(void)
{
int result;

- result = register_chrdev(gdrdrv_major, DEVNAME, &gdrdrv_fops);
+ result = misc_register(&gdrdrv_misc_dev);
if (result < 0) {
- gdr_err("can't get major %d\n", gdrdrv_major);
+ gdr_err("misc_register failed: %d\n", result);
return result;
}
- if (gdrdrv_major == 0) gdrdrv_major = result; /* dynamic */

gdr_msg(KERN_INFO, "loading gdrdrv version %s built for %s NVIDIA driver\n", GDRDRV_VERSION_STRING, GDRDRV_BUILT_FOR_NVIDIA_FLAVOR_STRING);
- gdr_msg(KERN_INFO, "device registered with major number %d\n", gdrdrv_major);
+ gdr_msg(KERN_INFO, "registered as misc device, minor %d\n", gdrdrv_misc_dev.minor);
gdr_msg(KERN_INFO, "dbg traces %s, info traces %s", dbg_enabled ? "enabled" : "disabled", info_enabled ? "enabled" : "disabled");

#if defined(CONFIG_PPC64) && defined(PVR_POWER9)
@@ -1947,12 +1953,12 @@ void gdrdrv_procfs_cleanup(void)
static void __exit gdrdrv_cleanup(void)
{
int64_t last_nv_get_pages_refcount;
- gdr_msg(KERN_INFO, "unregistering major number %d\n", gdrdrv_major);
+ gdr_msg(KERN_INFO, "unregistering misc device\n");

gdrdrv_procfs_cleanup();

/* cleanup_module is never called if registering failed */
- unregister_chrdev(gdrdrv_major, DEVNAME);
+ misc_deregister(&gdrdrv_misc_dev);

last_nv_get_pages_refcount = atomic64_read(&gdrdrv_nv_get_pages_refcount);
if (dbg_enabled)
--
2.49.0
5 changes: 5 additions & 0 deletions packages/kmod-6.18-nvidia-r580/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,10 @@ url = "https://raw.githubusercontent.com/NVIDIA/open-gpu-kernel-modules/580.159.
sha512 = "f9cee68cbb12095af4b4e92d01c210461789ef41c70b64efefd6719d0b88468b7a67a3629c432d4d9304c730b5d1a942228a5bcc74a03ab1c411c77c758cd938"
force-upstream = true

[[package.metadata.build-package.external-files]]
url = "https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.2.tar.gz"
sha512 = "c717f118eff8cd5a8dc35613c3881818f8b71dc493461dd0151ce7c882f8e2c2d852e22733fab4e2bec57219e10eec874c11b4fad90dd4815ae572840ed19d28"
force-upstream = true

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not correct. This kernel module is MIT and we can distribute its sources.


[build-dependencies]
kernel-6_18 = { path = "../kernel-6.18" }
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Oneshot unit that runs driverdog's `link-modules` step for the
# nvidia-gdrcopy-open-gpu modules set. Pulled in by drivers.target.
[Unit]
Description=Copy GDRCopy kernel module (open-gpu)
# The path prefix placeholder below is substituted when the package is built.
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded will override
# the already linked kernel modules. This doesn't affect the running system,
# since kernel modules are linked early in the boot sequence, but we still
# disable manual restarts to prevent unnecessary kernel module rewrites.
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# Gate on the driver flavor: if the ghostdog check for the open-gpu driver
# does not succeed, ExecStart is not run.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
ExecStart=/usr/bin/driverdog --modules-set nvidia-gdrcopy-open-gpu link-modules
# Keep the unit reported as active after the oneshot command exits.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=drivers.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Oneshot unit that runs driverdog's `link-modules` step for the
# nvidia-gdrcopy-tesla modules set. Pulled in by drivers.target.
[Unit]
Description=Copy GDRCopy kernel module (tesla)
# The path prefix placeholder below is substituted when the package is built.
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Rerunning this service after the system is fully loaded will override
# the already linked kernel modules. This doesn't affect the running system,
# since kernel modules are linked early in the boot sequence, but we still
# disable manual restarts to prevent unnecessary kernel module rewrites.
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# Gate on the driver flavor: if the ghostdog check for the tesla driver
# does not succeed, ExecStart is not run.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-gdrcopy-tesla link-modules
# Keep the unit reported as active after the oneshot command exits.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=drivers.target
115 changes: 115 additions & 0 deletions packages/kmod-6.18-nvidia-r580/kmod-6.18-nvidia-r580.spec
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
%global tesla_patch 03
%global tesla_ver %{tesla_major}.%{tesla_minor}.%{tesla_patch}
%global grid_ver grid-19.5
%global gdrcopy_ver 2.5.2
%if "%{?_cross_arch}" == "aarch64"
%global nvidia_arch sbsa
%else
Expand Down Expand Up @@ -46,6 +47,8 @@ Source11: https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/sbsa
Source20: https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/x86_64/nvidia-imex-%{tesla_ver}-1.amzn2023.x86_64.rpm
Source21: https://developer.download.nvidia.com/compute/cuda/repos/amzn2023/sbsa/nvidia-imex-%{tesla_ver}-1.amzn2023.aarch64.rpm

Source100: https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v%{gdrcopy_ver}.tar.gz

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For consistency with the core kit, fetch the archive from the archives URLs. You can check what we do in other packages for the core kit.


# Common NVIDIA conf files from 200 to 299
Source200: nvidia-tmpfiles.conf.in
Source202: nvidia-dependencies-modules-load.conf
Expand Down Expand Up @@ -85,7 +88,16 @@ Source503: load-open-gpu-kernel-modules.service.in
Source504: copy-grid-kernel-modules.service.in
Source505: load-grid-kernel-modules.service.in

Source600: nvidia-gdrcopy-open-gpu-config.toml.in
Source601: copy-gdrcopy-open-gpu-kernel-module.service.in
Source602: load-gdrcopy-open-gpu-kernel-module.service.in
Source603: nvidia-gdrcopy-tmpfiles.conf
Source604: nvidia-gdrcopy-tesla-config.toml.in
Source605: copy-gdrcopy-tesla-kernel-module.service.in
Source606: load-gdrcopy-tesla-kernel-module.service.in

Patch001: 0001-makefile-allow-to-use-any-kernel-arch.patch
Patch002: 0002-gdrdrv-register-as-misc-device.patch

BuildRequires: %{_cross_os}kernel-6.18-devel
Requires: %{_cross_os}kernel-6.18
Expand All @@ -96,6 +108,7 @@ Requires: %{name}-open-gpu
Requires: %{name}-grid
%endif
Requires: %{name}-mps
Requires: %{name}-gdrcopy

%description
%{summary}.
Expand Down Expand Up @@ -165,6 +178,19 @@ Requires: %{name}
%description mps
%{summary}.

%package gdrcopy
Summary: NVIDIA GDRCopy kernel driver (gdrdrv) for the r%{tesla_major} NVIDIA driver
Version: %{gdrcopy_ver}
License: MIT
Requires: %{_cross_os}variant-platform(aws)
Requires: %{name}

%description gdrcopy
%{summary}.
Ships two flavors of gdrdrv: one built against the open NVIDIA driver and one
against the proprietary driver. The right flavor is loaded at boot based on
which driver variant ghostdog matches.
Comment on lines +190 to +192

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For consistency, this isn't needed.


%prep
# Extract nvidia sources with `-x`, otherwise the script will try to install
# the driver in the current run
Expand Down Expand Up @@ -273,6 +299,48 @@ and .devid != "0x2237")' supported-gpus.json | jq -s '{"open-gpu": .}' > open-gp
jq -e '."open-gpu"[] | select(."devid" == "0x2330") | ."features"| index("kernelopen")' open-gpu-supported-devices.json
popd

tar -xof %{S:100}
pushd gdrcopy-%{gdrcopy_ver}
%patch 2 -p1
popd

pushd gdrcopy-%{gdrcopy_ver}/src/gdrdrv

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of reusing the same source tree, could you please prepare two copies of the sources, one for each compilation? I recall we do something similar elsewhere, where we prepare two copies of the same sources and just configure them differently.

NVIDIA_SRC_DIR="%{_builddir}/NVIDIA-Linux-%{_cross_arch}-%{tesla_ver}/kernel-open/nvidia" \
NVIDIA_IS_OPENSOURCE=y \
HAVE_VM_FLAGS_SET=y \
HAVE_PROC_OPS=y \
KBUILD_EXTRA_SYMBOLS="%{_builddir}/NVIDIA-Linux-%{_cross_arch}-%{tesla_ver}/kernel-open/Module.symvers" \
Comment on lines +309 to +312

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These can be passed after `make` instead, for consistency with the existing make command.

make %{?_smp_mflags} \
-C %{kernel_sources} \
M="$PWD" \
ARCH=%{_cross_karch} \
IGNORE_CC_MISMATCH=1 \
CC=%{_cross_target}-gcc \
LD=%{_cross_target}-ld \
modules

%{_cross_target}-strip -g --strip-unneeded gdrdrv.ko
mv gdrdrv.ko ../../gdrdrv-open-gpu.ko

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have two sources, you don't have to move the kernel module and you can refer to it directly in the install section.

make clean

NVIDIA_SRC_DIR="%{_builddir}/NVIDIA-Linux-%{_cross_arch}-%{tesla_ver}/kernel/nvidia" \
NVIDIA_IS_OPENSOURCE=y \

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not the open source version, right?

HAVE_VM_FLAGS_SET=y \
HAVE_PROC_OPS=y \
KBUILD_EXTRA_SYMBOLS="%{_builddir}/NVIDIA-Linux-%{_cross_arch}-%{tesla_ver}/kernel/Module.symvers" \
make %{?_smp_mflags} \
-C %{kernel_sources} \
M="$PWD" \
ARCH=%{_cross_karch} \
IGNORE_CC_MISMATCH=1 \
CC=%{_cross_target}-gcc \
LD=%{_cross_target}-ld \
modules

%{_cross_target}-strip -g --strip-unneeded gdrdrv.ko

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to strip the kernel symbols? Isn't that already done when the kernel module is built?

mv gdrdrv.ko ../../gdrdrv-tesla.ko
popd

%install
install -d %{buildroot}%{_cross_libdir}
install -d %{buildroot}%{_cross_tmpfilesdir}
Expand Down Expand Up @@ -524,6 +592,36 @@ install -p -m 0644 %{S:217} %{buildroot}%{_cross_tmpfilesdir}/nvidia-imex.conf
install -d %{buildroot}%{_cross_libdir}/modprobe.d
install -p -m 0644 %{S:218} %{buildroot}%{_cross_libdir}/modprobe.d/10-nvidia-default-imex-channel.conf

install -d %{buildroot}%{_cross_datadir}/nvidia/gdrcopy/open-gpu/drivers
install -d %{buildroot}%{_cross_datadir}/nvidia/gdrcopy/tesla/drivers

install -p -m 0644 gdrcopy-%{gdrcopy_ver}/gdrdrv-open-gpu.ko \
%{buildroot}%{_cross_datadir}/nvidia/gdrcopy/open-gpu/drivers/gdrdrv.ko
install -p -m 0644 gdrcopy-%{gdrcopy_ver}/gdrdrv-tesla.ko \
%{buildroot}%{_cross_datadir}/nvidia/gdrcopy/tesla/drivers/gdrdrv.ko

install -p -m 0644 gdrcopy-%{gdrcopy_ver}/LICENSE gdrcopy-LICENSE

sed -e 's|__NVIDIA_MODULES__|%{_cross_datadir}/nvidia/gdrcopy/open-gpu/drivers/|' %{S:600} > \
nvidia-gdrcopy-open-gpu.toml
install -m 0644 nvidia-gdrcopy-open-gpu.toml %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/drivers
sed -e 's|__NVIDIA_MODULES__|%{_cross_datadir}/nvidia/gdrcopy/tesla/drivers/|' %{S:604} > \
nvidia-gdrcopy-tesla.toml
install -m 0644 nvidia-gdrcopy-tesla.toml %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/drivers

sed -e 's|PREFIX|%{_cross_prefix}|g' %{S:601} > copy-gdrcopy-open-gpu-kernel-module.service
sed -e 's|PREFIX|%{_cross_prefix}|g' %{S:602} > load-gdrcopy-open-gpu-kernel-module.service
sed -e 's|PREFIX|%{_cross_prefix}|g' %{S:605} > copy-gdrcopy-tesla-kernel-module.service
sed -e 's|PREFIX|%{_cross_prefix}|g' %{S:606} > load-gdrcopy-tesla-kernel-module.service
install -p -m 0644 \
copy-gdrcopy-open-gpu-kernel-module.service \
load-gdrcopy-open-gpu-kernel-module.service \
copy-gdrcopy-tesla-kernel-module.service \
load-gdrcopy-tesla-kernel-module.service \
%{buildroot}%{_cross_unitdir}

install -p -m 0644 %{S:603} %{buildroot}%{_cross_tmpfilesdir}/nvidia-gdrcopy.conf

%files
%{_cross_attribution_file}
%dir %{_cross_libexecdir}/nvidia
Expand Down Expand Up @@ -819,3 +917,20 @@ install -p -m 0644 %{S:218} %{buildroot}%{_cross_libdir}/modprobe.d/10-nvidia-de
%{_cross_bindir}/nvidia-cuda-mps-server
%{_cross_libexecdir}/nvidia/tesla/bin/nvidia-cuda-mps-control
%{_cross_libexecdir}/nvidia/tesla/bin/nvidia-cuda-mps-server

%files gdrcopy
%license gdrcopy-LICENSE
%dir %{_cross_datadir}/nvidia/gdrcopy
%dir %{_cross_datadir}/nvidia/gdrcopy/open-gpu
%dir %{_cross_datadir}/nvidia/gdrcopy/open-gpu/drivers
%dir %{_cross_datadir}/nvidia/gdrcopy/tesla
%dir %{_cross_datadir}/nvidia/gdrcopy/tesla/drivers
%{_cross_datadir}/nvidia/gdrcopy/open-gpu/drivers/gdrdrv.ko
%{_cross_datadir}/nvidia/gdrcopy/tesla/drivers/gdrdrv.ko
%{_cross_factorydir}%{_cross_sysconfdir}/drivers/nvidia-gdrcopy-open-gpu.toml
%{_cross_factorydir}%{_cross_sysconfdir}/drivers/nvidia-gdrcopy-tesla.toml
%{_cross_tmpfilesdir}/nvidia-gdrcopy.conf
%{_cross_unitdir}/copy-gdrcopy-open-gpu-kernel-module.service
%{_cross_unitdir}/load-gdrcopy-open-gpu-kernel-module.service
%{_cross_unitdir}/copy-gdrcopy-tesla-kernel-module.service
%{_cross_unitdir}/load-gdrcopy-tesla-kernel-module.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Oneshot unit that runs driverdog's `load-modules` step for the
# nvidia-gdrcopy-open-gpu modules set. Pulled in by drivers.target.
[Unit]
Description=Load GDRCopy kernel module (open-gpu)
# The path prefix placeholder below is substituted when the package is built.
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Start only after the GDRCopy copy unit and the open-gpu module load unit;
# both are hard requirements of this unit.
After=copy-gdrcopy-open-gpu-kernel-module.service load-open-gpu-kernel-modules.service
Requires=copy-gdrcopy-open-gpu-kernel-module.service load-open-gpu-kernel-modules.service
# Disable manual restarts to prevent loading kernel modules
# that weren't linked by the running system
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# Gate on the driver flavor: if the ghostdog check for the open-gpu driver
# does not succeed, ExecStart is not run.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver open-gpu
ExecStart=/usr/bin/driverdog --modules-set nvidia-gdrcopy-open-gpu load-modules
# Keep the unit reported as active after the oneshot command exits.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=drivers.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Oneshot unit that runs driverdog's `load-modules` step for the
# nvidia-gdrcopy-tesla modules set. Pulled in by drivers.target.
[Unit]
Description=Load GDRCopy kernel module (tesla)
# The path prefix placeholder below is substituted when the package is built.
RequiresMountsFor=PREFIX/lib/modules PREFIX/src/kernels
# Start only after the GDRCopy copy unit and the tesla module load unit;
# both are hard requirements of this unit.
After=copy-gdrcopy-tesla-kernel-module.service load-tesla-kernel-modules.service
Requires=copy-gdrcopy-tesla-kernel-module.service load-tesla-kernel-modules.service
# Disable manual restarts to prevent loading kernel modules
# that weren't linked by the running system
RefuseManualStart=true
RefuseManualStop=true

[Service]
Type=oneshot
# Gate on the driver flavor: if the ghostdog check for the tesla driver
# does not succeed, ExecStart is not run.
ExecCondition=/usr/bin/ghostdog match-nvidia-driver tesla
ExecStart=/usr/bin/driverdog --modules-set nvidia-gdrcopy-tesla load-modules
# Keep the unit reported as active after the oneshot command exits.
RemainAfterExit=true
StandardError=journal+console

[Install]
RequiredBy=drivers.target
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# driverdog modules set for the gdrdrv module built against the open-gpu
# NVIDIA driver; selected with `--modules-set nvidia-gdrcopy-open-gpu`.
[nvidia-gdrcopy-open-gpu]
# Destination directory for the module. Presumably relative to the running
# kernel's modules directory -- confirm against driverdog.
lib-modules-path = "kernel/drivers/extra/video/nvidia/gdrcopy/open-gpu"

[nvidia-gdrcopy-open-gpu.kernel-modules."gdrdrv.ko"]
# The placeholder value below is replaced at package build time with the
# directory that holds the prebuilt open-gpu gdrdrv.ko.
copy-source = "__NVIDIA_MODULES__"
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# driverdog modules set for the gdrdrv module built against the tesla
# NVIDIA driver; selected with `--modules-set nvidia-gdrcopy-tesla`.
[nvidia-gdrcopy-tesla]
# Destination directory for the module. Presumably relative to the running
# kernel's modules directory -- confirm against driverdog.
lib-modules-path = "kernel/drivers/extra/video/nvidia/gdrcopy/tesla"

[nvidia-gdrcopy-tesla.kernel-modules."gdrdrv.ko"]
# The placeholder value below is replaced at package build time with the
# directory that holds the prebuilt tesla gdrdrv.ko.
copy-source = "__NVIDIA_MODULES__"
2 changes: 2 additions & 0 deletions packages/kmod-6.18-nvidia-r580/nvidia-gdrcopy-tmpfiles.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copy the packaged factory defaults for both GDRCopy driverdog configs
# into /etc/drivers (one per driver flavor).
C /etc/drivers/nvidia-gdrcopy-open-gpu.toml
C /etc/drivers/nvidia-gdrcopy-tesla.toml
4 changes: 4 additions & 0 deletions packages/kmod-6.18-nvidia-r580/nvidia-tmpfiles.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/op
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/open-gpu 0755 root root - -
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/grid - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/grid 0755 root root - -
R __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/gdrcopy - - - - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/gdrcopy 0755 root root - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/gdrcopy/open-gpu 0755 root root - -
d __PREFIX__/lib/modules/__KERNEL_VERSION__/kernel/drivers/extra/video/nvidia/gdrcopy/tesla 0755 root root - -
C /etc/nvidia/fabricmanager.cfg - - - -
C /etc/nvidia/fabricmanager.env - - - -
C /etc/nvidia/gridd.conf - - - -
Expand Down
Loading