From 782d1bc8bab98b14814ab4cbeec1cf5322e48645 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Fri, 26 Feb 2021 10:00:06 -0500 Subject: [PATCH 01/11] fix error message _FULL previously removed --- data-science-stack | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-stack b/data-science-stack index ff0e486..9595df1 100755 --- a/data-science-stack +++ b/data-science-stack @@ -221,7 +221,7 @@ EOF sudo dnf -y module install nvidia-driver:460-dkms # REBOOT not necessary else - nvlog "Automated NVIDIA driver install on $OS_FLAVOR $OS_RELEASE_FULL is not supported." + nvlog "Automated NVIDIA driver install on $OS_FLAVOR $OS_RELEASE is not supported." nvlog "Please install NVIDIA driver $MIN_DRIVER or newer and run again." exit 1 fi From 7ea006201162e40eaf1f9edee833fb793d28b1a5 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 8 Feb 2021 22:01:33 -0500 Subject: [PATCH 02/11] add fedora32 to supported list --- data-science-stack | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data-science-stack b/data-science-stack index 9595df1..f8d3e3d 100755 --- a/data-science-stack +++ b/data-science-stack @@ -34,7 +34,7 @@ OS_RELEASE=$VERSION_ID OS_RELEASE_MAJOR=${VERSION_ID%%.*} # extract major release, e.g. 1.2 -> 1, 1.2.3 -> 1 case $OS_FLAVOR$OS_RELEASE in - ubuntu18.04 | ubuntu20.04 | rhel7* | rhel8* ) + ubuntu18.04 | ubuntu20.04 | rhel7* | rhel8* | fedora32 ) ;; *) echo "Unknown system type: $OS_FLAVOR $OS_RELEASE" From 8008a526f1edbb0d0bfcc080aa79341c08db1094 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 8 Feb 2021 21:30:05 -0500 Subject: [PATCH 03/11] make install_cuda() defensive wrt OS_FLAVOR and OS_RELEASE --- data-science-stack | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data-science-stack b/data-science-stack index f8d3e3d..64eea67 100755 --- a/data-science-stack +++ b/data-science-stack @@ -326,10 +326,13 @@ install_cuda () { sudo yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo sudo yum clean all sudo yum install -y cuda-toolkit-11-0 - else + elif [ $OS_FLAVOR$OS_RELEASE_MAJOR = "rhel8" ]; then sudo dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo sudo dnf clean all sudo dnf -y install cuda-toolkit-11-0 + else + nvlog "ERROR: install_cuda(): unknown system type: $OS_FLAVOR $OS_RELEASE" + exit 1 fi fi set +e From f68c8415e02b9715f99b7d348c355b4e1a4ac66b Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 8 Feb 2021 20:37:06 -0500 Subject: [PATCH 04/11] make install_base() defensive wrt OS_FLAVOR and OS_RELEASE --- data-science-stack | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data-science-stack b/data-science-stack index 64eea67..8fbc301 100755 --- a/data-science-stack +++ b/data-science-stack @@ -127,7 +127,7 @@ install_base () { wget \ which \ yum-utils - else # RHEL 8 + elif [ $OS_FLAVOR$OS_RELEASE_MAJOR = "rhel8" ]; then sudo yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || true sudo yum groupinstall -y 'Development Tools' sudo yum install -y \ @@ -146,6 +146,9 @@ install_base () { wget \ which \ yum-utils + else + nvlog "ERROR: install_base(): unknown system type: $OS_FLAVOR $OS_RELEASE" + exit 1 fi set +e From feb825595b69627313cc6e875f6bd275acddb440 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 8 Feb 2021 21:49:31 -0500 Subject: [PATCH 05/11] detect cuda version with rpm --- data-science-stack | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/data-science-stack b/data-science-stack index 8fbc301..b40814a 100755 --- a/data-science-stack +++ b/data-science-stack @@ -289,6 +289,11 @@ detect_cuda () { if [ $? -ne 0 ]; then CUDA_VER=0 fi + elif which rpm > /dev/null; then + CUDA_VER=$(rpm -q --queryformat '%{VERSION}' cuda 2> /dev/null) + if [ $? -ne 0 ]; then + CUDA_VER=0 + fi else CUDA_VER=0 fi From 3bf43c306d56b5bd7724addfdcec04651d532bbd Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 8 Feb 2021 20:41:07 -0500 Subject: [PATCH 06/11] add fedora32 to install_base --- data-science-stack | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/data-science-stack b/data-science-stack index b40814a..cace9c3 100755 --- a/data-science-stack +++ b/data-science-stack @@ -146,6 +146,24 @@ install_base () { wget \ which \ yum-utils + elif [ $OS_FLAVOR$OS_RELEASE = "fedora32" ]; then + sudo yum groupinstall -y 'Development Tools' + sudo yum install -y \ + bzip2 \ + clang \ + curl \ + device-mapper-persistent-data \ + file \ + git \ + graphviz \ + jq \ + lvm2 \ + npm \ + screen \ + vim \ + wget \ + which \ + yum-utils else nvlog "ERROR: install_base(): unknown system type: $OS_FLAVOR $OS_RELEASE" exit 1 From 1c4809574de0740a594cf260e210e9cc2ccfe4aa Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 8 Feb 2021 21:24:54 -0500 Subject: [PATCH 07/11] add fedora32 instructions to install_driver, using rpmfusion --- data-science-stack | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/data-science-stack b/data-science-stack index cace9c3..f38d810 100755 --- a/data-science-stack +++ b/data-science-stack @@ -241,6 +241,12 @@ EOF sudo dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r) sudo dnf -y module install nvidia-driver:460-dkms # REBOOT not necessary + elif [ $OS_FLAVOR$OS_RELEASE = "fedora32" ]; then + sudo dnf -y update + sudo dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm + sudo dnf install https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm + sudo dnf install akmod-nvidia + REBOOT=1 else nvlog "Automated NVIDIA driver install on $OS_FLAVOR $OS_RELEASE is not supported." nvlog "Please install NVIDIA driver $MIN_DRIVER or newer and run again." From c4fe8873a44e9fdcc7810523eb8083f5bf320821 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 8 Feb 2021 21:50:59 -0500 Subject: [PATCH 08/11] add fedora to install_cuda() --- data-science-stack | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data-science-stack b/data-science-stack index f38d810..9904a90 100755 --- a/data-science-stack +++ b/data-science-stack @@ -362,6 +362,10 @@ install_cuda () { sudo dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo sudo dnf clean all sudo dnf -y install cuda-toolkit-11-0 + elif [ $OS_FLAVOR$OS_RELEASE = "fedora32" ]; then + sudo dnf config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/fedora$(rpm -E %fedora)/x86_64/cuda-fedora$(rpm -E %fedora).repo + sudo dnf clean all + sudo dnf -y install cuda-toolkit-11-0 else nvlog "ERROR: install_cuda(): unknown system type: $OS_FLAVOR $OS_RELEASE" exit 1 From b97e04378ff63c3d2e88f181ea7d56d3ee827f85 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 9 Feb 2021 00:15:49 -0500 Subject: [PATCH 09/11] make install_docker() defensive wrt OS_FLAVOR and OS_RELEASE --- data-science-stack | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data-science-stack b/data-science-stack index 9904a90..d21965f 100755 --- a/data-science-stack +++ b/data-science-stack @@ -484,7 +484,7 @@ install_docker () { sudo systemctl restart docker fi - else # RHEL 8 + elif [ $OS_FLAVOR$OS_RELEASE_MAJOR = "rhel8" ]; then # NVIDIA Repos curl -s -L https://nvidia.github.io/nvidia-docker/$OS_FLAVOR$OS_RELEASE/nvidia-docker.repo | \ sudo tee /etc/yum.repos.d/nvidia-docker.repo @@ -519,6 +519,9 @@ install_docker () { sudo systemctl restart docker fi + else + nvlog "ERROR: install_docker(): unknown system type: $OS_FLAVOR $OS_RELEASE" + exit 1 fi set +e From 976e91f1d7637f57ad4213f82383aad67b7dbe27 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 10 Feb 2021 07:28:29 -0500 Subject: [PATCH 10/11] fix manual install steps, yes please --- data-science-stack | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data-science-stack b/data-science-stack index d21965f..57c3f39 100755 --- a/data-science-stack +++ b/data-science-stack @@ -243,9 +243,9 @@ EOF # REBOOT not necessary elif [ $OS_FLAVOR$OS_RELEASE = "fedora32" ]; then sudo dnf -y update - sudo dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm - sudo dnf install https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm - sudo dnf install akmod-nvidia + sudo dnf -y install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm + sudo dnf -y install https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm + sudo dnf -y install akmod-nvidia REBOOT=1 else nvlog "Automated NVIDIA driver install on $OS_FLAVOR $OS_RELEASE is not supported." From d33788b467cdd59ae5af728d7048a926502e5cb5 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 9 Feb 2021 08:10:15 -0500 Subject: [PATCH 11/11] WIP: fedora32 for install_docker() issues - . Fedora is not supported by nvidia-docker2, using rhel8 does not immediately work . nvidia-container-cli: container error: cgroup subsystem devices not found: unknown -> sed -i 's/#no-cgroups = false/no-cgroups = true/' /etc/nvidia-container-runtime/config.toml . Failed to initialize NVML: Unknown Error -> need --privileged --- data-science-stack | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/data-science-stack b/data-science-stack index 57c3f39..d31498b 100755 --- a/data-science-stack +++ b/data-science-stack @@ -519,6 +519,23 @@ install_docker () { sudo systemctl restart docker fi + elif [ $OS_FLAVOR$OS_RELEASE = "fedora32" ]; then + sudo dnf config-manager --add-repo https://download.docker.com/linux/fedora/docker-ce.repo + sudo dnf -y remove podman-docker # XXX: this deserves a warning to the user + sudo dnf -y install docker-ce docker-ce-cli containerd.io + + # XXX: nvidia-docker does not support Fedora, using RHEL 8.3 instead + sudo dnf config-manager --add-repo https://nvidia.github.io/nvidia-docker/rhel8.3/nvidia-docker.repo + sudo dnf -y install nvidia-docker2 + + # XXX: need to do something for WSL? + sudo systemctl start docker + sudo groupadd -f docker + sudo systemctl restart docker + + # XXX: test docker with: docker run hello-world + # XXX: test nvidia-docker with: docker run --runtime=nvidia --rm nvidia/cuda:11.2.0-base nvidia-smi + else nvlog "ERROR: install_docker(): unknown system type: $OS_FLAVOR $OS_RELEASE" exit 1