
TorchML image doesn't work on cards with compute capability 12 #37

@ilia-nikiforov-umn

Description

Tested on an RTX 5060: the version of libtorch we have appears not to support compute capability 12 (sm_120):
https://discuss.pytorch.org/t/pytorch-support-for-sm-120-nvidia-geforce-rtx-5060/220941
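As a quick cross-check that is independent of libtorch, the compute capability the hardware reports can be queried with the plain CUDA runtime API. A minimal sketch (assuming the CUDA toolkit is installed; the file name is made up):

// cc_probe.cpp -- hypothetical diagnostic, not part of the image.
// Prints the compute capability of every visible CUDA device.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::printf("No CUDA devices visible\n");
    return 1;
  }
  for (int i = 0; i < count; ++i) {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) continue;
    // A 5060 should report 12.0, i.e. sm_120.
    std::printf("Device %d: %s, compute capability %d.%d\n",
                i, prop.name, prop.major, prop.minor);
  }
  return 0;
}

Build with, e.g., nvcc cc_probe.cpp -o cc_probe (or c++ with -lcudart).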

Below is a Python-free script (authored by ChatGPT) that exercises libtorch directly. It passes on a laptop with an A1000 6GB but fails on the 5060:

#!/usr/bin/env bash
set -euo pipefail

# Point this to your CUDA-enabled LibTorch folder
: "${LIBTORCH:=/usr/local}"

cat > torchscript_cuda_sanity.cpp <<'CPP'
#include <torch/torch.h>
#include <torch/script.h>
#include <iostream>
#include <vector>

static void check(bool cond, const char* msg) {
  if (!cond) {
    std::cerr << "ERROR: " << msg << "\n";
    std::exit(1);
  }
}

int main() {
  std::cout << "=== LibTorch TorchScript CUDA sanity ===\n";
  std::cout << "Torch version: " << TORCH_VERSION << "\n";

  bool cuda_available = torch::cuda::is_available();
  std::cout << "CUDA available: " << (cuda_available ? "true" : "false") << "\n";
  if (cuda_available) {
    std::cout << "CUDA device count: " << torch::cuda::device_count() << "\n";
  }

  // TorchScript code: a Module with forward(a, b) = a + b
  const std::string ts_src = R"TS(
    def forward(self, a: Tensor, b: Tensor) -> Tensor:
        return a + b
  )TS";

  torch::jit::Module m("AddModule");
  m.define(ts_src);

  // CPU test (double tensors)
  auto a_cpu = torch::tensor({1.5, 2.25, -3.0}, torch::dtype(torch::kFloat64).device(torch::kCPU));
  auto b_cpu = torch::tensor({3.0, 4.75,  7.0}, torch::dtype(torch::kFloat64).device(torch::kCPU));

  auto out_cpu_iv = m.forward({a_cpu, b_cpu});
  auto out_cpu = out_cpu_iv.toTensor();
  std::cout << "\nCPU output: " << out_cpu << "\n";

  // Basic correctness check on CPU
  auto expected_cpu = a_cpu + b_cpu;
  check(out_cpu.equal(expected_cpu), "CPU TorchScript result mismatch");

  if (!cuda_available) {
    std::cout << "\nCUDA not available in this build/runtime. CPU sanity PASSED.\n";
    return 0;
  }

  // CUDA test (double tensors)
  auto a_gpu = a_cpu.to(torch::kCUDA);
  auto b_gpu = b_cpu.to(torch::kCUDA);

  auto out_gpu_iv = m.forward({a_gpu, b_gpu});
  auto out_gpu = out_gpu_iv.toTensor();

  // Force sync to surface kernel/runtime problems immediately
  torch::cuda::synchronize();

  std::cout << "GPU output: " << out_gpu.cpu() << "\n";

  // Compare CPU vs GPU
  check(out_gpu.cpu().equal(out_cpu), "GPU TorchScript output != CPU output");

  // Extra CUDA sanity: a small matmul to confirm kernels run fine
  auto x = torch::randn({512, 512}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
  auto y = torch::randn({512, 512}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
  auto z = x.matmul(y);
  torch::cuda::synchronize();
  std::cout << "\nExtra CUDA op OK. z.mean() = " << z.mean().item<double>() << "\n";

  std::cout << "\nPASS: TorchScript + CUDA sanity check succeeded.\n";
  return 0;
}
CPP

# Build (Linux; the as-needed linker flags below are GNU-ld specific).
# Requires a CUDA-enabled LibTorch.
# Notes:
# - If you get undefined references, your LIBTORCH is likely CPU-only or mismatched.
# - --no-as-needed forces libtorch_cuda/libc10_cuda to be linked even though the
#   program references none of their symbols directly.
# - rpath keeps the binary from needing LD_LIBRARY_PATH at runtime.
c++ -O2 -std=c++17 torchscript_cuda_sanity.cpp -o torchscript_cuda_sanity \
  -I"$LIBTORCH/include" \
  -I"$LIBTORCH/include/torch/csrc/api/include" \
  -L"$LIBTORCH/lib" \
  -Wl,--no-as-needed \
  -ltorch_cuda \
  -lc10_cuda \
  -Wl,--as-needed \
  -ltorch \
  -ltorch_cpu \
  -lc10 \
  -Wl,-rpath,"$LIBTORCH/lib"


./torchscript_cuda_sanity

Save the above as torchscript_cuda_sanity.sh, then run:

chmod +x torchscript_cuda_sanity.sh
LIBTORCH=/usr/local ./torchscript_cuda_sanity.sh
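Separately from the sanity test, the SASS architectures actually embedded in the shipped libtorch_cuda.so can be listed with cuobjdump from the CUDA toolkit. A rough sketch (assumes cuobjdump is on PATH and that the library path matches your install):

cuobjdump --list-elf "$LIBTORCH/lib/libtorch_cuda.so" | grep -o 'sm_[0-9]*' | sort -u

If sm_120 is missing from the output (and there is no PTX recent enough to JIT-compile from), the 5060 failure is expected until the image's libtorch is rebuilt or upgraded against a CUDA 12.8-era build that includes sm_120 in its arch list.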
