
TorchML image doesn't work on cards with compute capability 12 #37

@ilia-nikiforov-umn

Description

Tested on an RTX 5060: the version of libtorch we have appears not to support compute capability 12 (sm_120):
https://discuss.pytorch.org/t/pytorch-support-for-sm-120-nvidia-geforce-rtx-5060/220941
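As a quick cross-check that is independent of libtorch, the compute capability the hardware reports can be queried with the plain CUDA runtime API. A minimal sketch (assuming the CUDA toolkit is installed; the file name is made up):

// cc_probe.cpp -- hypothetical diagnostic, not part of the image.
// Prints the compute capability of every visible CUDA device.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
    std::printf("No CUDA devices visible\n");
    return 1;
  }
  for (int i = 0; i < count; ++i) {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) continue;
    // A 5060 should report 12.0, i.e. sm_120.
    std::printf("Device %d: %s, compute capability %d.%d\n",
                i, prop.name, prop.major, prop.minor);
  }
  return 0;
}

Build with, e.g., nvcc cc_probe.cpp -o cc_probe (or c++ with -lcudart).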

Below is a Python-free script (authored by ChatGPT) that exercises libtorch directly. It passes on a laptop with an A1000 6GB but fails on the 5060:

#!/usr/bin/env bash
set -euo pipefail

# Point this to your CUDA-enabled LibTorch folder
: "${LIBTORCH:=/usr/local}"

cat > torchscript_cuda_sanity.cpp <<'CPP'
#include <torch/torch.h>
#include <torch/script.h>
#include <iostream>
#include <vector>

static void check(bool cond, const char* msg) {
  if (!cond) {
    std::cerr << "ERROR: " << msg << "\n";
    std::exit(1);
  }
}

int main() {
  std::cout << "=== LibTorch TorchScript CUDA sanity ===\n";
  std::cout << "Torch version: " << TORCH_VERSION << "\n";

  bool cuda_available = torch::cuda::is_available();
  std::cout << "CUDA available: " << (cuda_available ? "true" : "false") << "\n";
  if (cuda_available) {
    std::cout << "CUDA device count: " << torch::cuda::device_count() << "\n";
  }

  // TorchScript code: a Module with forward(a, b) = a + b
  const std::string ts_src = R"TS(
    def forward(self, a: Tensor, b: Tensor) -> Tensor:
        return a + b
  )TS";

  torch::jit::Module m("AddModule");
  m.define(ts_src);

  // CPU test (double tensors)
  auto a_cpu = torch::tensor({1.5, 2.25, -3.0}, torch::dtype(torch::kFloat64).device(torch::kCPU));
  auto b_cpu = torch::tensor({3.0, 4.75,  7.0}, torch::dtype(torch::kFloat64).device(torch::kCPU));

  auto out_cpu_iv = m.forward({a_cpu, b_cpu});
  auto out_cpu = out_cpu_iv.toTensor();
  std::cout << "\nCPU output: " << out_cpu << "\n";

  // Basic correctness check on CPU
  auto expected_cpu = a_cpu + b_cpu;
  check(out_cpu.equal(expected_cpu), "CPU TorchScript result mismatch");

  if (!cuda_available) {
    std::cout << "\nCUDA not available in this build/runtime. CPU sanity PASSED.\n";
    return 0;
  }

  // CUDA test (double tensors)
  auto a_gpu = a_cpu.to(torch::kCUDA);
  auto b_gpu = b_cpu.to(torch::kCUDA);

  auto out_gpu_iv = m.forward({a_gpu, b_gpu});
  auto out_gpu = out_gpu_iv.toTensor();

  // Force sync to surface kernel/runtime problems immediately
  torch::cuda::synchronize();

  std::cout << "GPU output: " << out_gpu.cpu() << "\n";

  // Compare CPU vs GPU
  check(out_gpu.cpu().equal(out_cpu), "GPU TorchScript output != CPU output");

  // Extra CUDA sanity: a small matmul to confirm kernels run fine
  auto x = torch::randn({512, 512}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
  auto y = torch::randn({512, 512}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
  auto z = x.matmul(y);
  torch::cuda::synchronize();
  std::cout << "\nExtra CUDA op OK. z.mean() = " << z.mean().item<double>() << "\n";

  std::cout << "\nPASS: TorchScript + CUDA sanity check succeeded.\n";
  return 0;
}
CPP

# Build (Linux; the as-needed linker flags below are GNU-ld specific).
# Requires a CUDA-enabled LibTorch.
# Notes:
# - If you get undefined references, your LIBTORCH is likely CPU-only or mismatched.
# - --no-as-needed forces libtorch_cuda/libc10_cuda to be linked even though the
#   program references none of their symbols directly.
# - rpath keeps the binary from needing LD_LIBRARY_PATH at runtime.
c++ -O2 -std=c++17 torchscript_cuda_sanity.cpp -o torchscript_cuda_sanity \
  -I"$LIBTORCH/include" \
  -I"$LIBTORCH/include/torch/csrc/api/include" \
  -L"$LIBTORCH/lib" \
  -Wl,--no-as-needed \
  -ltorch_cuda \
  -lc10_cuda \
  -Wl,--as-needed \
  -ltorch \
  -ltorch_cpu \
  -lc10 \
  -Wl,-rpath,"$LIBTORCH/lib"


./torchscript_cuda_sanity

Save the above as torchscript_cuda_sanity.sh, then run:

chmod +x torchscript_cuda_sanity.sh
LIBTORCH=/usr/local ./torchscript_cuda_sanity.sh
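Separately from the sanity test, the SASS architectures actually embedded in the shipped libtorch_cuda.so can be listed with cuobjdump from the CUDA toolkit. A rough sketch (assumes cuobjdump is on PATH and that the library path matches your install):

cuobjdump --list-elf "$LIBTORCH/lib/libtorch_cuda.so" | grep -o 'sm_[0-9]*' | sort -u

If sm_120 is missing from the output (and there is no PTX recent enough to JIT-compile from), the 5060 failure is expected until the image's libtorch is rebuilt or upgraded against a CUDA 12.8-era build that includes sm_120 in its arch list.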
