From 2bcb72e7efb3d63fcc045d6cd5405aad3610113d Mon Sep 17 00:00:00 2001 From: Flakebi Date: Fri, 20 Feb 2026 10:07:28 +0100 Subject: [PATCH 1/3] Add inline asm support for amdgpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for inline assembly for the amdgpu backend (the amdgcn-amd-amdhsa target). Add register classes for `vgpr` (vector general purpose register) and `sgpr` (scalar general purpose register). The LLVM backend supports two more classes. One is `reg`, which is either a VGPR or an SGPR, with the choice left to the compiler. As instructions often rely on a register being either a VGPR or SGPR for the assembly to be valid, reg doesn’t seem that useful (I struggled to write correct tests for it), so I didn’t end up adding it. The fourth register class is AGPRs, which only exist on some hardware versions (not the consumer ones) and they have restricted ways to write and read from them, which makes it hard to write a Rust variable into them. They could be used inside assembly blocks, but I didn’t add them as a Rust register class. There is one change affecting general inline assembly code: `InlineAsmReg::name()` now returns a `Cow` instead of a `&'static str`. Because amdgpu has many registers, 256 VGPRs plus combinations of 2 or 4 VGPRs, and I didn’t want to list hundreds of static strings, the amdgpu reg stores the register number(s) and a non-static String is generated at runtime for the register name. 
--- compiler/rustc_codegen_gcc/src/asm.rs | 4 + compiler/rustc_codegen_llvm/src/asm.rs | 5 + compiler/rustc_span/src/symbol.rs | 2 + compiler/rustc_target/src/asm/amdgpu.rs | 268 ++++++++++++++++++++++++ compiler/rustc_target/src/asm/mod.rs | 61 ++++-- tests/assembly-llvm/asm/amdgpu-types.rs | 232 ++++++++++++++++++++ 6 files changed, 554 insertions(+), 18 deletions(-) create mode 100644 compiler/rustc_target/src/asm/amdgpu.rs create mode 100644 tests/assembly-llvm/asm/amdgpu-types.rs diff --git a/compiler/rustc_codegen_gcc/src/asm.rs b/compiler/rustc_codegen_gcc/src/asm.rs index 319f3d3278730..389c0453db401 100644 --- a/compiler/rustc_codegen_gcc/src/asm.rs +++ b/compiler/rustc_codegen_gcc/src/asm.rs @@ -670,6 +670,8 @@ fn reg_class_to_gcc(reg_class: InlineAsmRegClass) -> &'static str { InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => { unreachable!("clobber-only") } + InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr) => "v", + InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr) => "Sg", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => "r", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low16) @@ -767,6 +769,7 @@ fn dummy_output_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, reg: InlineAsmRegCl InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => { unreachable!("clobber-only") } + InlineAsmRegClass::Amdgpu(_) => cx.type_i32(), InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => cx.type_i32(), InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => cx.type_f32(), @@ -953,6 +956,7 @@ fn modifier_to_gcc( InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => { unreachable!("clobber-only") } + InlineAsmRegClass::Amdgpu(_) => None, InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => None, InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => None, diff 
--git a/compiler/rustc_codegen_llvm/src/asm.rs b/compiler/rustc_codegen_llvm/src/asm.rs index 80d77be1cc384..1acc588622e03 100644 --- a/compiler/rustc_codegen_llvm/src/asm.rs +++ b/compiler/rustc_codegen_llvm/src/asm.rs @@ -228,6 +228,7 @@ impl<'ll, 'tcx> AsmBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC | InlineAsmArch::Arm => { constraints.push("~{cc}".to_string()); } + InlineAsmArch::Amdgpu => {} InlineAsmArch::X86 | InlineAsmArch::X86_64 => { constraints.extend_from_slice(&[ "~{dirflag}".to_string(), @@ -645,6 +646,8 @@ fn reg_to_llvm(reg: InlineAsmRegOrRegClass, layout: Option<&TyAndLayout<'_>>) -> | Arm(ArmInlineAsmRegClass::dreg_low8) | Arm(ArmInlineAsmRegClass::qreg_low4) => "x", Arm(ArmInlineAsmRegClass::dreg) | Arm(ArmInlineAsmRegClass::qreg) => "w", + Amdgpu(AmdgpuInlineAsmRegClass::sgpr) => "s", + Amdgpu(AmdgpuInlineAsmRegClass::vgpr) => "v", Hexagon(HexagonInlineAsmRegClass::reg) => "r", Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"), LoongArch(LoongArchInlineAsmRegClass::reg) => "r", @@ -746,6 +749,7 @@ fn modifier_to_llvm( modifier } } + Amdgpu(_) => None, Hexagon(_) => None, LoongArch(_) => None, Mips(_) => None, @@ -826,6 +830,7 @@ fn dummy_output_type<'ll>(cx: &CodegenCx<'ll, '_>, reg: InlineAsmRegClass) -> &' Arm(ArmInlineAsmRegClass::qreg) | Arm(ArmInlineAsmRegClass::qreg_low8) | Arm(ArmInlineAsmRegClass::qreg_low4) => cx.type_vector(cx.type_i64(), 2), + Amdgpu(_) => cx.type_i32(), Hexagon(HexagonInlineAsmRegClass::reg) => cx.type_i32(), Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"), LoongArch(LoongArchInlineAsmRegClass::reg) => cx.type_i32(), diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 5623b984b2420..8348453fdb558 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -2074,6 +2074,7 @@ symbols! 
{ self_in_typedefs, self_struct_ctor, semiopaque, + sgpr, sha2, sha3, sha512_sm_x86, @@ -2502,6 +2503,7 @@ symbols! { verbatim, version, vfp2, + vgpr, vis, visible_private_types, volatile, diff --git a/compiler/rustc_target/src/asm/amdgpu.rs b/compiler/rustc_target/src/asm/amdgpu.rs new file mode 100644 index 0000000000000..4e58e75abe6f3 --- /dev/null +++ b/compiler/rustc_target/src/asm/amdgpu.rs @@ -0,0 +1,268 @@ +use std::fmt; + +use rustc_span::Symbol; + +use super::{InlineAsmArch, InlineAsmType, ModifierInfo}; + +def_reg_class! { + Amdgpu AmdgpuInlineAsmRegClass { + sgpr, + vgpr, + } +} + +// See https://llvm.org/docs/AMDGPUOperandSyntax.html +impl AmdgpuInlineAsmRegClass { + pub fn valid_modifiers(self, _arch: InlineAsmArch) -> &'static [char] { + &[] + } + + pub fn suggest_class(self, _arch: InlineAsmArch, _ty: InlineAsmType) -> Option { + None + } + + pub fn suggest_modifier( + self, + _arch: InlineAsmArch, + _ty: InlineAsmType, + ) -> Option { + None + } + + pub fn default_modifier(self, _arch: InlineAsmArch) -> Option { + None + } + + pub fn supported_types( + self, + _arch: InlineAsmArch, + ) -> &'static [(InlineAsmType, Option)] { + types! { _: I16, F16, I32, F32, I64, F64, I128; } + } + + /// The number of supported registers in this class. + /// The returned number is the length, so supported register + /// indices are 0 to max_num()-1. + fn max_num(self) -> u32 { + match self { + Self::sgpr => 106, + Self::vgpr => 256, + } + } + + /// Prefix when printed and register constraint in LLVM. + fn prefix(self) -> &'static str { + match self { + Self::sgpr => "s", + Self::vgpr => "v", + } + } + + /// Get register class from prefix. 
+ fn parse_prefix(prefix: char) -> Result { + match prefix { + 's' => Ok(Self::sgpr), + 'v' => Ok(Self::vgpr), + _ => Err("unknown register prefix"), + } + } +} + +#[derive( + Copy, + Clone, + rustc_macros::Encodable, + rustc_macros::Decodable, + Debug, + Eq, + PartialEq, + PartialOrd, + Hash, + rustc_macros::HashStable_Generic +)] +enum AmdgpuRegRange { + /// Low 16-bit of a register + Low(u32), + /// High 16-bit of a register + High(u32), + /// One or more 32-bit registers, in the inclusive range + Range { start: u32, end: u32 }, +} + +#[derive( + Copy, + Clone, + rustc_macros::Encodable, + rustc_macros::Decodable, + Debug, + Eq, + PartialEq, + PartialOrd, + Hash, + rustc_macros::HashStable_Generic +)] +#[allow(non_camel_case_types)] +pub struct AmdgpuInlineAsmReg { + class: AmdgpuInlineAsmRegClass, + range: AmdgpuRegRange, +} + +impl AmdgpuInlineAsmReg { + pub fn name(self) -> String { + let c = self.class.prefix(); + match self.range { + AmdgpuRegRange::Low(n) => format!("{c}{n}.l"), + AmdgpuRegRange::High(n) => format!("{c}{n}.h"), + AmdgpuRegRange::Range { start, end } if start == end => format!("{c}{start}"), + AmdgpuRegRange::Range { start, end } => format!("{c}[{start}:{end}]"), + } + } + + pub fn reg_class(self) -> AmdgpuInlineAsmRegClass { + self.class + } + + pub fn parse(name: &str) -> Result { + if name.is_empty() { + return Err("invalid empty register"); + } + let class = AmdgpuInlineAsmRegClass::parse_prefix(name.chars().next().unwrap())?; + // Form with range, e.g. 
s[2:3] + let res; + if name[1..].starts_with('[') { + if !name.ends_with(']') { + return Err("invalid register, missing closing bracket"); + } + if let Some((start, end)) = name[2..name.len() - 1].split_once(':') { + let Ok(start) = start.parse() else { + return Err("invalid register range start"); + }; + let Ok(end) = end.parse() else { + return Err("invalid register range end"); + }; + + // Check range + if start > end { + return Err("invalid reversed register range"); + } + + if end >= class.max_num() { + return Err("too large register for this class"); + } + res = Self { class, range: AmdgpuRegRange::Range { start, end } }; + } else { + return Err("invalid register range"); + } + } else { + let parse_num = |core: &str| { + let Ok(start) = core.parse() else { + return Err("invalid register number"); + }; + + if start >= class.max_num() { + return Err("too large register for this class"); + } + + Ok(start) + }; + + let name = &name[1..]; + let range = if let Some(name) = name.strip_suffix(".l") { + if class == AmdgpuInlineAsmRegClass::sgpr { + return Err("invalid 16-bit SGPR register"); + } + AmdgpuRegRange::Low(parse_num(name)?) + } else if let Some(name) = name.strip_suffix(".h") { + if class == AmdgpuInlineAsmRegClass::sgpr { + return Err("invalid 16-bit SGPR register"); + } + AmdgpuRegRange::High(parse_num(name)?) 
+ } else { + let start = parse_num(name)?; + AmdgpuRegRange::Range { start, end: start } + }; + res = Self { class, range }; + } + Ok(res) + } + + pub fn validate( + self, + _arch: super::InlineAsmArch, + _reloc_model: crate::spec::RelocModel, + _target_features: &rustc_data_structures::fx::FxIndexSet, + _target: &crate::spec::Target, + _is_clobber: bool, + ) -> Result<(), &'static str> { + Ok(()) + } +} + +pub(super) fn fill_reg_map( + _arch: super::InlineAsmArch, + _reloc_model: crate::spec::RelocModel, + _target_features: &rustc_data_structures::fx::FxIndexSet, + _target: &crate::spec::Target, + map: &mut rustc_data_structures::fx::FxHashMap< + super::InlineAsmRegClass, + rustc_data_structures::fx::FxIndexSet, + >, +) { + use super::{InlineAsmReg, InlineAsmRegClass}; + + // Add single registers of each class (no register ranges) + #[allow(rustc::potential_query_instability)] + for class in regclass_map().keys() { + let InlineAsmRegClass::Amdgpu(class) = *class else { unreachable!("Must be amdgpu class") }; + if let Some(set) = map.get_mut(&InlineAsmRegClass::Amdgpu(class)) { + for i in 0..class.max_num() { + set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { + class, + range: AmdgpuRegRange::Range { start: i, end: i }, + })); + } + } + } +} + +impl AmdgpuInlineAsmReg { + pub fn emit( + self, + out: &mut dyn fmt::Write, + _arch: InlineAsmArch, + _modifier: Option, + ) -> fmt::Result { + out.write_str(&self.name()) + } + + pub fn overlapping_regs(self, mut cb: impl FnMut(AmdgpuInlineAsmReg)) { + if self.class != AmdgpuInlineAsmRegClass::sgpr { + // Overlapping 16-bit registers (not supported for sgprs) + if let AmdgpuRegRange::Range { start, end } = self.range { + for i in start..=end { + cb(AmdgpuInlineAsmReg { class: self.class, range: AmdgpuRegRange::Low(i) }); + cb(AmdgpuInlineAsmReg { class: self.class, range: AmdgpuRegRange::High(i) }); + } + } + } + + // Overlapping 32-bit registers, up to size 32 + for size in 1..=32 { + let (start, end) = match 
self.range { + AmdgpuRegRange::Low(start) | AmdgpuRegRange::High(start) => (start, start), + AmdgpuRegRange::Range { start, end } => (start, end), + }; + + let size_range = size - 1; + for overlap_start in (start - size_range)..=end { + cb(AmdgpuInlineAsmReg { + class: self.class, + range: AmdgpuRegRange::Range { + start: overlap_start, + end: overlap_start + size_range, + }, + }); + } + } + } +} diff --git a/compiler/rustc_target/src/asm/mod.rs b/compiler/rustc_target/src/asm/mod.rs index a10699bbce884..b6e555c93a0ed 100644 --- a/compiler/rustc_target/src/asm/mod.rs +++ b/compiler/rustc_target/src/asm/mod.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::fmt; use rustc_abi::Size; @@ -177,6 +178,7 @@ macro_rules! types { } mod aarch64; +mod amdgpu; mod arm; mod avr; mod bpf; @@ -196,6 +198,7 @@ mod wasm; mod x86; pub use aarch64::{AArch64InlineAsmReg, AArch64InlineAsmRegClass}; +pub use amdgpu::{AmdgpuInlineAsmReg, AmdgpuInlineAsmRegClass}; pub use arm::{ArmInlineAsmReg, ArmInlineAsmRegClass}; pub use avr::{AvrInlineAsmReg, AvrInlineAsmRegClass}; pub use bpf::{BpfInlineAsmReg, BpfInlineAsmRegClass}; @@ -224,6 +227,7 @@ pub enum InlineAsmArch { RiscV32, RiscV64, Nvptx64, + Amdgpu, Hexagon, LoongArch32, LoongArch64, @@ -252,6 +256,7 @@ impl InlineAsmArch { Arch::Arm => Some(Self::Arm), Arch::Arm64EC => Some(Self::Arm64EC), Arch::AArch64 => Some(Self::AArch64), + Arch::AmdGpu => Some(Self::Amdgpu), Arch::RiscV32 => Some(Self::RiscV32), Arch::RiscV64 => Some(Self::RiscV64), Arch::Nvptx64 => Some(Self::Nvptx64), @@ -273,7 +278,7 @@ impl InlineAsmArch { Arch::Msp430 => Some(Self::Msp430), Arch::M68k => Some(Self::M68k), Arch::CSky => Some(Self::CSKY), - Arch::AmdGpu | Arch::Xtensa | Arch::Other(_) => None, + Arch::Xtensa | Arch::Other(_) => None, } } } @@ -283,6 +288,7 @@ impl InlineAsmArch { pub enum InlineAsmReg { X86(X86InlineAsmReg), Arm(ArmInlineAsmReg), + Amdgpu(AmdgpuInlineAsmReg), AArch64(AArch64InlineAsmReg), RiscV(RiscVInlineAsmReg), Nvptx(NvptxInlineAsmReg), 
@@ -304,24 +310,25 @@ pub enum InlineAsmReg { } impl InlineAsmReg { - pub fn name(self) -> &'static str { + pub fn name(self) -> Cow<'static, str> { match self { - Self::X86(r) => r.name(), - Self::Arm(r) => r.name(), - Self::AArch64(r) => r.name(), - Self::RiscV(r) => r.name(), - Self::PowerPC(r) => r.name(), - Self::Hexagon(r) => r.name(), - Self::LoongArch(r) => r.name(), - Self::Mips(r) => r.name(), - Self::S390x(r) => r.name(), - Self::Sparc(r) => r.name(), - Self::Bpf(r) => r.name(), - Self::Avr(r) => r.name(), - Self::Msp430(r) => r.name(), - Self::M68k(r) => r.name(), - Self::CSKY(r) => r.name(), - Self::Err => "", + Self::X86(r) => r.name().into(), + Self::Arm(r) => r.name().into(), + Self::AArch64(r) => r.name().into(), + Self::Amdgpu(r) => r.name().into(), + Self::RiscV(r) => r.name().into(), + Self::PowerPC(r) => r.name().into(), + Self::Hexagon(r) => r.name().into(), + Self::LoongArch(r) => r.name().into(), + Self::Mips(r) => r.name().into(), + Self::S390x(r) => r.name().into(), + Self::Sparc(r) => r.name().into(), + Self::Bpf(r) => r.name().into(), + Self::Avr(r) => r.name().into(), + Self::Msp430(r) => r.name().into(), + Self::M68k(r) => r.name().into(), + Self::CSKY(r) => r.name().into(), + Self::Err => "".into(), } } @@ -330,6 +337,7 @@ impl InlineAsmReg { Self::X86(r) => InlineAsmRegClass::X86(r.reg_class()), Self::Arm(r) => InlineAsmRegClass::Arm(r.reg_class()), Self::AArch64(r) => InlineAsmRegClass::AArch64(r.reg_class()), + Self::Amdgpu(r) => InlineAsmRegClass::Amdgpu(r.reg_class()), Self::RiscV(r) => InlineAsmRegClass::RiscV(r.reg_class()), Self::PowerPC(r) => InlineAsmRegClass::PowerPC(r.reg_class()), Self::Hexagon(r) => InlineAsmRegClass::Hexagon(r.reg_class()), @@ -356,6 +364,7 @@ impl InlineAsmReg { InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC => { Self::AArch64(AArch64InlineAsmReg::parse(name)?) 
} + InlineAsmArch::Amdgpu => Self::Amdgpu(AmdgpuInlineAsmReg::parse(name)?), InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => { Self::RiscV(RiscVInlineAsmReg::parse(name)?) } @@ -398,6 +407,7 @@ impl InlineAsmReg { Self::X86(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::Arm(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::AArch64(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), + Self::Amdgpu(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::RiscV(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::PowerPC(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::Hexagon(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), @@ -428,6 +438,7 @@ impl InlineAsmReg { Self::X86(r) => r.emit(out, arch, modifier), Self::Arm(r) => r.emit(out, arch, modifier), Self::AArch64(r) => r.emit(out, arch, modifier), + Self::Amdgpu(r) => r.emit(out, arch, modifier), Self::RiscV(r) => r.emit(out, arch, modifier), Self::PowerPC(r) => r.emit(out, arch, modifier), Self::Hexagon(r) => r.emit(out, arch, modifier), @@ -449,6 +460,7 @@ impl InlineAsmReg { Self::X86(r) => r.overlapping_regs(|r| cb(Self::X86(r))), Self::Arm(r) => r.overlapping_regs(|r| cb(Self::Arm(r))), Self::AArch64(_) => cb(self), + Self::Amdgpu(r) => r.overlapping_regs(|r| cb(Self::Amdgpu(r))), Self::RiscV(_) => cb(self), Self::PowerPC(r) => r.overlapping_regs(|r| cb(Self::PowerPC(r))), Self::Hexagon(r) => r.overlapping_regs(|r| cb(Self::Hexagon(r))), @@ -472,6 +484,7 @@ pub enum InlineAsmRegClass { X86(X86InlineAsmRegClass), Arm(ArmInlineAsmRegClass), AArch64(AArch64InlineAsmRegClass), + Amdgpu(AmdgpuInlineAsmRegClass), RiscV(RiscVInlineAsmRegClass), Nvptx(NvptxInlineAsmRegClass), PowerPC(PowerPCInlineAsmRegClass), @@ -497,6 +510,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.name(), Self::Arm(r) => r.name(), Self::AArch64(r) => 
r.name(), + Self::Amdgpu(r) => r.name(), Self::RiscV(r) => r.name(), Self::Nvptx(r) => r.name(), Self::PowerPC(r) => r.name(), @@ -524,6 +538,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::X86), Self::Arm(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Arm), Self::AArch64(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::AArch64), + Self::Amdgpu(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Amdgpu), Self::RiscV(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::RiscV), Self::Nvptx(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Nvptx), Self::PowerPC(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::PowerPC), @@ -554,6 +569,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.suggest_modifier(arch, ty), Self::Arm(r) => r.suggest_modifier(arch, ty), Self::AArch64(r) => r.suggest_modifier(arch, ty), + Self::Amdgpu(r) => r.suggest_modifier(arch, ty), Self::RiscV(r) => r.suggest_modifier(arch, ty), Self::Nvptx(r) => r.suggest_modifier(arch, ty), Self::PowerPC(r) => r.suggest_modifier(arch, ty), @@ -584,6 +600,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.default_modifier(arch), Self::Arm(r) => r.default_modifier(arch), Self::AArch64(r) => r.default_modifier(arch), + Self::Amdgpu(r) => r.default_modifier(arch), Self::RiscV(r) => r.default_modifier(arch), Self::Nvptx(r) => r.default_modifier(arch), Self::PowerPC(r) => r.default_modifier(arch), @@ -617,6 +634,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.supported_types(arch), Self::Arm(r) => r.supported_types(arch), Self::AArch64(r) => r.supported_types(arch), + Self::Amdgpu(r) => r.supported_types(arch), Self::RiscV(r) => r.supported_types(arch), Self::Nvptx(r) => r.supported_types(arch), Self::PowerPC(r) => r.supported_types(arch), @@ -645,6 +663,7 @@ impl InlineAsmRegClass { InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC => { Self::AArch64(AArch64InlineAsmRegClass::parse(name)?) 
} + InlineAsmArch::Amdgpu => Self::Amdgpu(AmdgpuInlineAsmRegClass::parse(name)?), InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => { Self::RiscV(RiscVInlineAsmRegClass::parse(name)?) } @@ -682,6 +701,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.valid_modifiers(arch), Self::Arm(r) => r.valid_modifiers(arch), Self::AArch64(r) => r.valid_modifiers(arch), + Self::Amdgpu(r) => r.valid_modifiers(arch), Self::RiscV(r) => r.valid_modifiers(arch), Self::Nvptx(r) => r.valid_modifiers(arch), Self::PowerPC(r) => r.valid_modifiers(arch), @@ -843,6 +863,11 @@ pub fn allocatable_registers( aarch64::fill_reg_map(arch, reloc_model, target_features, target, &mut map); map } + InlineAsmArch::Amdgpu => { + let mut map = amdgpu::regclass_map(); + amdgpu::fill_reg_map(arch, reloc_model, target_features, target, &mut map); + map + } InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => { let mut map = riscv::regclass_map(); riscv::fill_reg_map(arch, reloc_model, target_features, target, &mut map); diff --git a/tests/assembly-llvm/asm/amdgpu-types.rs b/tests/assembly-llvm/asm/amdgpu-types.rs new file mode 100644 index 0000000000000..e1651560ec755 --- /dev/null +++ b/tests/assembly-llvm/asm/amdgpu-types.rs @@ -0,0 +1,232 @@ +//@ add-minicore +//@ revisions: gfx11 gfx12 +//@ assembly-output: emit-asm +//@ compile-flags: --target amdgcn-amd-amdhsa +//@[gfx11] compile-flags: -Ctarget-cpu=gfx1100 +//@[gfx12] compile-flags: -Ctarget-cpu=gfx1200 +//@ needs-llvm-components: amdgpu +//@ needs-rust-lld + +#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, f16)] +#![crate_type = "rlib"] +#![no_core] +#![allow(asm_sub_register, non_camel_case_types, unused_assignments, unused_variables)] + +extern crate minicore; +use minicore::*; + +type ptr = *mut u8; + +macro_rules! 
check { + ($func:ident $ty:ident $class:ident $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " {}, {}"), out($class) y, in($class) x); + } + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " {}", $(", {", stringify!($arg_name), "}",)*), + out($ret_class) result, $($arg_name = in($arg_class) $arg_name,)*); + } + }; +} + +macro_rules! check_reg { + ($func:ident $ty:ident $reg:tt $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " ", $reg, ", ", $reg), lateout($reg) y, in($reg) x); + } + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " ", $ret_reg, $(", ", $arg_reg,)*), lateout($ret_reg) result, + $(in($arg_reg) $arg_name,)*); + } + }; +} + +// CHECK-LABEL: sgpr_i16: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(sgpr_i16 i32 sgpr x: i16 sgpr, y: i16 sgpr, "s_pack_ll_b32_b16"); + +// gfx11-LABEL: vgpr_i16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i16 i16 vgpr "v_mov_b16"); + +// gfx12-LABEL: sgpr_f16: +// gfx12: #ASMSTART +// gfx12: s_add_f16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(sgpr_f16 f16 sgpr x: f16 sgpr, y: f16 sgpr, "s_add_f16"); + +// gfx11-LABEL: vgpr_f16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f16 f16 vgpr "v_mov_b16"); + +// 
CHECK-LABEL: sgpr_i32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32 i32 sgpr "s_mov_b32"); + +// CHECK-LABEL: vgpr_i32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check!(vgpr_i32 i32 vgpr "v_mov_b32"); + +// CHECK-LABEL: sgpr_f32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32 f32 sgpr "s_mov_b32"); + +// CHECK-LABEL: vgpr_f32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check!(vgpr_f32 f32 vgpr "v_mov_b32"); + +// CHECK-LABEL: sgpr_i64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(sgpr_i64 i64 sgpr "s_mov_b64"); + +// CHECK-LABEL: vgpr_i64: +// CHECK: #ASMSTART +// CHECK: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i64 i64 vgpr x: i32 vgpr, y: i64 vgpr, "v_lshlrev_b64"); + +// CHECK-LABEL: sgpr_f64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(sgpr_f64 f64 sgpr "s_mov_b64"); + +// CHECK-LABEL: vgpr_f64: +// CHECK: #ASMSTART +// CHECK: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f64 f64 vgpr x: f64 vgpr, y: f64 vgpr, "v_add_f64"); + +// CHECK-LABEL: sgpr_i128: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i128 i128 sgpr x: ptr sgpr, y: i32 sgpr, "s_load_b128"); + +// CHECK-LABEL: vgpr_i128: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i128 i128 vgpr x: i32 vgpr, y: ptr sgpr, "global_load_b128"); + +// CHECK-LABEL: s0_i16: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, 
s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(s0_i16 i32 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16"); + +// gfx11-LABEL: v0_i16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i16 i16 "v0.l" "v_mov_b16"); + +// gfx12-LABEL: s0_f16: +// gfx12: #ASMSTART +// gfx12: s_add_f16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(s0_f16 f16 "s0" x: f16 "s1", y: f16 "s2", "s_add_f16"); + +// gfx11-LABEL: v0_f16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f16 f16 "v0.l" "v_mov_b16"); + +// CHECK-LABEL: s0_i32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32 i32 "s0" "s_mov_b32"); + +// CHECK-LABEL: v0_i32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(v0_i32 i32 "v0" "v_mov_b32"); + +// CHECK-LABEL: s0_f32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32 f32 "s0" "s_mov_b32"); + +// CHECK-LABEL: v0_f32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(v0_f32 f32 "v0" "v_mov_b32"); + +// CHECK-LABEL: s0_i64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(s0_i64 i64 "s[0:1]" "s_mov_b64"); + +// CHECK-LABEL: v0_i64: +// CHECK: #ASMSTART +// CHECK: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i64 i64 "v[0:1]" x: i32 "v2", y: i64 "v[0:1]", "v_lshlrev_b64"); + +// CHECK-LABEL: s0_f64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(s0_f64 f64 "s[0:1]" "s_mov_b64"); + +// CHECK-LABEL: v0_f64: +// CHECK: #ASMSTART +// CHECK: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f64 f64 "v[0:1]" x: f64 "v[0:1]", y: f64 "v[2:3]", "v_add_f64"); + +// CHECK-LABEL: s0_i128: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i128 i128 "s[0:3]" x: ptr "s[0:1]", y: i32 "s0", "s_load_b128"); + +// CHECK-LABEL: v0_i128: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i128 i128 "v[0:3]" x: i32 "v0", y: ptr "s[0:1]", "global_load_b128"); From 631a77ef37684db2d976645d23850a5ea4dcd92d Mon Sep 17 00:00:00 2001 From: Flakebi Date: Mon, 23 Feb 2026 10:38:14 +0100 Subject: [PATCH 2/3] Split amdgpu inline asm reg classes by size --- compiler/rustc_codegen_gcc/src/asm.rs | 9 +- compiler/rustc_codegen_llvm/src/asm.rs | 9 +- compiler/rustc_span/src/symbol.rs | 9 +- compiler/rustc_target/src/asm/amdgpu.rs | 182 +++++++++++++++++++----- tests/assembly-llvm/asm/amdgpu-types.rs | 28 ++-- 5 files changed, 184 insertions(+), 53 deletions(-) diff --git a/compiler/rustc_codegen_gcc/src/asm.rs b/compiler/rustc_codegen_gcc/src/asm.rs index 389c0453db401..9eb714225f8b5 100644 --- a/compiler/rustc_codegen_gcc/src/asm.rs +++ b/compiler/rustc_codegen_gcc/src/asm.rs @@ -670,8 +670,13 @@ fn reg_class_to_gcc(reg_class: InlineAsmRegClass) -> &'static str { InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => { unreachable!("clobber-only") } - InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr) => "v", - InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr) => "Sg", + InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr32) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr64) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) => "Sg", + InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr16) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr32) + | 
InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr64) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) => "v", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => "r", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low16) diff --git a/compiler/rustc_codegen_llvm/src/asm.rs b/compiler/rustc_codegen_llvm/src/asm.rs index 1acc588622e03..28df566fa53f9 100644 --- a/compiler/rustc_codegen_llvm/src/asm.rs +++ b/compiler/rustc_codegen_llvm/src/asm.rs @@ -646,8 +646,13 @@ fn reg_to_llvm(reg: InlineAsmRegOrRegClass, layout: Option<&TyAndLayout<'_>>) -> | Arm(ArmInlineAsmRegClass::dreg_low8) | Arm(ArmInlineAsmRegClass::qreg_low4) => "x", Arm(ArmInlineAsmRegClass::dreg) | Arm(ArmInlineAsmRegClass::qreg) => "w", - Amdgpu(AmdgpuInlineAsmRegClass::sgpr) => "s", - Amdgpu(AmdgpuInlineAsmRegClass::vgpr) => "v", + Amdgpu(AmdgpuInlineAsmRegClass::sgpr32) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr64) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) => "s", + Amdgpu(AmdgpuInlineAsmRegClass::vgpr16) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr32) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr64) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) => "v", Hexagon(HexagonInlineAsmRegClass::reg) => "r", Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"), LoongArch(LoongArchInlineAsmRegClass::reg) => "r", diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 8348453fdb558..7e6f60f6ecb46 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -2074,7 +2074,9 @@ symbols! { self_in_typedefs, self_struct_ctor, semiopaque, - sgpr, + sgpr32, + sgpr64, + sgpr128, sha2, sha3, sha512_sm_x86, @@ -2503,7 +2505,10 @@ symbols! 
{ verbatim, version, vfp2, - vgpr, + vgpr16, + vgpr32, + vgpr64, + vgpr128, vis, visible_private_types, volatile, diff --git a/compiler/rustc_target/src/asm/amdgpu.rs b/compiler/rustc_target/src/asm/amdgpu.rs index 4e58e75abe6f3..b64b9ab83f957 100644 --- a/compiler/rustc_target/src/asm/amdgpu.rs +++ b/compiler/rustc_target/src/asm/amdgpu.rs @@ -4,21 +4,95 @@ use rustc_span::Symbol; use super::{InlineAsmArch, InlineAsmType, ModifierInfo}; +// Types are listed as SGPR_*/VGPR_* in llvm/lib/Target/AMDGPU/SIRegisterInfo.td def_reg_class! { Amdgpu AmdgpuInlineAsmRegClass { - sgpr, - vgpr, + sgpr32, + sgpr64, + sgpr128, + vgpr16, + vgpr32, + vgpr64, + vgpr128, } } +#[derive( + Copy, + Clone, + rustc_macros::Encodable, + rustc_macros::Decodable, + Debug, + Eq, + PartialEq, + PartialOrd, + Hash, + rustc_macros::HashStable_Generic +)] +pub enum AmdgpuInlineAsmRegClassType { + Sgpr, + Vgpr, +} + // See https://llvm.org/docs/AMDGPUOperandSyntax.html impl AmdgpuInlineAsmRegClass { + pub fn get_type(self) -> AmdgpuInlineAsmRegClassType { + match self { + Self::sgpr32 + | Self::sgpr64 + | Self::sgpr128 => AmdgpuInlineAsmRegClassType::Sgpr, + Self::vgpr16 + | Self::vgpr32 + | Self::vgpr64 + | Self::vgpr128 => AmdgpuInlineAsmRegClassType::Vgpr, + } + } + + /// Return size of the register class in bytes + pub fn bytes(self) -> u32 { + match self { + Self::vgpr16 => 16 / 8, + Self::sgpr32 | Self::vgpr32 => 32 / 8, + Self::sgpr64 | Self::vgpr64 => 64 / 8, + Self::sgpr128 | Self::vgpr128 => 128 / 8, + } + } + + fn from_type(ty: AmdgpuInlineAsmRegClassType, bytes: u32) -> Option { + let class = match ty { + AmdgpuInlineAsmRegClassType::Sgpr => match bytes * 8 { + 32 => Self::sgpr32, + 64 => Self::sgpr64, + 128 => Self::sgpr128, + _ => return None, + }, + AmdgpuInlineAsmRegClassType::Vgpr => match bytes * 8 { + 16 => Self::vgpr16, + 32 => Self::vgpr32, + 64 => Self::vgpr64, + 128 => Self::vgpr128, + _ => return None, + }, + }; + Some(class) + } + pub fn valid_modifiers(self, _arch: 
InlineAsmArch) -> &'static [char] { &[] } - pub fn suggest_class(self, _arch: InlineAsmArch, _ty: InlineAsmType) -> Option { - None + pub fn suggest_class(self, _arch: InlineAsmArch, ty: InlineAsmType) -> Option { + // Suggest VGPR for everything as VGPRs have more uses + Some(match ty { + InlineAsmType::I16 => Self::vgpr16, + InlineAsmType::I32 => Self::vgpr32, + InlineAsmType::I64 => Self::vgpr64, + InlineAsmType::I128 => Self::vgpr128, + InlineAsmType::F16 => Self::vgpr16, + InlineAsmType::F32 => Self::vgpr32, + InlineAsmType::F64 => Self::vgpr64, + _ => return None, + }) } pub fn suggest_modifier( @@ -37,37 +111,48 @@ impl AmdgpuInlineAsmRegClass { self, _arch: InlineAsmArch, ) -> &'static [(InlineAsmType, Option)] { - types! { _: I16, F16, I32, F32, I64, F64, I128; } + match self { + Self::vgpr16 => types! { _: I16, F16; }, + Self::sgpr32 | Self::vgpr32 => types! { _: I16, I32, F16, F32; }, + Self::sgpr64 | Self::vgpr64 => types! { _: I64, F64; }, + Self::sgpr128 | Self::vgpr128 => types! { _: I128; }, + } } /// The number of supported registers in this class. /// The returned number is the length, so supported register /// indices are 0 to max_num()-1. fn max_num(self) -> u32 { - match self { - Self::sgpr => 106, - Self::vgpr => 256, + if self == AmdgpuInlineAsmRegClass::vgpr16 { + return 512; } - } - - /// Prefix when printed and register constraint in LLVM. - fn prefix(self) -> &'static str { - match self { - Self::sgpr => "s", - Self::vgpr => "v", + let size = self.bytes(); + match self.get_type() { + AmdgpuInlineAsmRegClassType::Sgpr => 106 - (size / 4 - 1), + AmdgpuInlineAsmRegClassType::Vgpr => 256 - (size / 4 - 1), } } /// Get register class from prefix. 
- fn parse_prefix(prefix: char) -> Result { + fn parse_prefix(prefix: char) -> Result { match prefix { - 's' => Ok(Self::sgpr), - 'v' => Ok(Self::vgpr), + 's' => Ok(AmdgpuInlineAsmRegClassType::Sgpr), + 'v' => Ok(AmdgpuInlineAsmRegClassType::Vgpr), _ => Err("unknown register prefix"), } } } +impl AmdgpuInlineAsmRegClassType { + /// Prefix when printed and register constraint in LLVM. + fn prefix(self) -> &'static str { + match self { + AmdgpuInlineAsmRegClassType::Sgpr => "s", + AmdgpuInlineAsmRegClassType::Vgpr => "v", + } + } +} + #[derive( Copy, Clone, @@ -103,7 +188,7 @@ enum AmdgpuRegRange { )] #[allow(non_camel_case_types)] pub struct AmdgpuInlineAsmReg { - class: AmdgpuInlineAsmRegClass, + class: AmdgpuInlineAsmRegClassType, range: AmdgpuRegRange, } @@ -118,8 +203,17 @@ impl AmdgpuInlineAsmReg { } } + /// Size of the register in bytes + fn bytes(self) -> u32 { + match self.range { + AmdgpuRegRange::Low(_) | AmdgpuRegRange::High(_) => 2, + AmdgpuRegRange::Range { start, end } => ((end - start) + 1) * 4, + } + } + pub fn reg_class(self) -> AmdgpuInlineAsmRegClass { - self.class + AmdgpuInlineAsmRegClass::from_type(self.class, self.bytes()) + .expect("Failed to emit invalid amdgpu register class") } pub fn parse(name: &str) -> Result { @@ -146,8 +240,14 @@ impl AmdgpuInlineAsmReg { return Err("invalid reversed register range"); } - if end >= class.max_num() { - return Err("too large register for this class"); + if let Some(class) = + AmdgpuInlineAsmRegClass::from_type(class, ((end - start) + 1) * 4) + { + if end >= class.max_num() { + return Err("too large register for this class"); + } + } else { + return Err("invalid register size for this class"); } res = Self { class, range: AmdgpuRegRange::Range { start, end } }; } else { @@ -159,8 +259,12 @@ impl AmdgpuInlineAsmReg { return Err("invalid register number"); }; - if start >= class.max_num() { - return Err("too large register for this class"); + if let Some(class) = AmdgpuInlineAsmRegClass::from_type(class, 
4) { + if start >= class.max_num() { + return Err("too large register for this class"); + } + } else { + return Err("invalid register size for this class"); } Ok(start) @@ -168,12 +272,12 @@ impl AmdgpuInlineAsmReg { let name = &name[1..]; let range = if let Some(name) = name.strip_suffix(".l") { - if class == AmdgpuInlineAsmRegClass::sgpr { + if class == AmdgpuInlineAsmRegClassType::Sgpr { return Err("invalid 16-bit SGPR register"); } AmdgpuRegRange::Low(parse_num(name)?) } else if let Some(name) = name.strip_suffix(".h") { - if class == AmdgpuInlineAsmRegClass::sgpr { + if class == AmdgpuInlineAsmRegClassType::Sgpr { return Err("invalid 16-bit SGPR register"); } AmdgpuRegRange::High(parse_num(name)?) @@ -210,16 +314,28 @@ pub(super) fn fill_reg_map( ) { use super::{InlineAsmReg, InlineAsmRegClass}; - // Add single registers of each class (no register ranges) #[allow(rustc::potential_query_instability)] for class in regclass_map().keys() { let InlineAsmRegClass::Amdgpu(class) = *class else { unreachable!("Must be amdgpu class") }; if let Some(set) = map.get_mut(&InlineAsmRegClass::Amdgpu(class)) { - for i in 0..class.max_num() { - set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { - class, - range: AmdgpuRegRange::Range { start: i, end: i }, - })); + if class == AmdgpuInlineAsmRegClass::vgpr16 { + for i in 0..(class.max_num() / 2) { + set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { + class: AmdgpuInlineAsmRegClassType::Vgpr, + range: AmdgpuRegRange::Low(i), + })); + set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { + class: AmdgpuInlineAsmRegClassType::Vgpr, + range: AmdgpuRegRange::High(i), + })); + } + } else { + for i in 0..class.max_num() { + set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { + class: class.get_type(), + range: AmdgpuRegRange::Range { start: i, end: i + class.bytes() / 4 }, + })); + } } } } @@ -236,7 +352,7 @@ impl AmdgpuInlineAsmReg { } pub fn overlapping_regs(self, mut cb: impl FnMut(AmdgpuInlineAsmReg)) { - if self.class != 
AmdgpuInlineAsmRegClass::sgpr { + if self.class != AmdgpuInlineAsmRegClassType::Sgpr { // Overlapping 16-bit registers (not supported for sgprs) if let AmdgpuRegRange::Range { start, end } = self.range { for i in start..=end { diff --git a/tests/assembly-llvm/asm/amdgpu-types.rs b/tests/assembly-llvm/asm/amdgpu-types.rs index e1651560ec755..fe8ae88ee83ef 100644 --- a/tests/assembly-llvm/asm/amdgpu-types.rs +++ b/tests/assembly-llvm/asm/amdgpu-types.rs @@ -61,88 +61,88 @@ macro_rules! check_reg { // CHECK: #ASMSTART // CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} // CHECK: #ASMEND -check!(sgpr_i16 i32 sgpr x: i16 sgpr, y: i16 sgpr, "s_pack_ll_b32_b16"); +check!(sgpr_i16 i32 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16"); // gfx11-LABEL: vgpr_i16: // gfx11: #ASMSTART // gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} // gfx11: #ASMEND #[cfg(gfx11)] -check!(vgpr_i16 i16 vgpr "v_mov_b16"); +check!(vgpr_i16 i16 vgpr32 "v_mov_b16"); // gfx12-LABEL: sgpr_f16: // gfx12: #ASMSTART // gfx12: s_add_f16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} // gfx12: #ASMEND #[cfg(gfx12)] -check!(sgpr_f16 f16 sgpr x: f16 sgpr, y: f16 sgpr, "s_add_f16"); +check!(sgpr_f16 f16 sgpr32 x: f16 sgpr32, y: f16 sgpr32, "s_add_f16"); // gfx11-LABEL: vgpr_f16: // gfx11: #ASMSTART // gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} // gfx11: #ASMEND #[cfg(gfx11)] -check!(vgpr_f16 f16 vgpr "v_mov_b16"); +check!(vgpr_f16 f16 vgpr32 "v_mov_b16"); // CHECK-LABEL: sgpr_i32: // CHECK: #ASMSTART // CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} // CHECK: #ASMEND -check!(sgpr_i32 i32 sgpr "s_mov_b32"); +check!(sgpr_i32 i32 sgpr32 "s_mov_b32"); // CHECK-LABEL: vgpr_i32: // CHECK: #ASMSTART // CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} // CHECK: #ASMEND -check!(vgpr_i32 i32 vgpr "v_mov_b32"); +check!(vgpr_i32 i32 vgpr32 "v_mov_b32"); // CHECK-LABEL: sgpr_f32: // CHECK: #ASMSTART // CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} // CHECK: #ASMEND -check!(sgpr_f32 f32 sgpr 
"s_mov_b32"); +check!(sgpr_f32 f32 sgpr32 "s_mov_b32"); // CHECK-LABEL: vgpr_f32: // CHECK: #ASMSTART // CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} // CHECK: #ASMEND -check!(vgpr_f32 f32 vgpr "v_mov_b32"); +check!(vgpr_f32 f32 vgpr32 "v_mov_b32"); // CHECK-LABEL: sgpr_i64: // CHECK: #ASMSTART // CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} // CHECK: #ASMEND -check!(sgpr_i64 i64 sgpr "s_mov_b64"); +check!(sgpr_i64 i64 sgpr64 "s_mov_b64"); // CHECK-LABEL: vgpr_i64: // CHECK: #ASMSTART // CHECK: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} // CHECK: #ASMEND -check!(vgpr_i64 i64 vgpr x: i32 vgpr, y: i64 vgpr, "v_lshlrev_b64"); +check!(vgpr_i64 i64 vgpr64 x: i32 vgpr32, y: i64 vgpr64, "v_lshlrev_b64"); // CHECK-LABEL: sgpr_f64: // CHECK: #ASMSTART // CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} // CHECK: #ASMEND -check!(sgpr_f64 f64 sgpr "s_mov_b64"); +check!(sgpr_f64 f64 sgpr64 "s_mov_b64"); // CHECK-LABEL: vgpr_f64: // CHECK: #ASMSTART // CHECK: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} // CHECK: #ASMEND -check!(vgpr_f64 f64 vgpr x: f64 vgpr, y: f64 vgpr, "v_add_f64"); +check!(vgpr_f64 f64 vgpr64 x: f64 vgpr64, y: f64 vgpr64, "v_add_f64"); // CHECK-LABEL: sgpr_i128: // CHECK: #ASMSTART // CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} // CHECK: #ASMEND -check!(sgpr_i128 i128 sgpr x: ptr sgpr, y: i32 sgpr, "s_load_b128"); +check!(sgpr_i128 i128 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); // CHECK-LABEL: vgpr_i128: // CHECK: #ASMSTART // CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} // CHECK: #ASMEND -check!(vgpr_i128 i128 vgpr x: i32 vgpr, y: ptr sgpr, "global_load_b128"); +check!(vgpr_i128 i128 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); // CHECK-LABEL: s0_i16: // CHECK: #ASMSTART From 246bc6db8ce666646b3c019f927cd0cae60c891a Mon Sep 17 00:00:00 2001 From: Flakebi Date: 
Mon, 23 Feb 2026 11:12:58 +0100 Subject: [PATCH 3/3] Add vector types to amdgpu inline asm --- compiler/rustc_codegen_gcc/src/asm.rs | 18 +- compiler/rustc_codegen_llvm/src/asm.rs | 18 +- compiler/rustc_span/src/symbol.rs | 14 + compiler/rustc_target/src/asm/amdgpu.rs | 114 ++- tests/assembly-llvm/asm/amdgpu-vec-types.rs | 845 +++++++++++++++++++ tests/assembly-llvm/asm/amdgpu-vec-types2.rs | 283 +++++++ 6 files changed, 1282 insertions(+), 10 deletions(-) create mode 100644 tests/assembly-llvm/asm/amdgpu-vec-types.rs create mode 100644 tests/assembly-llvm/asm/amdgpu-vec-types2.rs diff --git a/compiler/rustc_codegen_gcc/src/asm.rs b/compiler/rustc_codegen_gcc/src/asm.rs index 9eb714225f8b5..5a2c297f396cd 100644 --- a/compiler/rustc_codegen_gcc/src/asm.rs +++ b/compiler/rustc_codegen_gcc/src/asm.rs @@ -672,11 +672,25 @@ fn reg_class_to_gcc(reg_class: InlineAsmRegClass) -> &'static str { } InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr32) | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr64) - | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) => "Sg", + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr96) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr256) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr512) => "Sg", InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr16) | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr32) | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr64) - | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) => "v", + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr96) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr160) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr192) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr224) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr256) + | 
InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr288) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr320) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr352) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr384) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr512) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr1024) => "v", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => "r", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low16) diff --git a/compiler/rustc_codegen_llvm/src/asm.rs b/compiler/rustc_codegen_llvm/src/asm.rs index 28df566fa53f9..37e3c12ca31ef 100644 --- a/compiler/rustc_codegen_llvm/src/asm.rs +++ b/compiler/rustc_codegen_llvm/src/asm.rs @@ -648,11 +648,25 @@ fn reg_to_llvm(reg: InlineAsmRegOrRegClass, layout: Option<&TyAndLayout<'_>>) -> Arm(ArmInlineAsmRegClass::dreg) | Arm(ArmInlineAsmRegClass::qreg) => "w", Amdgpu(AmdgpuInlineAsmRegClass::sgpr32) | Amdgpu(AmdgpuInlineAsmRegClass::sgpr64) - | Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) => "s", + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr96) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr256) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr512) => "s", Amdgpu(AmdgpuInlineAsmRegClass::vgpr16) | Amdgpu(AmdgpuInlineAsmRegClass::vgpr32) | Amdgpu(AmdgpuInlineAsmRegClass::vgpr64) - | Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) => "v", + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr96) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr160) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr192) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr224) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr256) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr288) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr320) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr352) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr384) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr512) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr1024) 
=> "v", Hexagon(HexagonInlineAsmRegClass::reg) => "r", Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"), LoongArch(LoongArchInlineAsmRegClass::reg) => "r", diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 7e6f60f6ecb46..ca33672b9ca3f 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -2076,7 +2076,10 @@ symbols! { semiopaque, sgpr32, sgpr64, + sgpr96, sgpr128, + sgpr256, + sgpr512, sha2, sha3, sha512_sm_x86, @@ -2508,7 +2511,18 @@ symbols! { vgpr16, vgpr32, vgpr64, + vgpr96, vgpr128, + vgpr160, + vgpr192, + vgpr224, + vgpr256, + vgpr288, + vgpr320, + vgpr352, + vgpr384, + vgpr512, + vgpr1024, vis, visible_private_types, volatile, diff --git a/compiler/rustc_target/src/asm/amdgpu.rs b/compiler/rustc_target/src/asm/amdgpu.rs index b64b9ab83f957..7e7ab17fe018e 100644 --- a/compiler/rustc_target/src/asm/amdgpu.rs +++ b/compiler/rustc_target/src/asm/amdgpu.rs @@ -9,11 +9,25 @@ def_reg_class! 
{ Amdgpu AmdgpuInlineAsmRegClass { sgpr32, sgpr64, + sgpr96, sgpr128, + sgpr256, + sgpr512, vgpr16, vgpr32, vgpr64, + vgpr96, vgpr128, + vgpr160, + vgpr192, + vgpr224, + vgpr256, + vgpr288, + vgpr320, + vgpr352, + vgpr384, + vgpr512, + vgpr1024, } } @@ -40,11 +54,25 @@ impl AmdgpuInlineAsmRegClass { match self { Self::sgpr32 | Self::sgpr64 - | Self::sgpr128 => AmdgpuInlineAsmRegClassType::Sgpr, + | Self::sgpr96 + | Self::sgpr128 + | Self::sgpr256 + | Self::sgpr512 => AmdgpuInlineAsmRegClassType::Sgpr, Self::vgpr16 | Self::vgpr32 | Self::vgpr64 - | Self::vgpr128 => AmdgpuInlineAsmRegClassType::Vgpr, + | Self::vgpr96 + | Self::vgpr128 + | Self::vgpr160 + | Self::vgpr192 + | Self::vgpr224 + | Self::vgpr256 + | Self::vgpr288 + | Self::vgpr320 + | Self::vgpr352 + | Self::vgpr384 + | Self::vgpr512 + | Self::vgpr1024 => AmdgpuInlineAsmRegClassType::Vgpr, } } @@ -54,7 +82,18 @@ impl AmdgpuInlineAsmRegClass { Self::vgpr16 => 16 / 8, Self::sgpr32 | Self::vgpr32 => 32 / 8, Self::sgpr64 | Self::vgpr64 => 64 / 8, + Self::sgpr96 | Self::vgpr96 => 96 / 8, Self::sgpr128 | Self::vgpr128 => 128 / 8, + Self::vgpr160 => 160 / 8, + Self::vgpr192 => 192 / 8, + Self::vgpr224 => 224 / 8, + Self::sgpr256 | Self::vgpr256 => 256 / 8, + Self::vgpr288 => 288 / 8, + Self::vgpr320 => 320 / 8, + Self::vgpr352 => 352 / 8, + Self::vgpr384 => 384 / 8, + Self::sgpr512 | Self::vgpr512 => 512 / 8, + Self::vgpr1024 => 1024 / 8, } } @@ -63,14 +102,28 @@ impl AmdgpuInlineAsmRegClass { AmdgpuInlineAsmRegClassType::Sgpr => match bytes * 8 { 32 => Self::sgpr32, 64 => Self::sgpr64, + 96 => Self::sgpr96, 128 => Self::sgpr128, + 256 => Self::sgpr256, + 512 => Self::sgpr512, _ => return None, }, AmdgpuInlineAsmRegClassType::Vgpr => match bytes * 8 { 16 => Self::vgpr16, 32 => Self::vgpr32, 64 => Self::vgpr64, + 96 => Self::vgpr96, 128 => Self::vgpr128, + 160 => Self::vgpr160, + 192 => Self::vgpr192, + 224 => Self::vgpr224, + 256 => Self::vgpr256, + 288 => Self::vgpr288, + 320 => Self::vgpr320, + 352 => 
Self::vgpr352, + 384 => Self::vgpr384, + 512 => Self::vgpr512, + 1024 => Self::vgpr1024, _ => return None, }, }; @@ -91,7 +144,19 @@ impl AmdgpuInlineAsmRegClass { InlineAsmType::F16 => Self::vgpr16, InlineAsmType::F32 => Self::vgpr32, InlineAsmType::F64 => Self::vgpr64, - _ => return None, + _ => { + let bytes = match ty { + InlineAsmType::VecI16(n) => n * (16 / 8), + InlineAsmType::VecI32(n) => n * (32 / 8), + InlineAsmType::VecI64(n) => n * (64 / 8), + InlineAsmType::VecI128(n) => n * (128 / 8), + InlineAsmType::VecF16(n) => n * (16 / 8), + InlineAsmType::VecF32(n) => n * (32 / 8), + InlineAsmType::VecF64(n) => n * (64 / 8), + _ => return None, + }; + return Self::from_type(AmdgpuInlineAsmRegClassType::Vgpr, bytes as u32); + } }) } @@ -113,9 +178,46 @@ impl AmdgpuInlineAsmRegClass { ) -> &'static [(InlineAsmType, Option)] { match self { Self::vgpr16 => types! { _: I16, F16; }, - Self::sgpr32 | Self::vgpr32 => types! { _: I16, I32, F16, F32; }, - Self::sgpr64 | Self::vgpr64 => types! { _: I64, F64; }, - Self::sgpr128 | Self::vgpr128 => types! { _: I128; }, + Self::sgpr32 | Self::vgpr32 => types! { _: I16, I32, F16, F32, + VecI16(32 / 16), + VecF16(32 / 16); + }, + Self::sgpr64 | Self::vgpr64 => types! { + _: I64, F64, VecI16(64 / 16), VecI32(64 / 32), + VecF16(64 / 16), VecF32(64 / 32); + }, + Self::sgpr96 | Self::vgpr96 => types! { _: VecI32(96 / 32), VecF32(96 / 32); }, + Self::sgpr128 | Self::vgpr128 => types! { _: I128, + VecI16(128 / 16), VecI32(128 / 32), VecI64(128 / 64), + VecF16(128 / 16), VecF32(128 / 32), VecF64(128 / 64); + }, + Self::vgpr160 => types! { _: VecI32(160 / 32), VecF32(160 / 32); }, + Self::vgpr192 => types! { _: + VecI32(192 / 32), VecI64(192 / 64), + VecF32(192 / 32), VecF64(192 / 64); + }, + Self::vgpr224 => types! { _: VecI32(224 / 32), VecF32(224 / 32); }, + Self::sgpr256 => types! { _: + VecI16(256 / 16), VecI32(256 / 32), VecI64(256 / 64), + VecF16(256 / 16), VecF32(256 / 32), VecF64(256 / 64); + }, + Self::vgpr256 => types! 
{ _: + VecI16(256 / 16), VecI32(256 / 32), + VecF16(256 / 16), VecF32(256 / 32), VecF64(256 / 64); + }, + Self::vgpr288 => types! { _: VecI32(288 / 32), VecF32(288 / 32); }, + Self::vgpr320 => types! { _: VecI32(320 / 32), VecF32(320 / 32); }, + Self::vgpr352 => types! { _: VecI32(352 / 32), VecF32(352 / 32); }, + Self::vgpr384 => types! { _: VecI32(384 / 32), VecF32(384 / 32); }, + Self::sgpr512 => types! { _: + VecI16(512 / 16), VecI32(512 / 32), VecI64(512 / 64), + VecF16(512 / 16), VecF32(512 / 32), VecF64(512 / 64); + }, + Self::vgpr512 => types! { _: + VecI16(512 / 16), VecI32(512 / 32), + VecF16(512 / 16), VecF32(512 / 32); + }, + Self::vgpr1024 => types! { _: VecF32(1024 / 32); }, } } diff --git a/tests/assembly-llvm/asm/amdgpu-vec-types.rs b/tests/assembly-llvm/asm/amdgpu-vec-types.rs new file mode 100644 index 0000000000000..68e4a41da58ff --- /dev/null +++ b/tests/assembly-llvm/asm/amdgpu-vec-types.rs @@ -0,0 +1,845 @@ +//@ add-minicore +//@ revisions: gfx11 gfx12 +//@ assembly-output: emit-asm +//@ compile-flags: --target amdgcn-amd-amdhsa +//@[gfx11] compile-flags: -Ctarget-cpu=gfx1100 +//@[gfx12] compile-flags: -Ctarget-cpu=gfx1200 +//@ needs-llvm-components: amdgpu +//@ needs-rust-lld +// ignore-tidy-linelength + +#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, repr_simd, f16)] +#![crate_type = "rlib"] +#![no_core] +#![allow( + asm_sub_register, + improper_gpu_kernel_arg, + improper_ctypes_definitions, + non_camel_case_types, + unused_assignments, + unused_variables +)] + +extern crate minicore; +use minicore::*; + +type ptr = *mut u8; + +#[repr(simd)] +pub struct i16x2([i16; 2]); +#[repr(simd)] +pub struct f16x2([f16; 2]); + +#[repr(simd)] +pub struct i16x4([i16; 4]); +#[repr(simd)] +pub struct f16x4([f16; 4]); +#[repr(simd)] +pub struct i32x2([i32; 2]); +#[repr(simd)] +pub struct f32x2([f32; 2]); + +#[repr(simd)] +pub struct i32x3([i32; 3]); +#[repr(simd)] +pub struct f32x3([f32; 3]); + +#[repr(simd)] +pub struct i16x8([i16; 8]); 
+#[repr(simd)] +pub struct f16x8([f16; 8]); +#[repr(simd)] +pub struct i32x4([i32; 4]); +#[repr(simd)] +pub struct f32x4([f32; 4]); +#[repr(simd)] +pub struct i64x2([i64; 2]); +#[repr(simd)] +pub struct f64x2([f64; 2]); + +#[repr(simd)] +pub struct i32x5([i32; 5]); +#[repr(simd)] +pub struct f32x5([f32; 5]); + +#[repr(simd)] +pub struct i32x6([i32; 6]); +#[repr(simd)] +pub struct f32x6([f32; 6]); +#[repr(simd)] +pub struct i64x3([i64; 3]); +#[repr(simd)] +pub struct f64x3([f64; 3]); + +#[repr(simd)] +pub struct i32x7([i32; 7]); +#[repr(simd)] +pub struct f32x7([f32; 7]); + +#[repr(simd)] +pub struct i16x16([i16; 16]); +#[repr(simd)] +pub struct f16x16([f16; 16]); +#[repr(simd)] +pub struct i32x8([i32; 8]); +#[repr(simd)] +pub struct f32x8([f32; 8]); +#[repr(simd)] +pub struct i64x4([i64; 4]); +#[repr(simd)] +pub struct f64x4([f64; 4]); + +#[repr(simd)] +pub struct i32x10([i32; 10]); +#[repr(simd)] +pub struct f32x10([f32; 10]); + +#[repr(simd)] +pub struct i16x32([i16; 32]); +#[repr(simd)] +pub struct f16x32([f16; 32]); +#[repr(simd)] +pub struct i32x16([i32; 16]); +#[repr(simd)] +pub struct f32x16([f32; 16]); +#[repr(simd)] +pub struct i64x8([i64; 8]); +#[repr(simd)] +pub struct f64x8([f64; 8]); + +macro_rules! impl_copy { + ($($ty:ident)*) => { + $( + impl Copy for $ty {} + )* + }; +} + +impl_copy!( + i16x2 f16x2 i16x4 f16x4 i32x2 f32x2 i32x3 f32x3 i16x8 f16x8 i32x4 f32x4 + i64x2 f64x2 i32x5 f32x5 i32x6 f32x6 i64x3 f64x3 i32x7 f32x7 i16x16 f16x16 + i32x8 f32x8 i64x4 f64x4 i32x10 f32x10 i16x32 f16x32 i32x16 f32x16 i64x8 + f64x8 +); + +macro_rules! 
check { + ($func:ident $ty:ident $class:ident $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " {}, {}"), out($class) y, in($class) x); + } + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal) => { + check!($func $ret_ty $ret_class $($arg_name: $arg_ty $arg_class,)* $mov, ""); + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal, $tail:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " {}", $(", {", stringify!($arg_name), "}",)* $tail), + out($ret_class) result, $($arg_name = in($arg_class) $arg_name,)*); + } + }; +} + +macro_rules! check_reg { + ($func:ident $ty:ident $reg:tt $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " ", $reg, ", ", $reg), lateout($reg) y, in($reg) x); + } + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal) => { + check_reg!($func $ret_ty $ret_reg $($arg_name: $arg_ty $arg_reg,)* $mov, ""); + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal, $tail:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " ", $ret_reg, $(", ", $arg_reg,)* $tail), lateout($ret_reg) result, + $(in($arg_reg) $arg_name,)*); + } + }; +} + +// CHECK-LABEL: sgpr_i16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(sgpr_i16x2 i16x2 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16"); + +// CHECK-LABEL: sgpr_f16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, 
s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(sgpr_f16x2 f16x2 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16"); + +// CHECK-LABEL: vgpr_i16x2: +// CHECK: #ASMSTART +// CHECK: v_pk_add_i16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(vgpr_i16x2 i16x2 vgpr32 x: i16x2 vgpr32, y: i16x2 vgpr32, "v_pk_add_i16"); + +// CHECK-LABEL: vgpr_f16x2: +// CHECK: #ASMSTART +// CHECK: v_pk_add_f16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(vgpr_f16x2 f16x2 vgpr32 x: f16x2 vgpr32, y: f16x2 vgpr32, "v_pk_add_f16"); + +// CHECK-LABEL: sgpr_i16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i16x4 i16x4 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: sgpr_f16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x4 f16x4 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: sgpr_i32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x2 i32x2 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: sgpr_f32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x2 f32x2 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: vgpr_i16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i16x4 i16x4 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// CHECK-LABEL: vgpr_f16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f16x4 f16x4 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// CHECK-LABEL: 
vgpr_i32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i32x2 i32x2 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// CHECK-LABEL: vgpr_f32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f32x2 f32x2 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// gfx12-LABEL: sgpr_i32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(sgpr_i32x3 i32x3 sgpr96 x: ptr sgpr64, y: i32 sgpr32, "s_load_b96"); + +// gfx12-LABEL: sgpr_f32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(sgpr_f32x3 f32x3 sgpr96 x: ptr sgpr64, y: i32 sgpr32, "s_load_b96"); + +// CHECK-LABEL: vgpr_i32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i32x3 i32x3 vgpr96 x: i32 vgpr32, y: ptr sgpr64, "global_load_b96"); + +// CHECK-LABEL: vgpr_f32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f32x3 f32x3 vgpr96 x: i32 vgpr32, y: ptr sgpr64, "global_load_b96"); + +// CHECK-LABEL: sgpr_i16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i16x8 i16x8 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_f16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x8 f16x8 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_i32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x4 i32x4 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_f32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x4 f32x4 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_i64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i64x2 i64x2 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_f64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f64x2 f64x2 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: vgpr_i16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i16x8 i16x8 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_f16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f16x8 f16x8 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_i32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i32x4 i32x4 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_f32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f32x4 f32x4 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_i64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i64x2 i64x2 vgpr128 x: 
i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_f64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f64x2 f64x2 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_i32x5: +// CHECK: #ASMSTART +// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check!(vgpr_i32x5 i32x5 vgpr160 x: i32 vgpr32, y: i32x8 sgpr256, "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// CHECK-LABEL: vgpr_f32x5: +// CHECK: #ASMSTART +// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check!(vgpr_f32x5 f32x5 vgpr160 x: i32 vgpr32, y: i32x8 sgpr256, "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// gfx11-LABEL: vgpr_i32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i32x6 i32x4 vgpr128 x: i32x6 vgpr192, y: i32x8 sgpr256, z: i32x4 sgpr128, + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: vgpr_f32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f32x6 i32x4 vgpr128 x: f32x6 vgpr192, y: i32x8 sgpr256, z: i32x4 sgpr128, + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: vgpr_i32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i32x7 i32x4 vgpr128 x: i32x7 vgpr224, y: i32x8 sgpr256, z: i32x4 sgpr128, + 
"image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: vgpr_f32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f32x7 i32x4 vgpr128 x: f32x7 vgpr224, y: i32x8 sgpr256, z: i32x4 sgpr128, + "image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// CHECK-LABEL: sgpr_i16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i16x16 i16x16 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_f16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x16 f16x16 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_i32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x8 i32x8 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_f32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x8 f32x8 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_i64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i64x4 i64x4 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_f64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f64x4 f64x4 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// gfx11-LABEL: vgpr_i16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_bf16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i16x16 f32x8 vgpr256 x: i32x8 vgpr256, y: i16x16 vgpr256, z: f32x8 vgpr256, + "v_wmma_f32_16x16x16_bf16"); + +// gfx11-LABEL: vgpr_f16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f16x16 f32x8 vgpr256 x: f16x16 vgpr256, y: f16x16 vgpr256, z: f32x8 vgpr256, + "v_wmma_f32_16x16x16_f16"); + +// gfx11-LABEL: vgpr_i32x8: +// gfx11: #ASMSTART +// gfx11: v_wmma_i32_16x16x16_iu8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i32x8 i32x8 vgpr256 x: i32x4 vgpr128, y: i32x4 vgpr128, z: i32x8 vgpr256, + "v_wmma_i32_16x16x16_iu8"); + +// gfx12-LABEL: vgpr_f32x8: +// gfx12: #ASMSTART +// gfx12: v_wmma_f32_16x16x16_fp8_fp8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(vgpr_f32x8 f32x8 vgpr256 x: f32x2 vgpr64, y: f32x2 vgpr64, z: f32x8 vgpr256, + "v_wmma_f32_16x16x16_fp8_fp8"); + +// gfx12-LABEL: vgpr_i32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(vgpr_i32x10 i32x10 vgpr320 "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// gfx12-LABEL: vgpr_f32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(vgpr_f32x10 f32x10 vgpr320 "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// CHECK-LABEL: sgpr_i16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND 
+check!(sgpr_i16x32 i16x32 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_f16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x32 f16x32 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_i32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x16 i32x16 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_f32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x16 f32x16 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_i64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i64x8 i64x8 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_f64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f64x8 f64x8 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: s0_i16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x2 i16x2 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16"); + +// CHECK-LABEL: s0_f16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x2 f16x2 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16"); + +// CHECK-LABEL: v0_i16x2: +// CHECK: #ASMSTART +// CHECK: v_pk_add_i16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(v0_i16x2 i16x2 "v0" x: i16x2 "v1", y: i16x2 "v2", "v_pk_add_i16"); + +// CHECK-LABEL: v0_f16x2: +// CHECK: #ASMSTART +// CHECK: 
v_pk_add_f16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(v0_f16x2 f16x2 "v0" x: f16x2 "v1", y: f16x2 "v2", "v_pk_add_f16"); + +// CHECK-LABEL: s0_i16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x4 i16x4 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: s0_f16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x4 f16x4 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: s0_i32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x2 i32x2 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: s0_f32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x2 f32x2 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: v0_i16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i16x4 i16x4 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// CHECK-LABEL: v0_f16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f16x4 f16x4 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// CHECK-LABEL: v0_i32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i32x2 i32x2 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// CHECK-LABEL: v0_f32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f32x2 f32x2 
"v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// gfx12-LABEL: s0_i32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(s0_i32x3 i32x3 "s[0:2]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b96"); + +// gfx12-LABEL: s0_f32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(s0_f32x3 f32x3 "s[0:2]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b96"); + +// CHECK-LABEL: v0_i32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i32x3 i32x3 "v[0:2]" x: i32 "v3", y: ptr "s[0:1]", "global_load_b96"); + +// CHECK-LABEL: v0_f32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f32x3 f32x3 "v[0:2]" x: i32 "v3", y: ptr "s[0:1]", "global_load_b96"); + +// CHECK-LABEL: s0_i16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x8 i16x8 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_f16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x8 f16x8 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_i32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x4 i32x4 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_f32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x4 f32x4 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: 
s0_i64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i64x2 i64x2 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_f64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f64x2 f64x2 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: v0_i16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i16x8 i16x8 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_f16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f16x8 f16x8 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_i32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i32x4 i32x4 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_f32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f32x4 f32x4 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_i64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i64x2 i64x2 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_f64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f64x2 f64x2 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_i32x5: +// CHECK: #ASMSTART +// CHECK: image_load 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check_reg!(v0_i32x5 i32x5 "v[0:4]" x: i32 "v5", y: i32x8 "s[0:7]", "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// CHECK-LABEL: v0_f32x5: +// CHECK: #ASMSTART +// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check_reg!(v0_f32x5 f32x5 "v[0:4]" x: i32 "v5", y: i32x8 "s[0:7]", "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// gfx11-LABEL: v0_i32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i32x6 i32x4 "v[0:3]" x: i32x6 "v[4:9]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: v0_f32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f32x6 i32x4 "v[0:3]" x: f32x6 "v[4:9]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: v0_i32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i32x7 i32x4 "v[0:3]" x: i32x7 "v[4:10]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: v0_f32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f32x7 i32x4 "v[0:3]" x: f32x7 "v[4:10]", y: i32x8 
"s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// CHECK-LABEL: s0_i16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x16 i16x16 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_f16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x16 f16x16 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_i32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x8 i32x8 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_f32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x8 f32x8 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_i64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i64x4 i64x4 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_f64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f64x4 f64x4 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// gfx11-LABEL: v0_i16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_bf16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i16x16 f32x8 "v[0:7]" x: i32x8 "v[8:15]", y: i16x16 "v[16:23]", z: f32x8 "v[24:31]", + "v_wmma_f32_16x16x16_bf16"); + +// gfx11-LABEL: v0_f16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f16x16 f32x8 "v[0:7]" x: f16x16 "v[8:15]", y: f16x16 "v[16:23]", z: f32x8 "v[24:31]", + "v_wmma_f32_16x16x16_f16"); + +// gfx11-LABEL: v0_i32x8: +// gfx11: #ASMSTART +// gfx11: v_wmma_i32_16x16x16_iu8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i32x8 i32x8 "v[0:7]" x: i32x4 "v[8:11]", y: i32x4 "v[16:19]", z: i32x8 "v[24:31]", + "v_wmma_i32_16x16x16_iu8"); + +// gfx12-LABEL: v0_f32x8: +// gfx12: #ASMSTART +// gfx12: v_wmma_f32_16x16x16_fp8_fp8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(v0_f32x8 f32x8 "v[0:7]" x: f32x2 "v[8:9]", y: f32x2 "v[16:17]", z: f32x8 "v[24:31]", + "v_wmma_f32_16x16x16_fp8_fp8"); + +// gfx12-LABEL: v0_i32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(v0_i32x10 i32x10 "v[0:9]" "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// gfx12-LABEL: v0_f32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(v0_f32x10 f32x10 "v[0:9]" "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// CHECK-LABEL: s0_i16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x32 i16x32 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_f16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x32 f16x32 "s[0:15]" x: ptr "s[16:17]", y: i32 
"s18", "s_load_b512"); + +// CHECK-LABEL: s0_i32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x16 i32x16 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_f32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x16 f32x16 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_i64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i64x8 i64x8 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_f64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f64x8 f64x8 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); diff --git a/tests/assembly-llvm/asm/amdgpu-vec-types2.rs b/tests/assembly-llvm/asm/amdgpu-vec-types2.rs new file mode 100644 index 0000000000000..14427e3b9d528 --- /dev/null +++ b/tests/assembly-llvm/asm/amdgpu-vec-types2.rs @@ -0,0 +1,283 @@ +//@ add-minicore +//@ revisions: gfx942 gfx950 gfx1030 +//@ assembly-output: emit-asm +//@ compile-flags: --target amdgcn-amd-amdhsa +//@[gfx942] compile-flags: -Ctarget-cpu=gfx942 +//@[gfx950] compile-flags: -Ctarget-cpu=gfx950 +//@[gfx1030] compile-flags: -Ctarget-cpu=gfx1030 +//@ needs-llvm-components: amdgpu +//@ needs-rust-lld +// ignore-tidy-linelength + +// Tests for different gfx versions that do not fit in gfx11 and 12 + +#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, repr_simd, f16)] +#![crate_type = "rlib"] +#![no_core] +#![allow( + asm_sub_register, + improper_gpu_kernel_arg, + improper_ctypes_definitions, + non_camel_case_types, + unused_assignments, + unused_variables +)] + +extern crate minicore; +use minicore::*; + +type ptr = *mut u8; 
+ +#[repr(simd)] +pub struct i32x4([i32; 4]); +#[repr(simd)] +pub struct f32x4([f32; 4]); + +#[repr(simd)] +pub struct f64x4([f64; 4]); + +#[repr(simd)] +pub struct i32x9([i32; 9]); +#[repr(simd)] +pub struct f32x9([f32; 9]); + +#[repr(simd)] +pub struct i32x11([i32; 11]); +#[repr(simd)] +pub struct f32x11([f32; 11]); + +#[repr(simd)] +pub struct i32x12([i32; 12]); +#[repr(simd)] +pub struct f32x12([f32; 12]); + +#[repr(simd)] +pub struct i16x32([i16; 32]); +#[repr(simd)] +pub struct f16x32([f16; 32]); +#[repr(simd)] +pub struct i32x16([i32; 16]); +#[repr(simd)] +pub struct f32x16([f32; 16]); + +#[repr(simd)] +pub struct f32x32([f32; 32]); + +macro_rules! impl_copy { + ($($ty:ident)*) => { + $( + impl Copy for $ty {} + )* + }; +} + +impl_copy!( + i32x4 f32x4 f64x4 i32x9 f32x9 i32x11 f32x11 i32x12 f32x12 i16x32 f16x32 + i32x16 f32x16 f32x32 +); + +macro_rules! check { + ($func:ident $ty:ident $class:ident $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " {}, {}"), out($class) y, in($class) x); + } + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal) => { + check!($func $ret_ty $ret_class $($arg_name: $arg_ty $arg_class,)* $mov, ""); + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal, $tail:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " {}", $(", {", stringify!($arg_name), "}",)* $tail), + out($ret_class) result, $($arg_name = in($arg_class) $arg_name,)*); + } + }; +} + +macro_rules! 
check_reg { + ($func:ident $ty:ident $reg:tt $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " ", $reg, ", ", $reg), lateout($reg) y, in($reg) x); + } + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal) => { + check_reg!($func $ret_ty $ret_reg $($arg_name: $arg_ty $arg_reg,)* $mov, ""); + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal, $tail:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " ", $ret_reg, $(", ", $arg_reg,)* $tail), lateout($ret_reg) result, + $(in($arg_reg) $arg_name,)*); + } + }; +} + +// gfx942-LABEL: vgpr_f64x4: +// gfx942: #ASMSTART +// gfx942: v_mfma_f64_16x16x4_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] +check!(vgpr_f64x4 f64x4 vgpr256 x: f64 vgpr64, y: f64 vgpr64, z: f64x4 vgpr256, + "v_mfma_f64_16x16x4_f64"); + +// gfx1030-LABEL: vgpr_i32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_i32x9 i32x4 vgpr128 x: i32x9 vgpr288, y: i32x4 sgpr128, "image_bvh64_intersect_ray", + " a16"); + +// gfx1030-LABEL: vgpr_f32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_f32x9 i32x4 vgpr128 x: f32x9 vgpr288, y: i32x4 sgpr128, "image_bvh64_intersect_ray", + " a16"); + +// gfx1030-LABEL: vgpr_i32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_i32x11 i32x4 vgpr128 x: 
i32x11 vgpr352, y: i32x4 sgpr128, "image_bvh_intersect_ray"); + +// gfx1030-LABEL: vgpr_f32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_f32x11 i32x4 vgpr128 x: f32x11 vgpr352, y: i32x4 sgpr128, "image_bvh_intersect_ray"); + +// gfx1030-LABEL: vgpr_i32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_i32x12 i32x4 vgpr128 x: i32x12 vgpr384, y: i32x4 sgpr128, "image_bvh64_intersect_ray"); + +// gfx1030-LABEL: vgpr_f32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_f32x12 i32x4 vgpr128 x: f32x12 vgpr384, y: i32x4 sgpr128, "image_bvh64_intersect_ray"); + +// gfx950-LABEL: vgpr_i32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_i32_32x32x32_i8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check!(vgpr_i32x16 i32x16 vgpr512 x: i32x4 vgpr128, y: i32x4 vgpr128, z: i16x32 vgpr512, + "v_mfma_i32_32x32x32_i8"); + +// gfx950-LABEL: vgpr_f32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_f32_32x32x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check!(vgpr_f32x16 f32x16 vgpr512 x: f32x4 vgpr128, y: f32x4 vgpr128, z: f16x32 vgpr512, + "v_mfma_f32_32x32x16_f16"); + +// gfx942-LABEL: vgpr_f32x32: +// gfx942: #ASMSTART +// gfx942: v_mfma_f32_32x32x1_2b_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] +check!(vgpr_f32x32 f32x32 vgpr1024 x: f32 vgpr32, y: f32 vgpr32, "v_mfma_f32_32x32x1_2b_f32", + ", v[0:31]"); + +// gfx942-LABEL: 
v0_f64x4: +// gfx942: #ASMSTART +// gfx942: v_mfma_f64_16x16x4_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] +check_reg!(v0_f64x4 f64x4 "v[0:7]" x: f64 "v[8:9]", y: f64 "v[10:11]", z: f64x4 "v[16:23]", + "v_mfma_f64_16x16x4_f64"); + +// gfx1030-LABEL: v0_i32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_i32x9 i32x4 "v[0:3]" x: i32x9 "v[8:16]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray", " a16"); + +// gfx1030-LABEL: v0_f32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_f32x9 i32x4 "v[0:3]" x: f32x9 "v[8:16]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray", " a16"); + +// gfx1030-LABEL: v0_i32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_i32x11 i32x4 "v[0:3]" x: i32x11 "v[8:18]", y: i32x4 "s[0:3]", + "image_bvh_intersect_ray"); + +// gfx1030-LABEL: v0_f32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_f32x11 i32x4 "v[0:3]" x: f32x11 "v[8:18]", y: i32x4 "s[0:3]", + "image_bvh_intersect_ray"); + +// gfx1030-LABEL: v0_i32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_i32x12 i32x4 "v[0:3]" x: i32x12 "v[8:19]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray"); + +// gfx1030-LABEL: v0_f32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_f32x12 i32x4 "v[0:3]" x: f32x12 "v[8:19]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray"); + +// gfx950-LABEL: v0_i32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_i32_32x32x32_i8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check_reg!(v0_i32x16 i32x16 "v[0:15]" x: i32x4 "v[16:19]", y: i32x4 "v[20:23]", z: i16x32 "v[0:15]", + "v_mfma_i32_32x32x32_i8"); + +// gfx950-LABEL: v0_f32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_f32_32x32x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check_reg!(v0_f32x16 f32x16 "v[0:15]" x: f32x4 "v[16:19]", y: f32x4 "v[20:23]", z: f16x32 "v[0:15]", + "v_mfma_f32_32x32x16_f16"); + +// gfx942-LABEL: v0_f32x32: +// gfx942: #ASMSTART +// gfx942: v_mfma_f32_32x32x1_2b_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] +check_reg!(v0_f32x32 f32x32 "v[0:31]" x: f32 "v32", y: f32 "v33", "v_mfma_f32_32x32x1_2b_f32", + ", v[0:31]");