diff --git a/compiler/rustc_codegen_gcc/src/asm.rs b/compiler/rustc_codegen_gcc/src/asm.rs index 319f3d3278730..5a2c297f396cd 100644 --- a/compiler/rustc_codegen_gcc/src/asm.rs +++ b/compiler/rustc_codegen_gcc/src/asm.rs @@ -670,6 +670,27 @@ fn reg_class_to_gcc(reg_class: InlineAsmRegClass) -> &'static str { InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => { unreachable!("clobber-only") } + InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr32) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr64) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr96) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr256) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr512) => "Sg", + InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr16) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr32) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr64) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr96) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr160) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr192) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr224) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr256) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr288) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr320) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr352) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr384) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr512) + | InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr1024) => "v", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => "r", InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low16) @@ -767,6 +788,7 @@ fn dummy_output_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 
'tcx>, reg: InlineAsmRegCl InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => { unreachable!("clobber-only") } + InlineAsmRegClass::Amdgpu(_) => cx.type_i32(), InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => cx.type_i32(), InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => cx.type_f32(), @@ -953,6 +975,7 @@ fn modifier_to_gcc( InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => { unreachable!("clobber-only") } + InlineAsmRegClass::Amdgpu(_) => None, InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => None, InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg) | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => None, diff --git a/compiler/rustc_codegen_llvm/src/asm.rs b/compiler/rustc_codegen_llvm/src/asm.rs index 80d77be1cc384..37e3c12ca31ef 100644 --- a/compiler/rustc_codegen_llvm/src/asm.rs +++ b/compiler/rustc_codegen_llvm/src/asm.rs @@ -228,6 +228,7 @@ impl<'ll, 'tcx> AsmBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC | InlineAsmArch::Arm => { constraints.push("~{cc}".to_string()); } + InlineAsmArch::Amdgpu => {} InlineAsmArch::X86 | InlineAsmArch::X86_64 => { constraints.extend_from_slice(&[ "~{dirflag}".to_string(), @@ -645,6 +646,27 @@ fn reg_to_llvm(reg: InlineAsmRegOrRegClass, layout: Option<&TyAndLayout<'_>>) -> | Arm(ArmInlineAsmRegClass::dreg_low8) | Arm(ArmInlineAsmRegClass::qreg_low4) => "x", Arm(ArmInlineAsmRegClass::dreg) | Arm(ArmInlineAsmRegClass::qreg) => "w", + Amdgpu(AmdgpuInlineAsmRegClass::sgpr32) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr64) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr96) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr128) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr256) + | Amdgpu(AmdgpuInlineAsmRegClass::sgpr512) => "s", + Amdgpu(AmdgpuInlineAsmRegClass::vgpr16) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr32) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr64) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr96) + | 
Amdgpu(AmdgpuInlineAsmRegClass::vgpr128) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr160) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr192) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr224) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr256) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr288) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr320) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr352) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr384) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr512) + | Amdgpu(AmdgpuInlineAsmRegClass::vgpr1024) => "v", Hexagon(HexagonInlineAsmRegClass::reg) => "r", Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"), LoongArch(LoongArchInlineAsmRegClass::reg) => "r", @@ -746,6 +768,7 @@ fn modifier_to_llvm( modifier } } + Amdgpu(_) => None, Hexagon(_) => None, LoongArch(_) => None, Mips(_) => None, @@ -826,6 +849,7 @@ fn dummy_output_type<'ll>(cx: &CodegenCx<'ll, '_>, reg: InlineAsmRegClass) -> &' Arm(ArmInlineAsmRegClass::qreg) | Arm(ArmInlineAsmRegClass::qreg_low8) | Arm(ArmInlineAsmRegClass::qreg_low4) => cx.type_vector(cx.type_i64(), 2), + Amdgpu(_) => cx.type_i32(), Hexagon(HexagonInlineAsmRegClass::reg) => cx.type_i32(), Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"), LoongArch(LoongArchInlineAsmRegClass::reg) => cx.type_i32(), diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 5623b984b2420..ca33672b9ca3f 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -2074,6 +2074,12 @@ symbols! { self_in_typedefs, self_struct_ctor, semiopaque, + sgpr32, + sgpr64, + sgpr96, + sgpr128, + sgpr256, + sgpr512, sha2, sha3, sha512_sm_x86, @@ -2502,6 +2508,21 @@ symbols! 
{ verbatim, version, vfp2, + vgpr16, + vgpr32, + vgpr64, + vgpr96, + vgpr128, + vgpr160, + vgpr192, + vgpr224, + vgpr256, + vgpr288, + vgpr320, + vgpr352, + vgpr384, + vgpr512, + vgpr1024, vis, visible_private_types, volatile, diff --git a/compiler/rustc_target/src/asm/amdgpu.rs b/compiler/rustc_target/src/asm/amdgpu.rs new file mode 100644 index 0000000000000..7e7ab17fe018e --- /dev/null +++ b/compiler/rustc_target/src/asm/amdgpu.rs @@ -0,0 +1,486 @@ +use std::fmt; + +use rustc_span::Symbol; + +use super::{InlineAsmArch, InlineAsmType, ModifierInfo}; + +// Types are listed as SGPR_*/VGPR_* in llvm/lib/Target/AMDGPU/SIRegisterInfo.td +def_reg_class! { + Amdgpu AmdgpuInlineAsmRegClass { + sgpr32, + sgpr64, + sgpr96, + sgpr128, + sgpr256, + sgpr512, + vgpr16, + vgpr32, + vgpr64, + vgpr96, + vgpr128, + vgpr160, + vgpr192, + vgpr224, + vgpr256, + vgpr288, + vgpr320, + vgpr352, + vgpr384, + vgpr512, + vgpr1024, + } +} + +#[derive( + Copy, + Clone, + rustc_macros::Encodable, + rustc_macros::Decodable, + Debug, + Eq, + PartialEq, + PartialOrd, + Hash, + rustc_macros::HashStable_Generic +)] +pub enum AmdgpuInlineAsmRegClassType { + Sgpr, + Vgpr, +} + +// See https://llvm.org/docs/AMDGPUOperandSyntax.html +impl AmdgpuInlineAsmRegClass { + pub fn get_type(self) -> AmdgpuInlineAsmRegClassType { + match self { + Self::sgpr32 + | Self::sgpr64 + | Self::sgpr96 + | Self::sgpr128 + | Self::sgpr256 + | Self::sgpr512 => AmdgpuInlineAsmRegClassType::Sgpr, + Self::vgpr16 + | Self::vgpr32 + | Self::vgpr64 + | Self::vgpr96 + | Self::vgpr128 + | Self::vgpr160 + | Self::vgpr192 + | Self::vgpr224 + | Self::vgpr256 + | Self::vgpr288 + | Self::vgpr320 + | Self::vgpr352 + | Self::vgpr384 + | Self::vgpr512 + | Self::vgpr1024 => AmdgpuInlineAsmRegClassType::Vgpr, + } + } + + /// Return size of the register class in bytes + pub fn bytes(self) -> u32 { + match self { + Self::vgpr16 => 16 / 8, + Self::sgpr32 | Self::vgpr32 => 32 / 8, + Self::sgpr64 | Self::vgpr64 => 64 / 8, + Self::sgpr96 | 
Self::vgpr96 => 96 / 8, + Self::sgpr128 | Self::vgpr128 => 128 / 8, + Self::vgpr160 => 160 / 8, + Self::vgpr192 => 192 / 8, + Self::vgpr224 => 224 / 8, + Self::sgpr256 | Self::vgpr256 => 256 / 8, + Self::vgpr288 => 288 / 8, + Self::vgpr320 => 320 / 8, + Self::vgpr352 => 352 / 8, + Self::vgpr384 => 384 / 8, + Self::sgpr512 | Self::vgpr512 => 512 / 8, + Self::vgpr1024 => 1024 / 8, + } + } + + fn from_type(ty: AmdgpuInlineAsmRegClassType, bytes: u32) -> Option { + let class = match ty { + AmdgpuInlineAsmRegClassType::Sgpr => match bytes * 8 { + 32 => Self::sgpr32, + 64 => Self::sgpr64, + 96 => Self::sgpr96, + 128 => Self::sgpr128, + 256 => Self::sgpr256, + 512 => Self::sgpr512, + _ => return None, + }, + AmdgpuInlineAsmRegClassType::Vgpr => match bytes * 8 { + 16 => Self::vgpr16, + 32 => Self::vgpr32, + 64 => Self::vgpr64, + 96 => Self::vgpr96, + 128 => Self::vgpr128, + 160 => Self::vgpr160, + 192 => Self::vgpr192, + 224 => Self::vgpr224, + 256 => Self::vgpr256, + 288 => Self::vgpr288, + 320 => Self::vgpr320, + 352 => Self::vgpr352, + 384 => Self::vgpr384, + 512 => Self::vgpr512, + 1024 => Self::vgpr1024, + _ => return None, + }, + }; + Some(class) + } + + pub fn valid_modifiers(self, _arch: InlineAsmArch) -> &'static [char] { + &[] + } + + pub fn suggest_class(self, _arch: InlineAsmArch, ty: InlineAsmType) -> Option { + // Suggest VGPR for everything as VGPRs have more uses + Some(match ty { + InlineAsmType::I16 => Self::vgpr16, + InlineAsmType::I32 => Self::vgpr32, + InlineAsmType::I64 => Self::vgpr64, + InlineAsmType::I128 => Self::vgpr128, + InlineAsmType::F16 => Self::vgpr16, + InlineAsmType::F32 => Self::vgpr32, + InlineAsmType::F64 => Self::vgpr64, + _ => { + let bytes = match ty { + InlineAsmType::VecI16(n) => n * (16 / 8), + InlineAsmType::VecI32(n) => n * (32 / 8), + InlineAsmType::VecI64(n) => n * (64 / 8), + InlineAsmType::VecI128(n) => n * (128 / 8), + InlineAsmType::VecF16(n) => n * (16 / 8), + InlineAsmType::VecF32(n) => n * (32 / 8), + 
InlineAsmType::VecF64(n) => n * (64 / 8), + _ => return None, + }; + return Self::from_type(AmdgpuInlineAsmRegClassType::Vgpr, bytes as u32); + } + }) + } + + pub fn suggest_modifier( + self, + _arch: InlineAsmArch, + _ty: InlineAsmType, + ) -> Option { + None + } + + pub fn default_modifier(self, _arch: InlineAsmArch) -> Option { + None + } + + pub fn supported_types( + self, + _arch: InlineAsmArch, + ) -> &'static [(InlineAsmType, Option)] { + match self { + Self::vgpr16 => types! { _: I16, F16; }, + Self::sgpr32 | Self::vgpr32 => types! { _: I16, I32, F16, F32, + VecI16(32 / 16), + VecF16(32 / 16); + }, + Self::sgpr64 | Self::vgpr64 => types! { + _: I64, F64, VecI16(64 / 16), VecI32(64 / 32), + VecF16(64 / 16), VecF32(64 / 32); + }, + Self::sgpr96 | Self::vgpr96 => types! { _: VecI32(96 / 32), VecF32(96 / 32); }, + Self::sgpr128 | Self::vgpr128 => types! { _: I128, + VecI16(128 / 16), VecI32(128 / 32), VecI64(128 / 64), + VecF16(128 / 16), VecF32(128 / 32), VecF64(128 / 64); + }, + Self::vgpr160 => types! { _: VecI32(160 / 32), VecF32(160 / 32); }, + Self::vgpr192 => types! { _: + VecI32(192 / 32), VecI64(192 / 64), + VecF32(192 / 32), VecF64(192 / 64); + }, + Self::vgpr224 => types! { _: VecI32(224 / 32), VecF32(224 / 32); }, + Self::sgpr256 => types! { _: + VecI16(256 / 16), VecI32(256 / 32), VecI64(256 / 64), + VecF16(256 / 16), VecF32(256 / 32), VecF64(256 / 64); + }, + Self::vgpr256 => types! { _: + VecI16(256 / 16), VecI32(256 / 32), + VecF16(256 / 16), VecF32(256 / 32), VecF64(256 / 64); + }, + Self::vgpr288 => types! { _: VecI32(288 / 32), VecF32(288 / 32); }, + Self::vgpr320 => types! { _: VecI32(320 / 32), VecF32(320 / 32); }, + Self::vgpr352 => types! { _: VecI32(352 / 32), VecF32(352 / 32); }, + Self::vgpr384 => types! { _: VecI32(384 / 32), VecF32(384 / 32); }, + Self::sgpr512 => types! { _: + VecI16(512 / 16), VecI32(512 / 32), VecI64(512 / 64), + VecF16(512 / 16), VecF32(512 / 32), VecF64(512 / 64); + }, + Self::vgpr512 => types! 
{ _: + VecI16(512 / 16), VecI32(512 / 32), + VecF16(512 / 16), VecF32(512 / 32); + }, + Self::vgpr1024 => types! { _: VecF32(1024 / 32); }, + } + } + + /// The number of supported registers in this class. + /// The returned number is the length, so supported register + /// indices are 0 to max_num()-1. + fn max_num(self) -> u32 { + if self == AmdgpuInlineAsmRegClass::vgpr16 { + return 512; + } + let size = self.bytes(); + match self.get_type() { + AmdgpuInlineAsmRegClassType::Sgpr => 106 - (size / 4 - 1), + AmdgpuInlineAsmRegClassType::Vgpr => 256 - (size / 4 - 1), + } + } + + /// Get register class from prefix. + fn parse_prefix(prefix: char) -> Result { + match prefix { + 's' => Ok(AmdgpuInlineAsmRegClassType::Sgpr), + 'v' => Ok(AmdgpuInlineAsmRegClassType::Vgpr), + _ => Err("unknown register prefix"), + } + } +} + +impl AmdgpuInlineAsmRegClassType { + /// Prefix when printed and register constraint in LLVM. + fn prefix(self) -> &'static str { + match self { + AmdgpuInlineAsmRegClassType::Sgpr => "s", + AmdgpuInlineAsmRegClassType::Vgpr => "v", + } + } +} + +#[derive( + Copy, + Clone, + rustc_macros::Encodable, + rustc_macros::Decodable, + Debug, + Eq, + PartialEq, + PartialOrd, + Hash, + rustc_macros::HashStable_Generic +)] +enum AmdgpuRegRange { + /// Low 16-bit of a register + Low(u32), + /// High 16-bit of a register + High(u32), + /// One or more 32-bit registers, in the inclusive range + Range { start: u32, end: u32 }, +} + +#[derive( + Copy, + Clone, + rustc_macros::Encodable, + rustc_macros::Decodable, + Debug, + Eq, + PartialEq, + PartialOrd, + Hash, + rustc_macros::HashStable_Generic +)] +#[allow(non_camel_case_types)] +pub struct AmdgpuInlineAsmReg { + class: AmdgpuInlineAsmRegClassType, + range: AmdgpuRegRange, +} + +impl AmdgpuInlineAsmReg { + pub fn name(self) -> String { + let c = self.class.prefix(); + match self.range { + AmdgpuRegRange::Low(n) => format!("{c}{n}.l"), + AmdgpuRegRange::High(n) => format!("{c}{n}.h"), + AmdgpuRegRange::Range { 
start, end } if start == end => format!("{c}{start}"), + AmdgpuRegRange::Range { start, end } => format!("{c}[{start}:{end}]"), + } + } + + /// Size of the register in bytes + fn bytes(self) -> u32 { + match self.range { + AmdgpuRegRange::Low(_) | AmdgpuRegRange::High(_) => 2, + AmdgpuRegRange::Range { start, end } => ((end - start) + 1) * 4, + } + } + + pub fn reg_class(self) -> AmdgpuInlineAsmRegClass { + AmdgpuInlineAsmRegClass::from_type(self.class, self.bytes()) + .expect("Failed to emit invalid amdgpu register class") + } + + pub fn parse(name: &str) -> Result { + if name.is_empty() { + return Err("invalid empty register"); + } + let class = AmdgpuInlineAsmRegClass::parse_prefix(name.chars().next().unwrap())?; + // Form with range, e.g. s[2:3] + let res; + if name[1..].starts_with('[') { + if !name.ends_with(']') { + return Err("invalid register, missing closing bracket"); + } + if let Some((start, end)) = name[2..name.len() - 1].split_once(':') { + let Ok(start) = start.parse() else { + return Err("invalid register range start"); + }; + let Ok(end) = end.parse() else { + return Err("invalid register range end"); + }; + + // Check range + if start > end { + return Err("invalid reversed register range"); + } + + if let Some(class) = + AmdgpuInlineAsmRegClass::from_type(class, ((end - start) + 1) * 4) + { + if end >= class.max_num() { + return Err("too large register for this class"); + } + } else { + return Err("invalid register size for this class"); + } + res = Self { class, range: AmdgpuRegRange::Range { start, end } }; + } else { + return Err("invalid register range"); + } + } else { + let parse_num = |core: &str| { + let Ok(start) = core.parse() else { + return Err("invalid register number"); + }; + + if let Some(class) = AmdgpuInlineAsmRegClass::from_type(class, 4) { + if start >= class.max_num() { + return Err("too large register for this class"); + } + } else { + return Err("invalid register size for this class"); + } + + Ok(start) + }; + + let name = 
&name[1..]; + let range = if let Some(name) = name.strip_suffix(".l") { + if class == AmdgpuInlineAsmRegClassType::Sgpr { + return Err("invalid 16-bit SGPR register"); + } + AmdgpuRegRange::Low(parse_num(name)?) + } else if let Some(name) = name.strip_suffix(".h") { + if class == AmdgpuInlineAsmRegClassType::Sgpr { + return Err("invalid 16-bit SGPR register"); + } + AmdgpuRegRange::High(parse_num(name)?) + } else { + let start = parse_num(name)?; + AmdgpuRegRange::Range { start, end: start } + }; + res = Self { class, range }; + } + Ok(res) + } + + pub fn validate( + self, + _arch: super::InlineAsmArch, + _reloc_model: crate::spec::RelocModel, + _target_features: &rustc_data_structures::fx::FxIndexSet, + _target: &crate::spec::Target, + _is_clobber: bool, + ) -> Result<(), &'static str> { + Ok(()) + } +} + +pub(super) fn fill_reg_map( + _arch: super::InlineAsmArch, + _reloc_model: crate::spec::RelocModel, + _target_features: &rustc_data_structures::fx::FxIndexSet, + _target: &crate::spec::Target, + map: &mut rustc_data_structures::fx::FxHashMap< + super::InlineAsmRegClass, + rustc_data_structures::fx::FxIndexSet, + >, +) { + use super::{InlineAsmReg, InlineAsmRegClass}; + + #[allow(rustc::potential_query_instability)] + for class in regclass_map().keys() { + let InlineAsmRegClass::Amdgpu(class) = *class else { unreachable!("Must be amdgpu class") }; + if let Some(set) = map.get_mut(&InlineAsmRegClass::Amdgpu(class)) { + if class == AmdgpuInlineAsmRegClass::vgpr16 { + for i in 0..(class.max_num() / 2) { + set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { + class: AmdgpuInlineAsmRegClassType::Vgpr, + range: AmdgpuRegRange::Low(i), + })); + set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { + class: AmdgpuInlineAsmRegClassType::Vgpr, + range: AmdgpuRegRange::High(i), + })); + } + } else { + for i in 0..class.max_num() { + set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg { + class: class.get_type(), + range: AmdgpuRegRange::Range { start: i, end: i + 
class.bytes() / 4 }, + })); + } + } + } + } +} + +impl AmdgpuInlineAsmReg { + pub fn emit( + self, + out: &mut dyn fmt::Write, + _arch: InlineAsmArch, + _modifier: Option, + ) -> fmt::Result { + out.write_str(&self.name()) + } + + pub fn overlapping_regs(self, mut cb: impl FnMut(AmdgpuInlineAsmReg)) { + if self.class != AmdgpuInlineAsmRegClassType::Sgpr { + // Overlapping 16-bit registers (not supported for sgprs) + if let AmdgpuRegRange::Range { start, end } = self.range { + for i in start..=end { + cb(AmdgpuInlineAsmReg { class: self.class, range: AmdgpuRegRange::Low(i) }); + cb(AmdgpuInlineAsmReg { class: self.class, range: AmdgpuRegRange::High(i) }); + } + } + } + + // Overlapping 32-bit registers, up to size 32 + for size in 1..=32 { + let (start, end) = match self.range { + AmdgpuRegRange::Low(start) | AmdgpuRegRange::High(start) => (start, start), + AmdgpuRegRange::Range { start, end } => (start, end), + }; + + let size_range = size - 1; + for overlap_start in (start - size_range)..=end { + cb(AmdgpuInlineAsmReg { + class: self.class, + range: AmdgpuRegRange::Range { + start: overlap_start, + end: overlap_start + size_range, + }, + }); + } + } + } +} diff --git a/compiler/rustc_target/src/asm/mod.rs b/compiler/rustc_target/src/asm/mod.rs index a10699bbce884..b6e555c93a0ed 100644 --- a/compiler/rustc_target/src/asm/mod.rs +++ b/compiler/rustc_target/src/asm/mod.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::fmt; use rustc_abi::Size; @@ -177,6 +178,7 @@ macro_rules! 
types { } mod aarch64; +mod amdgpu; mod arm; mod avr; mod bpf; @@ -196,6 +198,7 @@ mod wasm; mod x86; pub use aarch64::{AArch64InlineAsmReg, AArch64InlineAsmRegClass}; +pub use amdgpu::{AmdgpuInlineAsmReg, AmdgpuInlineAsmRegClass}; pub use arm::{ArmInlineAsmReg, ArmInlineAsmRegClass}; pub use avr::{AvrInlineAsmReg, AvrInlineAsmRegClass}; pub use bpf::{BpfInlineAsmReg, BpfInlineAsmRegClass}; @@ -224,6 +227,7 @@ pub enum InlineAsmArch { RiscV32, RiscV64, Nvptx64, + Amdgpu, Hexagon, LoongArch32, LoongArch64, @@ -252,6 +256,7 @@ impl InlineAsmArch { Arch::Arm => Some(Self::Arm), Arch::Arm64EC => Some(Self::Arm64EC), Arch::AArch64 => Some(Self::AArch64), + Arch::AmdGpu => Some(Self::Amdgpu), Arch::RiscV32 => Some(Self::RiscV32), Arch::RiscV64 => Some(Self::RiscV64), Arch::Nvptx64 => Some(Self::Nvptx64), @@ -273,7 +278,7 @@ impl InlineAsmArch { Arch::Msp430 => Some(Self::Msp430), Arch::M68k => Some(Self::M68k), Arch::CSky => Some(Self::CSKY), - Arch::AmdGpu | Arch::Xtensa | Arch::Other(_) => None, + Arch::Xtensa | Arch::Other(_) => None, } } } @@ -283,6 +288,7 @@ impl InlineAsmArch { pub enum InlineAsmReg { X86(X86InlineAsmReg), Arm(ArmInlineAsmReg), + Amdgpu(AmdgpuInlineAsmReg), AArch64(AArch64InlineAsmReg), RiscV(RiscVInlineAsmReg), Nvptx(NvptxInlineAsmReg), @@ -304,24 +310,25 @@ pub enum InlineAsmReg { } impl InlineAsmReg { - pub fn name(self) -> &'static str { + pub fn name(self) -> Cow<'static, str> { match self { - Self::X86(r) => r.name(), - Self::Arm(r) => r.name(), - Self::AArch64(r) => r.name(), - Self::RiscV(r) => r.name(), - Self::PowerPC(r) => r.name(), - Self::Hexagon(r) => r.name(), - Self::LoongArch(r) => r.name(), - Self::Mips(r) => r.name(), - Self::S390x(r) => r.name(), - Self::Sparc(r) => r.name(), - Self::Bpf(r) => r.name(), - Self::Avr(r) => r.name(), - Self::Msp430(r) => r.name(), - Self::M68k(r) => r.name(), - Self::CSKY(r) => r.name(), - Self::Err => "", + Self::X86(r) => r.name().into(), + Self::Arm(r) => r.name().into(), + Self::AArch64(r) => 
r.name().into(), + Self::Amdgpu(r) => r.name().into(), + Self::RiscV(r) => r.name().into(), + Self::PowerPC(r) => r.name().into(), + Self::Hexagon(r) => r.name().into(), + Self::LoongArch(r) => r.name().into(), + Self::Mips(r) => r.name().into(), + Self::S390x(r) => r.name().into(), + Self::Sparc(r) => r.name().into(), + Self::Bpf(r) => r.name().into(), + Self::Avr(r) => r.name().into(), + Self::Msp430(r) => r.name().into(), + Self::M68k(r) => r.name().into(), + Self::CSKY(r) => r.name().into(), + Self::Err => "".into(), } } @@ -330,6 +337,7 @@ impl InlineAsmReg { Self::X86(r) => InlineAsmRegClass::X86(r.reg_class()), Self::Arm(r) => InlineAsmRegClass::Arm(r.reg_class()), Self::AArch64(r) => InlineAsmRegClass::AArch64(r.reg_class()), + Self::Amdgpu(r) => InlineAsmRegClass::Amdgpu(r.reg_class()), Self::RiscV(r) => InlineAsmRegClass::RiscV(r.reg_class()), Self::PowerPC(r) => InlineAsmRegClass::PowerPC(r.reg_class()), Self::Hexagon(r) => InlineAsmRegClass::Hexagon(r.reg_class()), @@ -356,6 +364,7 @@ impl InlineAsmReg { InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC => { Self::AArch64(AArch64InlineAsmReg::parse(name)?) } + InlineAsmArch::Amdgpu => Self::Amdgpu(AmdgpuInlineAsmReg::parse(name)?), InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => { Self::RiscV(RiscVInlineAsmReg::parse(name)?) 
} @@ -398,6 +407,7 @@ impl InlineAsmReg { Self::X86(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::Arm(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::AArch64(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), + Self::Amdgpu(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::RiscV(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::PowerPC(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), Self::Hexagon(r) => r.validate(arch, reloc_model, target_features, target, is_clobber), @@ -428,6 +438,7 @@ impl InlineAsmReg { Self::X86(r) => r.emit(out, arch, modifier), Self::Arm(r) => r.emit(out, arch, modifier), Self::AArch64(r) => r.emit(out, arch, modifier), + Self::Amdgpu(r) => r.emit(out, arch, modifier), Self::RiscV(r) => r.emit(out, arch, modifier), Self::PowerPC(r) => r.emit(out, arch, modifier), Self::Hexagon(r) => r.emit(out, arch, modifier), @@ -449,6 +460,7 @@ impl InlineAsmReg { Self::X86(r) => r.overlapping_regs(|r| cb(Self::X86(r))), Self::Arm(r) => r.overlapping_regs(|r| cb(Self::Arm(r))), Self::AArch64(_) => cb(self), + Self::Amdgpu(r) => r.overlapping_regs(|r| cb(Self::Amdgpu(r))), Self::RiscV(_) => cb(self), Self::PowerPC(r) => r.overlapping_regs(|r| cb(Self::PowerPC(r))), Self::Hexagon(r) => r.overlapping_regs(|r| cb(Self::Hexagon(r))), @@ -472,6 +484,7 @@ pub enum InlineAsmRegClass { X86(X86InlineAsmRegClass), Arm(ArmInlineAsmRegClass), AArch64(AArch64InlineAsmRegClass), + Amdgpu(AmdgpuInlineAsmRegClass), RiscV(RiscVInlineAsmRegClass), Nvptx(NvptxInlineAsmRegClass), PowerPC(PowerPCInlineAsmRegClass), @@ -497,6 +510,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.name(), Self::Arm(r) => r.name(), Self::AArch64(r) => r.name(), + Self::Amdgpu(r) => r.name(), Self::RiscV(r) => r.name(), Self::Nvptx(r) => r.name(), Self::PowerPC(r) => r.name(), @@ -524,6 +538,7 @@ impl InlineAsmRegClass { 
Self::X86(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::X86), Self::Arm(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Arm), Self::AArch64(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::AArch64), + Self::Amdgpu(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Amdgpu), Self::RiscV(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::RiscV), Self::Nvptx(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Nvptx), Self::PowerPC(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::PowerPC), @@ -554,6 +569,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.suggest_modifier(arch, ty), Self::Arm(r) => r.suggest_modifier(arch, ty), Self::AArch64(r) => r.suggest_modifier(arch, ty), + Self::Amdgpu(r) => r.suggest_modifier(arch, ty), Self::RiscV(r) => r.suggest_modifier(arch, ty), Self::Nvptx(r) => r.suggest_modifier(arch, ty), Self::PowerPC(r) => r.suggest_modifier(arch, ty), @@ -584,6 +600,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.default_modifier(arch), Self::Arm(r) => r.default_modifier(arch), Self::AArch64(r) => r.default_modifier(arch), + Self::Amdgpu(r) => r.default_modifier(arch), Self::RiscV(r) => r.default_modifier(arch), Self::Nvptx(r) => r.default_modifier(arch), Self::PowerPC(r) => r.default_modifier(arch), @@ -617,6 +634,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.supported_types(arch), Self::Arm(r) => r.supported_types(arch), Self::AArch64(r) => r.supported_types(arch), + Self::Amdgpu(r) => r.supported_types(arch), Self::RiscV(r) => r.supported_types(arch), Self::Nvptx(r) => r.supported_types(arch), Self::PowerPC(r) => r.supported_types(arch), @@ -645,6 +663,7 @@ impl InlineAsmRegClass { InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC => { Self::AArch64(AArch64InlineAsmRegClass::parse(name)?) } + InlineAsmArch::Amdgpu => Self::Amdgpu(AmdgpuInlineAsmRegClass::parse(name)?), InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => { Self::RiscV(RiscVInlineAsmRegClass::parse(name)?) 
} @@ -682,6 +701,7 @@ impl InlineAsmRegClass { Self::X86(r) => r.valid_modifiers(arch), Self::Arm(r) => r.valid_modifiers(arch), Self::AArch64(r) => r.valid_modifiers(arch), + Self::Amdgpu(r) => r.valid_modifiers(arch), Self::RiscV(r) => r.valid_modifiers(arch), Self::Nvptx(r) => r.valid_modifiers(arch), Self::PowerPC(r) => r.valid_modifiers(arch), @@ -843,6 +863,11 @@ pub fn allocatable_registers( aarch64::fill_reg_map(arch, reloc_model, target_features, target, &mut map); map } + InlineAsmArch::Amdgpu => { + let mut map = amdgpu::regclass_map(); + amdgpu::fill_reg_map(arch, reloc_model, target_features, target, &mut map); + map + } InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => { let mut map = riscv::regclass_map(); riscv::fill_reg_map(arch, reloc_model, target_features, target, &mut map); diff --git a/tests/assembly-llvm/asm/amdgpu-types.rs b/tests/assembly-llvm/asm/amdgpu-types.rs new file mode 100644 index 0000000000000..fe8ae88ee83ef --- /dev/null +++ b/tests/assembly-llvm/asm/amdgpu-types.rs @@ -0,0 +1,232 @@ +//@ add-minicore +//@ revisions: gfx11 gfx12 +//@ assembly-output: emit-asm +//@ compile-flags: --target amdgcn-amd-amdhsa +//@[gfx11] compile-flags: -Ctarget-cpu=gfx1100 +//@[gfx12] compile-flags: -Ctarget-cpu=gfx1200 +//@ needs-llvm-components: amdgpu +//@ needs-rust-lld + +#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, f16)] +#![crate_type = "rlib"] +#![no_core] +#![allow(asm_sub_register, non_camel_case_types, unused_assignments, unused_variables)] + +extern crate minicore; +use minicore::*; + +type ptr = *mut u8; + +macro_rules! 
check { + ($func:ident $ty:ident $class:ident $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " {}, {}"), out($class) y, in($class) x); + } + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " {}", $(", {", stringify!($arg_name), "}",)*), + out($ret_class) result, $($arg_name = in($arg_class) $arg_name,)*); + } + }; +} + +macro_rules! check_reg { + ($func:ident $ty:ident $reg:tt $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " ", $reg, ", ", $reg), lateout($reg) y, in($reg) x); + } + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " ", $ret_reg, $(", ", $arg_reg,)*), lateout($ret_reg) result, + $(in($arg_reg) $arg_name,)*); + } + }; +} + +// CHECK-LABEL: sgpr_i16: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(sgpr_i16 i32 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16"); + +// gfx11-LABEL: vgpr_i16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i16 i16 vgpr32 "v_mov_b16"); + +// gfx12-LABEL: sgpr_f16: +// gfx12: #ASMSTART +// gfx12: s_add_f16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(sgpr_f16 f16 sgpr32 x: f16 sgpr32, y: f16 sgpr32, "s_add_f16"); + +// gfx11-LABEL: vgpr_f16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f16 f16 vgpr32 
"v_mov_b16"); + +// CHECK-LABEL: sgpr_i32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32 i32 sgpr32 "s_mov_b32"); + +// CHECK-LABEL: vgpr_i32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check!(vgpr_i32 i32 vgpr32 "v_mov_b32"); + +// CHECK-LABEL: sgpr_f32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32 f32 sgpr32 "s_mov_b32"); + +// CHECK-LABEL: vgpr_f32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check!(vgpr_f32 f32 vgpr32 "v_mov_b32"); + +// CHECK-LABEL: sgpr_i64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(sgpr_i64 i64 sgpr64 "s_mov_b64"); + +// CHECK-LABEL: vgpr_i64: +// CHECK: #ASMSTART +// CHECK: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i64 i64 vgpr64 x: i32 vgpr32, y: i64 vgpr64, "v_lshlrev_b64"); + +// CHECK-LABEL: sgpr_f64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(sgpr_f64 f64 sgpr64 "s_mov_b64"); + +// CHECK-LABEL: vgpr_f64: +// CHECK: #ASMSTART +// CHECK: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f64 f64 vgpr64 x: f64 vgpr64, y: f64 vgpr64, "v_add_f64"); + +// CHECK-LABEL: sgpr_i128: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i128 i128 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: vgpr_i128: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i128 i128 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: s0_i16: +// CHECK: #ASMSTART +// CHECK: 
s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(s0_i16 i32 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16"); + +// gfx11-LABEL: v0_i16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i16 i16 "v0.l" "v_mov_b16"); + +// gfx12-LABEL: s0_f16: +// gfx12: #ASMSTART +// gfx12: s_add_f16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(s0_f16 f16 "s0" x: f16 "s1", y: f16 "s2", "s_add_f16"); + +// gfx11-LABEL: v0_f16: +// gfx11: #ASMSTART +// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f16 f16 "v0.l" "v_mov_b16"); + +// CHECK-LABEL: s0_i32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32 i32 "s0" "s_mov_b32"); + +// CHECK-LABEL: v0_i32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(v0_i32 i32 "v0" "v_mov_b32"); + +// CHECK-LABEL: s0_f32: +// CHECK: #ASMSTART +// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32 f32 "s0" "s_mov_b32"); + +// CHECK-LABEL: v0_f32: +// CHECK: #ASMSTART +// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(v0_f32 f32 "v0" "v_mov_b32"); + +// CHECK-LABEL: s0_i64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(s0_i64 i64 "s[0:1]" "s_mov_b64"); + +// CHECK-LABEL: v0_i64: +// CHECK: #ASMSTART +// CHECK: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i64 i64 "v[0:1]" x: i32 "v2", y: i64 "v[0:1]", "v_lshlrev_b64"); + +// CHECK-LABEL: s0_f64: +// CHECK: #ASMSTART +// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(s0_f64 f64 "s[0:1]" "s_mov_b64"); + +// CHECK-LABEL: v0_f64: +// CHECK: 
#ASMSTART +// CHECK: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f64 f64 "v[0:1]" x: f64 "v[0:1]", y: f64 "v[2:3]", "v_add_f64"); + +// CHECK-LABEL: s0_i128: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i128 i128 "s[0:3]" x: ptr "s[0:1]", y: i32 "s0", "s_load_b128"); + +// CHECK-LABEL: v0_i128: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i128 i128 "v[0:3]" x: i32 "v0", y: ptr "s[0:1]", "global_load_b128"); diff --git a/tests/assembly-llvm/asm/amdgpu-vec-types.rs b/tests/assembly-llvm/asm/amdgpu-vec-types.rs new file mode 100644 index 0000000000000..68e4a41da58ff --- /dev/null +++ b/tests/assembly-llvm/asm/amdgpu-vec-types.rs @@ -0,0 +1,845 @@ +//@ add-minicore +//@ revisions: gfx11 gfx12 +//@ assembly-output: emit-asm +//@ compile-flags: --target amdgcn-amd-amdhsa +//@[gfx11] compile-flags: -Ctarget-cpu=gfx1100 +//@[gfx12] compile-flags: -Ctarget-cpu=gfx1200 +//@ needs-llvm-components: amdgpu +//@ needs-rust-lld +// ignore-tidy-linelength + +#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, repr_simd, f16)] +#![crate_type = "rlib"] +#![no_core] +#![allow( + asm_sub_register, + improper_gpu_kernel_arg, + improper_ctypes_definitions, + non_camel_case_types, + unused_assignments, + unused_variables +)] + +extern crate minicore; +use minicore::*; + +type ptr = *mut u8; + +#[repr(simd)] +pub struct i16x2([i16; 2]); +#[repr(simd)] +pub struct f16x2([f16; 2]); + +#[repr(simd)] +pub struct i16x4([i16; 4]); +#[repr(simd)] +pub struct f16x4([f16; 4]); +#[repr(simd)] +pub struct i32x2([i32; 2]); +#[repr(simd)] +pub struct f32x2([f32; 2]); + +#[repr(simd)] +pub struct i32x3([i32; 3]); +#[repr(simd)] +pub struct f32x3([f32; 3]); + +#[repr(simd)] +pub struct i16x8([i16; 8]); +#[repr(simd)] +pub struct 
f16x8([f16; 8]); +#[repr(simd)] +pub struct i32x4([i32; 4]); +#[repr(simd)] +pub struct f32x4([f32; 4]); +#[repr(simd)] +pub struct i64x2([i64; 2]); +#[repr(simd)] +pub struct f64x2([f64; 2]); +/* 160-bit vectors. */ +#[repr(simd)] +pub struct i32x5([i32; 5]); +#[repr(simd)] +pub struct f32x5([f32; 5]); +/* 192-bit vectors. */ +#[repr(simd)] +pub struct i32x6([i32; 6]); +#[repr(simd)] +pub struct f32x6([f32; 6]); +#[repr(simd)] +pub struct i64x3([i64; 3]); +#[repr(simd)] +pub struct f64x3([f64; 3]); +/* 224-bit vectors. */ +#[repr(simd)] +pub struct i32x7([i32; 7]); +#[repr(simd)] +pub struct f32x7([f32; 7]); +/* 256-bit vectors. */ +#[repr(simd)] +pub struct i16x16([i16; 16]); +#[repr(simd)] +pub struct f16x16([f16; 16]); +#[repr(simd)] +pub struct i32x8([i32; 8]); +#[repr(simd)] +pub struct f32x8([f32; 8]); +#[repr(simd)] +pub struct i64x4([i64; 4]); +#[repr(simd)] +pub struct f64x4([f64; 4]); +/* 320-bit vectors. */ +#[repr(simd)] +pub struct i32x10([i32; 10]); +#[repr(simd)] +pub struct f32x10([f32; 10]); +/* 512-bit vectors. */ +#[repr(simd)] +pub struct i16x32([i16; 32]); +#[repr(simd)] +pub struct f16x32([f16; 32]); +#[repr(simd)] +pub struct i32x16([i32; 16]); +#[repr(simd)] +pub struct f32x16([f32; 16]); +#[repr(simd)] +pub struct i64x8([i64; 8]); +#[repr(simd)] +pub struct f64x8([f64; 8]); +/* impl_copy!(A B C ...) emits an empty `impl Copy for T {}` for every type identifier in the whitespace-separated list. */ +macro_rules! impl_copy { + ($($ty:ident)*) => { + $( + impl Copy for $ty {} + )* + }; +} +/* Every SIMD wrapper struct declared in this file is listed here. */ +impl_copy!( + i16x2 f16x2 i16x4 f16x4 i32x2 f32x2 i32x3 f32x3 i16x8 f16x8 i32x4 f32x4 + i64x2 f64x2 i32x5 f32x5 i32x6 f32x6 i64x3 f64x3 i32x7 f32x7 i16x16 f16x16 + i32x8 f32x8 i64x4 f64x4 i32x10 f32x10 i16x32 f16x32 i32x16 f32x16 i64x8 + f64x8 +); + +macro_rules! 
check {/* Generates a #[no_mangle] "gpu-kernel" function wrapping a single asm! statement whose operands are constrained by register class; the {} placeholders in the template stand for the registers chosen for those operands. Arm 1 (single operand) emits "<insn> {}, {}" with output y and input x, both of type $ty in class $class. */ + ($func:ident $ty:ident $class:ident $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " {}, {}"), out($class) y, in($class) x); + } + }; +/* Arm 2 (named inputs, no tail) forwards to arm 3 with an empty tail string. */ + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal) => { + check!($func $ret_ty $ret_class $($arg_name: $arg_ty $arg_class,)* $mov, ""); + }; +/* Arm 3 (named inputs plus tail) builds the template "<insn> {}" followed by ", {name}" for each input and then the literal $tail; result is an out($ret_class) operand and every input is a named in($arg_class) operand. */ + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal, $tail:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " {}", $(", {", stringify!($arg_name), "}",)* $tail), + out($ret_class) result, $($arg_name = in($arg_class) $arg_name,)*); + } + }; +} +/* Explicit-register counterpart of check!, with the same three arms. Each register is passed as a string-literal token ($reg:tt) that serves both as the operand constraint and as text spliced verbatim into the asm string, and the result is a lateout operand. Arm 1 emits "<insn> <reg>, <reg>" with input x and output y sharing one register. */ +macro_rules! check_reg { + ($func:ident $ty:ident $reg:tt $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " ", $reg, ", ", $reg), lateout($reg) y, in($reg) x); + } + }; +/* Arm 2 (inputs, no tail) forwards to arm 3 with an empty tail string. */ + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal) => { + check_reg!($func $ret_ty $ret_reg $($arg_name: $arg_ty $arg_reg,)* $mov, ""); + }; +/* Arm 3 (inputs plus tail) builds the text "<insn> <ret_reg>" followed by ", <arg_reg>" for each input and then the literal $tail; inputs are positional in(<arg_reg>) operands. */ + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal, $tail:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " ", $ret_reg, $(", ", $arg_reg,)* $tail), lateout($ret_reg) result, + $(in($arg_reg) $arg_name,)*); + } + }; +} + +// CHECK-LABEL: sgpr_i16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(sgpr_i16x2 i16x2 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16"); + +// CHECK-LABEL: sgpr_f16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, 
s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(sgpr_f16x2 f16x2 sgpr32 x: i16 sgpr32, y: i16 sgpr32, "s_pack_ll_b32_b16"); + +// CHECK-LABEL: vgpr_i16x2: +// CHECK: #ASMSTART +// CHECK: v_pk_add_i16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(vgpr_i16x2 i16x2 vgpr32 x: i16x2 vgpr32, y: i16x2 vgpr32, "v_pk_add_i16"); + +// CHECK-LABEL: vgpr_f16x2: +// CHECK: #ASMSTART +// CHECK: v_pk_add_f16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check!(vgpr_f16x2 f16x2 vgpr32 x: f16x2 vgpr32, y: f16x2 vgpr32, "v_pk_add_f16"); + +// CHECK-LABEL: sgpr_i16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i16x4 i16x4 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: sgpr_f16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x4 f16x4 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: sgpr_i32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x2 i32x2 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: sgpr_f32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x2 f32x2 sgpr64 x: ptr sgpr64, y: i32 sgpr32, "s_load_b64"); + +// CHECK-LABEL: vgpr_i16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i16x4 i16x4 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// CHECK-LABEL: vgpr_f16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f16x4 f16x4 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// CHECK-LABEL: 
vgpr_i32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i32x2 i32x2 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// CHECK-LABEL: vgpr_f32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f32x2 f32x2 vgpr64 x: i32 vgpr32, y: ptr sgpr64, "global_load_b64"); + +// gfx12-LABEL: sgpr_i32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(sgpr_i32x3 i32x3 sgpr96 x: ptr sgpr64, y: i32 sgpr32, "s_load_b96"); + +// gfx12-LABEL: sgpr_f32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(sgpr_f32x3 f32x3 sgpr96 x: ptr sgpr64, y: i32 sgpr32, "s_load_b96"); + +// CHECK-LABEL: vgpr_i32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i32x3 i32x3 vgpr96 x: i32 vgpr32, y: ptr sgpr64, "global_load_b96"); + +// CHECK-LABEL: vgpr_f32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f32x3 f32x3 vgpr96 x: i32 vgpr32, y: ptr sgpr64, "global_load_b96"); + +// CHECK-LABEL: sgpr_i16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i16x8 i16x8 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_f16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x8 f16x8 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_i32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x4 i32x4 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_f32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x4 f32x4 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_i64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i64x2 i64x2 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: sgpr_f64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f64x2 f64x2 sgpr128 x: ptr sgpr64, y: i32 sgpr32, "s_load_b128"); + +// CHECK-LABEL: vgpr_i16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i16x8 i16x8 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_f16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f16x8 f16x8 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_i32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i32x4 i32x4 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_f32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f32x4 f32x4 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_i64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_i64x2 i64x2 vgpr128 x: 
i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_f64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check!(vgpr_f64x2 f64x2 vgpr128 x: i32 vgpr32, y: ptr sgpr64, "global_load_b128"); + +// CHECK-LABEL: vgpr_i32x5: +// CHECK: #ASMSTART +// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check!(vgpr_i32x5 i32x5 vgpr160 x: i32 vgpr32, y: i32x8 sgpr256, "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// CHECK-LABEL: vgpr_f32x5: +// CHECK: #ASMSTART +// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check!(vgpr_f32x5 f32x5 vgpr160 x: i32 vgpr32, y: i32x8 sgpr256, "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// gfx11-LABEL: vgpr_i32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i32x6 i32x4 vgpr128 x: i32x6 vgpr192, y: i32x8 sgpr256, z: i32x4 sgpr128, + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: vgpr_f32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f32x6 i32x4 vgpr128 x: f32x6 vgpr192, y: i32x8 sgpr256, z: i32x4 sgpr128, + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: vgpr_i32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i32x7 i32x4 vgpr128 x: i32x7 vgpr224, y: i32x8 sgpr256, z: i32x4 sgpr128, + 
"image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: vgpr_f32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f32x7 i32x4 vgpr128 x: f32x7 vgpr224, y: i32x8 sgpr256, z: i32x4 sgpr128, + "image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// CHECK-LABEL: sgpr_i16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i16x16 i16x16 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_f16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x16 f16x16 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_i32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x8 i32x8 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_f32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x8 f32x8 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_i64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i64x4 i64x4 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// CHECK-LABEL: sgpr_f64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f64x4 f64x4 sgpr256 x: ptr sgpr64, y: i32 sgpr32, "s_load_b256"); + +// gfx11-LABEL: vgpr_i16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_bf16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i16x16 f32x8 vgpr256 x: i32x8 vgpr256, y: i16x16 vgpr256, z: f32x8 vgpr256, + "v_wmma_f32_16x16x16_bf16"); + +// gfx11-LABEL: vgpr_f16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_f16x16 f32x8 vgpr256 x: f16x16 vgpr256, y: f16x16 vgpr256, z: f32x8 vgpr256, + "v_wmma_f32_16x16x16_f16"); + +// gfx11-LABEL: vgpr_i32x8: +// gfx11: #ASMSTART +// gfx11: v_wmma_i32_16x16x16_iu8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check!(vgpr_i32x8 i32x8 vgpr256 x: i32x4 vgpr128, y: i32x4 vgpr128, z: i32x8 vgpr256, + "v_wmma_i32_16x16x16_iu8"); + +// gfx12-LABEL: vgpr_f32x8: +// gfx12: #ASMSTART +// gfx12: v_wmma_f32_16x16x16_fp8_fp8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(vgpr_f32x8 f32x8 vgpr256 x: f32x2 vgpr64, y: f32x2 vgpr64, z: f32x8 vgpr256, + "v_wmma_f32_16x16x16_fp8_fp8"); + +// gfx12-LABEL: vgpr_i32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(vgpr_i32x10 i32x10 vgpr320 "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// gfx12-LABEL: vgpr_f32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check!(vgpr_f32x10 f32x10 vgpr320 "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// CHECK-LABEL: sgpr_i16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND 
+check!(sgpr_i16x32 i16x32 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_f16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f16x32 f16x32 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_i32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i32x16 i32x16 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_f32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f32x16 f32x16 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_i64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_i64x8 i64x8 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: sgpr_f64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check!(sgpr_f64x8 f64x8 sgpr512 x: ptr sgpr64, y: i32 sgpr32, "s_load_b512"); + +// CHECK-LABEL: s0_i16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x2 i16x2 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16"); + +// CHECK-LABEL: s0_f16x2: +// CHECK: #ASMSTART +// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x2 f16x2 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16"); + +// CHECK-LABEL: v0_i16x2: +// CHECK: #ASMSTART +// CHECK: v_pk_add_i16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(v0_i16x2 i16x2 "v0" x: i16x2 "v1", y: i16x2 "v2", "v_pk_add_i16"); + +// CHECK-LABEL: v0_f16x2: +// CHECK: #ASMSTART +// CHECK: 
v_pk_add_f16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}} +// CHECK: #ASMEND +check_reg!(v0_f16x2 f16x2 "v0" x: f16x2 "v1", y: f16x2 "v2", "v_pk_add_f16"); + +// CHECK-LABEL: s0_i16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x4 i16x4 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: s0_f16x4: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x4 f16x4 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: s0_i32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x2 i32x2 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: s0_f32x2: +// CHECK: #ASMSTART +// CHECK: s_load_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x2 f32x2 "s[0:1]" x: ptr "s[2:3]", y: i32 "s4", "s_load_b64"); + +// CHECK-LABEL: v0_i16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i16x4 i16x4 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// CHECK-LABEL: v0_f16x4: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f16x4 f16x4 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// CHECK-LABEL: v0_i32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i32x2 i32x2 "v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// CHECK-LABEL: v0_f32x2: +// CHECK: #ASMSTART +// CHECK: global_load_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f32x2 f32x2 
"v[0:1]" x: i32 "v2", y: ptr "s[0:1]", "global_load_b64"); + +// gfx12-LABEL: s0_i32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(s0_i32x3 i32x3 "s[0:2]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b96"); + +// gfx12-LABEL: s0_f32x3: +// gfx12: #ASMSTART +// gfx12: s_load_b96 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(s0_f32x3 f32x3 "s[0:2]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b96"); + +// CHECK-LABEL: v0_i32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i32x3 i32x3 "v[0:2]" x: i32 "v3", y: ptr "s[0:1]", "global_load_b96"); + +// CHECK-LABEL: v0_f32x3: +// CHECK: #ASMSTART +// CHECK: global_load_b96 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f32x3 f32x3 "v[0:2]" x: i32 "v3", y: ptr "s[0:1]", "global_load_b96"); + +// CHECK-LABEL: s0_i16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x8 i16x8 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_f16x8: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x8 f16x8 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_i32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x4 i32x4 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_f32x4: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x4 f32x4 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: 
s0_i64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i64x2 i64x2 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: s0_f64x2: +// CHECK: #ASMSTART +// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f64x2 f64x2 "s[0:3]" x: ptr "s[4:5]", y: i32 "s6", "s_load_b128"); + +// CHECK-LABEL: v0_i16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i16x8 i16x8 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_f16x8: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f16x8 f16x8 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_i32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i32x4 i32x4 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_f32x4: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f32x4 f32x4 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_i64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_i64x2 i64x2 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_f64x2: +// CHECK: #ASMSTART +// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} +// CHECK: #ASMEND +check_reg!(v0_f64x2 f64x2 "v[0:3]" x: i32 "v4", y: ptr "s[0:1]", "global_load_b128"); + +// CHECK-LABEL: v0_i32x5: +// CHECK: #ASMSTART +// CHECK: image_load 
v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check_reg!(v0_i32x5 i32x5 "v[0:4]" x: i32 "v5", y: i32x8 "s[0:7]", "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// CHECK-LABEL: v0_f32x5: +// CHECK: #ASMSTART +// CHECK: image_load v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_1D tfe +// CHECK: #ASMEND +check_reg!(v0_f32x5 f32x5 "v[0:4]" x: i32 "v5", y: i32x8 "s[0:7]", "image_load", + " dmask:0xf dim:SQ_RSRC_IMG_1D tfe"); + +// gfx11-LABEL: v0_i32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i32x6 i32x4 "v[0:3]" x: i32x6 "v[4:9]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: v0_f32x6: +// gfx11: #ASMSTART +// gfx11: image_sample_d v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f32x6 i32x4 "v[0:3]" x: f32x6 "v[4:9]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: v0_i32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i32x7 i32x4 "v[0:3]" x: i32x7 "v[4:10]", y: i32x8 "s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// gfx11-LABEL: v0_f32x7: +// gfx11: #ASMSTART +// gfx11: image_sample_d_cl v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xf dim:SQ_RSRC_IMG_2D +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f32x7 i32x4 "v[0:3]" x: f32x7 "v[4:10]", y: i32x8 
"s[0:7]", z: i32x4 "s[8:11]", + "image_sample_d_cl", " dmask:0xf dim:SQ_RSRC_IMG_2D"); + +// CHECK-LABEL: s0_i16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x16 i16x16 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_f16x16: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x16 f16x16 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_i32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x8 i32x8 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_f32x8: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x8 f32x8 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_i64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i64x4 i64x4 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// CHECK-LABEL: s0_f64x4: +// CHECK: #ASMSTART +// CHECK: s_load_b256 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f64x4 f64x4 "s[0:7]" x: ptr "s[8:9]", y: i32 "s10", "s_load_b256"); + +// gfx11-LABEL: v0_i16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_bf16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i16x16 f32x8 "v[0:7]" x: i32x8 "v[8:15]", y: i16x16 "v[16:23]", z: f32x8 "v[24:31]", + "v_wmma_f32_16x16x16_bf16"); + +// gfx11-LABEL: v0_f16x16: +// gfx11: #ASMSTART +// gfx11: v_wmma_f32_16x16x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_f16x16 f32x8 "v[0:7]" x: f16x16 "v[8:15]", y: f16x16 "v[16:23]", z: f32x8 "v[24:31]", + "v_wmma_f32_16x16x16_f16"); + +// gfx11-LABEL: v0_i32x8: +// gfx11: #ASMSTART +// gfx11: v_wmma_i32_16x16x16_iu8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx11: #ASMEND +#[cfg(gfx11)] +check_reg!(v0_i32x8 i32x8 "v[0:7]" x: i32x4 "v[8:11]", y: i32x4 "v[16:19]", z: i32x8 "v[24:31]", + "v_wmma_i32_16x16x16_iu8"); + +// gfx12-LABEL: v0_f32x8: +// gfx12: #ASMSTART +// gfx12: v_wmma_f32_16x16x16_fp8_fp8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(v0_f32x8 f32x8 "v[0:7]" x: f32x2 "v[8:9]", y: f32x2 "v[16:17]", z: f32x8 "v[24:31]", + "v_wmma_f32_16x16x16_fp8_fp8"); + +// gfx12-LABEL: v0_i32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(v0_i32x10 i32x10 "v[0:9]" "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// gfx12-LABEL: v0_f32x10: +// gfx12: #ASMSTART +// gfx12: image_bvh8_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3] +// gfx12: #ASMEND +#[cfg(gfx12)] +check_reg!(v0_f32x10 f32x10 "v[0:9]" "image_bvh8_intersect_ray", + ", [v[0:1], v[2:3], v[16:18], v[19:21], v9], s[0:3]"); + +// CHECK-LABEL: s0_i16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i16x32 i16x32 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_f16x32: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f16x32 f16x32 "s[0:15]" x: ptr "s[16:17]", y: i32 
"s18", "s_load_b512"); + +// CHECK-LABEL: s0_i32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i32x16 i32x16 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_f32x16: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f32x16 f32x16 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_i64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_i64x8 i64x8 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); + +// CHECK-LABEL: s0_f64x8: +// CHECK: #ASMSTART +// CHECK: s_load_b512 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +// CHECK: #ASMEND +check_reg!(s0_f64x8 f64x8 "s[0:15]" x: ptr "s[16:17]", y: i32 "s18", "s_load_b512"); diff --git a/tests/assembly-llvm/asm/amdgpu-vec-types2.rs b/tests/assembly-llvm/asm/amdgpu-vec-types2.rs new file mode 100644 index 0000000000000..14427e3b9d528 --- /dev/null +++ b/tests/assembly-llvm/asm/amdgpu-vec-types2.rs @@ -0,0 +1,283 @@ +//@ add-minicore +//@ revisions: gfx942 gfx950 gfx1030 +//@ assembly-output: emit-asm +//@ compile-flags: --target amdgcn-amd-amdhsa +//@[gfx942] compile-flags: -Ctarget-cpu=gfx942 +//@[gfx950] compile-flags: -Ctarget-cpu=gfx950 +//@[gfx1030] compile-flags: -Ctarget-cpu=gfx1030 +//@ needs-llvm-components: amdgpu +//@ needs-rust-lld +// ignore-tidy-linelength + +// Tests for different gfx versions that do not fit in gfx11 and 12 + +#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, repr_simd, f16)] +#![crate_type = "rlib"] +#![no_core] +#![allow( + asm_sub_register, + improper_gpu_kernel_arg, + improper_ctypes_definitions, + non_camel_case_types, + unused_assignments, + unused_variables +)] + +extern crate minicore; +use minicore::*; + +type ptr = *mut u8; 
+ +#[repr(simd)] // The vector types below are named <element>x<lanes> and serve as asm operand types in the tests. +pub struct i32x4([i32; 4]); +#[repr(simd)] +pub struct f32x4([f32; 4]); + +#[repr(simd)] +pub struct f64x4([f64; 4]); + +#[repr(simd)] +pub struct i32x9([i32; 9]); +#[repr(simd)] +pub struct f32x9([f32; 9]); + +#[repr(simd)] +pub struct i32x11([i32; 11]); +#[repr(simd)] +pub struct f32x11([f32; 11]); + +#[repr(simd)] +pub struct i32x12([i32; 12]); +#[repr(simd)] +pub struct f32x12([f32; 12]); + +#[repr(simd)] +pub struct i16x32([i16; 32]); +#[repr(simd)] +pub struct f16x32([f16; 32]); +#[repr(simd)] +pub struct i32x16([i32; 16]); +#[repr(simd)] +pub struct f32x16([f32; 16]); + +#[repr(simd)] +pub struct f32x32([f32; 32]); + +macro_rules! impl_copy { // Implements `Copy` for every type listed in the invocation. + ($($ty:ident)*) => { + $( + impl Copy for $ty {} + )* + }; +} + +impl_copy!( + i32x4 f32x4 f64x4 i32x9 f32x9 i32x11 f32x11 i32x12 f32x12 i16x32 f16x32 + i32x16 f32x16 f32x32 +); + +macro_rules! check { // Generates `gpu-kernel` functions whose asm operands are bound by register class. + ($func:ident $ty:ident $class:ident $mov:literal) => { // Asm text is `$mov dst, src`, both operands in `$class`. + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " {}, {}"), out($class) y, in($class) x); + } + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal) => { // No tail given; forwards to the arm below with an empty `$tail`. + check!($func $ret_ty $ret_class $($arg_name: $arg_ty $arg_class,)* $mov, ""); + }; + + ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)* + $mov:literal, $tail:literal) => { // Asm text is `$mov {}, {arg}, ...` followed by `$tail`; the first `{}` is the output. + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " {}", $(", {", stringify!($arg_name), "}",)* $tail), + out($ret_class) result, $($arg_name = in($arg_class) $arg_name,)*); + } + }; +} + +macro_rules!
check_reg { // Like `check!`, but each operand names an explicit register that is also spliced into the asm text. + ($func:ident $ty:ident $reg:tt $mov:literal) => { + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func(x: $ty) { + let y: $ty; + asm!(concat!($mov, " ", $reg, ", ", $reg), lateout($reg) y, in($reg) x); // The same `$reg` is both the input and the `lateout` result. + } + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal) => { // No tail given; forwards to the arm below with an empty `$tail`. + check_reg!($func $ret_ty $ret_reg $($arg_name: $arg_ty $arg_reg,)* $mov, ""); + }; + + ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)* + $mov:literal, $tail:literal) => { // Asm text is `$mov <ret_reg>, <arg_reg>, ...` followed by `$tail`. + #[no_mangle] + pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) { + let result: $ret_ty; + asm!(concat!($mov, " ", $ret_reg, $(", ", $arg_reg,)* $tail), lateout($ret_reg) result, + $(in($arg_reg) $arg_name,)*); + } + }; +} + +// gfx942-LABEL: vgpr_f64x4: +// gfx942: #ASMSTART +// gfx942: v_mfma_f64_16x16x4_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] // Compiled only under `cfg(gfx942)`, presumably set by the revision of the same name. +check!(vgpr_f64x4 f64x4 vgpr256 x: f64 vgpr64, y: f64 vgpr64, z: f64x4 vgpr256, + "v_mfma_f64_16x16x4_f64"); + +// gfx1030-LABEL: vgpr_i32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_i32x9 i32x4 vgpr128 x: i32x9 vgpr288, y: i32x4 sgpr128, "image_bvh64_intersect_ray", + " a16"); // The ` a16` suffix is appended to the asm text through `$tail`. + +// gfx1030-LABEL: vgpr_f32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_f32x9 i32x4 vgpr128 x: f32x9 vgpr288, y: i32x4 sgpr128, "image_bvh64_intersect_ray", + " a16"); + +// gfx1030-LABEL: vgpr_i32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_i32x11 i32x4 vgpr128 x:
i32x11 vgpr352, y: i32x4 sgpr128, "image_bvh_intersect_ray"); + +// gfx1030-LABEL: vgpr_f32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_f32x11 i32x4 vgpr128 x: f32x11 vgpr352, y: i32x4 sgpr128, "image_bvh_intersect_ray"); + +// gfx1030-LABEL: vgpr_i32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_i32x12 i32x4 vgpr128 x: i32x12 vgpr384, y: i32x4 sgpr128, "image_bvh64_intersect_ray"); + +// gfx1030-LABEL: vgpr_f32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check!(vgpr_f32x12 i32x4 vgpr128 x: f32x12 vgpr384, y: i32x4 sgpr128, "image_bvh64_intersect_ray"); + +// gfx950-LABEL: vgpr_i32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_i32_32x32x32_i8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check!(vgpr_i32x16 i32x16 vgpr512 x: i32x4 vgpr128, y: i32x4 vgpr128, z: i16x32 vgpr512, + "v_mfma_i32_32x32x32_i8"); + +// gfx950-LABEL: vgpr_f32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_f32_32x32x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check!(vgpr_f32x16 f32x16 vgpr512 x: f32x4 vgpr128, y: f32x4 vgpr128, z: f16x32 vgpr512, + "v_mfma_f32_32x32x16_f16"); + +// gfx942-LABEL: vgpr_f32x32: +// gfx942: #ASMSTART +// gfx942: v_mfma_f32_32x32x1_2b_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] +check!(vgpr_f32x32 f32x32 vgpr1024 x: f32 vgpr32, y: f32 vgpr32, "v_mfma_f32_32x32x1_2b_f32", + ", v[0:31]"); // The last asm operand is the fixed range v[0:31], passed as text through `$tail`. + +// gfx942-LABEL:
v0_f64x4: +// gfx942: #ASMSTART +// gfx942: v_mfma_f64_16x16x4_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] // Explicit-register counterpart of `vgpr_f64x4`; every operand is pinned to the range named in the call. +check_reg!(v0_f64x4 f64x4 "v[0:7]" x: f64 "v[8:9]", y: f64 "v[10:11]", z: f64x4 "v[16:23]", + "v_mfma_f64_16x16x4_f64"); + +// gfx1030-LABEL: v0_i32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_i32x9 i32x4 "v[0:3]" x: i32x9 "v[8:16]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray", " a16"); // The ` a16` suffix is the macro's `$tail`, appended after the operands. + +// gfx1030-LABEL: v0_f32x9: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} a16 +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_f32x9 i32x4 "v[0:3]" x: f32x9 "v[8:16]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray", " a16"); + +// gfx1030-LABEL: v0_i32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_i32x11 i32x4 "v[0:3]" x: i32x11 "v[8:18]", y: i32x4 "s[0:3]", + "image_bvh_intersect_ray"); + +// gfx1030-LABEL: v0_f32x11: +// gfx1030: #ASMSTART +// gfx1030: image_bvh_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_f32x11 i32x4 "v[0:3]" x: f32x11 "v[8:18]", y: i32x4 "s[0:3]", + "image_bvh_intersect_ray"); + +// gfx1030-LABEL: v0_i32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_i32x12 i32x4 "v[0:3]" x: i32x12 "v[8:19]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray"); + +// gfx1030-LABEL: v0_f32x12: +// gfx1030: #ASMSTART +// gfx1030: image_bvh64_intersect_ray
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +// gfx1030: #ASMEND +#[cfg(gfx1030)] +check_reg!(v0_f32x12 i32x4 "v[0:3]" x: f32x12 "v[8:19]", y: i32x4 "s[0:3]", + "image_bvh64_intersect_ray"); + +// gfx950-LABEL: v0_i32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_i32_32x32x32_i8 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check_reg!(v0_i32x16 i32x16 "v[0:15]" x: i32x4 "v[16:19]", y: i32x4 "v[20:23]", z: i16x32 "v[0:15]", + "v_mfma_i32_32x32x32_i8"); // `z` is given the same v[0:15] range as the result. + +// gfx950-LABEL: v0_f32x16: +// gfx950: #ASMSTART +// gfx950: v_mfma_f32_32x32x16_f16 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx950: #ASMEND +#[cfg(gfx950)] +check_reg!(v0_f32x16 f32x16 "v[0:15]" x: f32x4 "v[16:19]", y: f32x4 "v[20:23]", z: f16x32 "v[0:15]", + "v_mfma_f32_32x32x16_f16"); // `z` is given the same v[0:15] range as the result. + +// gfx942-LABEL: v0_f32x32: +// gfx942: #ASMSTART +// gfx942: v_mfma_f32_32x32x1_2b_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} +// gfx942: #ASMEND +#[cfg(gfx942)] +check_reg!(v0_f32x32 f32x32 "v[0:31]" x: f32 "v32", y: f32 "v33", "v_mfma_f32_32x32x1_2b_f32", + ", v[0:31]"); // The tail repeats the result range v[0:31] as the last asm operand; it is text only, not a bound input.