From 23cc99d9617263f9d9f33ce4634e0d884bb15189 Mon Sep 17 00:00:00 2001
From: Manoj Vivek
Date: Fri, 22 May 2026 12:43:33 +0530
Subject: [PATCH] Add missing SASS instruction descriptions
---
.../gpuFrameDescriptions.test.ts | 4 +
.../GraphTooltipArrow/gpuFrameDescriptions.ts | 340 ++++++++++++++++--
2 files changed, 323 insertions(+), 21 deletions(-)
diff --git a/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.test.ts b/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.test.ts
index ce2db761d03..c9317485e3a 100644
--- a/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.test.ts
+++ b/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.test.ts
@@ -23,6 +23,10 @@ describe('gpuFrameInfo', () => {
['MOV', 'Move'],
['FFMA', 'FP32 Fused Multiply and Add'],
['LDG', 'Load from Global Memory'],
+ ['LDCU', 'Load a Value from Constant Memory into a Uniform Register'],
+ ['HGMMA', 'Matrix Multiply and Accumulate Across a Warpgroup'],
+ ['UTMALDG', 'Tensor Load from Global to Shared Memory'],
+ ['LDT', 'Load Matrix from Tensor Memory to Register File'],
])('returns SASS info for %s with verbatim description %j', (mnemonic, description) => {
const info = gpuFrameInfo(mnemonic);
expect(info?.kind).toBe('sass');
diff --git a/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.ts b/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.ts
index 7deff8a48a8..4a91787e4e0 100644
--- a/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.ts
+++ b/ui/packages/shared/profile/src/GraphTooltipArrow/gpuFrameDescriptions.ts
@@ -36,7 +36,7 @@ export const NVIDIA_DOCS_LABEL = 'NVIDIA docs';
export const STALL_SOURCE_URL =
'https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#warp-stall-reasons';
export const SASS_SOURCE_URL =
- 'https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#turing-turing-instruction-set-table';
+ 'https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#instruction-set-reference';
export interface StallEntry {
reasonLabel: string;
@@ -50,14 +50,35 @@ export interface SASSEntry {
sourceUrl?: string;
}
-// Ref: https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#turing-turing-instruction-set-table
+// Ref: https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#instruction-set-reference
+// It spans the Volta, Turing, Ampere, Ada, Hopper and Blackwell instruction set tables.
export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
// --- Floating Point Instructions ---
+ DADD: {reasonLabel: 'Floating Point Instructions', description: 'FP64 Add'},
+ DFMA: {reasonLabel: 'Floating Point Instructions', description: 'FP64 Fused Mutiply Add'},
+ DMMA: {reasonLabel: 'Floating Point Instructions', description: 'Matrix Multiply and Accumulate'},
+ DMUL: {reasonLabel: 'Floating Point Instructions', description: 'FP64 Multiply'},
+ DSETP: {
+ reasonLabel: 'Floating Point Instructions',
+ description: 'FP64 Compare And Set Predicate',
+ },
FADD: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Add'},
+ FADD2: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Add'},
+ FADD32I: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Add'},
FCHK: {reasonLabel: 'Floating Point Instructions', description: 'Floating-point Range Check'},
FFMA: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Fused Multiply and Add'},
+ FFMA2: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Fused Multiply and Add'},
+ FFMA32I: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Fused Multiply and Add'},
+ FHADD: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Addition'},
+ FHFMA: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Fused Multiply and Add'},
FMNMX: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Minimum/Maximum'},
+ FMNMX3: {
+ reasonLabel: 'Floating Point Instructions',
+ description: '3-Input Floating-point Minimum/Maximum',
+ },
FMUL: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Multiply'},
+ FMUL2: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Multiply'},
+ FMUL32I: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Multiply'},
FSEL: {reasonLabel: 'Floating Point Instructions', description: 'Floating Point Select'},
FSET: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Compare And Set'},
FSETP: {
@@ -65,22 +86,27 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
description: 'FP32 Compare And Set Predicate',
},
FSWZADD: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Swizzle Add'},
- MUFU: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Multi Function Operation'},
HADD2: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Add'},
+ HADD2_32I: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Add'},
HFMA2: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Fused Mutiply Add'},
+ HFMA2_32I: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Fused Multiply Add'},
HMMA: {reasonLabel: 'Floating Point Instructions', description: 'Matrix Multiply and Accumulate'},
+ HMNMX2: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Minimum/Maximum'},
HMUL2: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Multiply'},
+ HMUL2_32I: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Multiply'},
HSET2: {reasonLabel: 'Floating Point Instructions', description: 'FP16 Compare And Set'},
HSETP2: {
reasonLabel: 'Floating Point Instructions',
description: 'FP16 Compare And Set Predicate',
},
- DADD: {reasonLabel: 'Floating Point Instructions', description: 'FP64 Add'},
- DFMA: {reasonLabel: 'Floating Point Instructions', description: 'FP64 Fused Mutiply Add'},
- DMUL: {reasonLabel: 'Floating Point Instructions', description: 'FP64 Multiply'},
- DSETP: {
+ MUFU: {reasonLabel: 'Floating Point Instructions', description: 'FP32 Multi Function Operation'},
+ OMMA: {
reasonLabel: 'Floating Point Instructions',
- description: 'FP64 Compare And Set Predicate',
+ description: 'FP4 Matrix Multiply and Accumulate Across a Warp',
+ },
+ QMMA: {
+ reasonLabel: 'Floating Point Instructions',
+ description: 'FP8 Matrix Multiply and Accumulate Across a Warp',
},
// --- Integer Instructions ---
@@ -91,6 +117,7 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
IABS: {reasonLabel: 'Integer Instructions', description: 'Integer Absolute Value'},
IADD: {reasonLabel: 'Integer Instructions', description: 'Integer Addition'},
IADD3: {reasonLabel: 'Integer Instructions', description: '3-input Integer Addition'},
+ IADD32I: {reasonLabel: 'Integer Instructions', description: 'Integer Addition'},
IDP: {reasonLabel: 'Integer Instructions', description: 'Integer Dot Product and Accumulate'},
IDP4A: {reasonLabel: 'Integer Instructions', description: 'Integer Dot Product and Accumulate'},
IMAD: {reasonLabel: 'Integer Instructions', description: 'Integer Multiply And Add'},
@@ -100,17 +127,31 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
},
IMNMX: {reasonLabel: 'Integer Instructions', description: 'Integer Minimum/Maximum'},
IMUL: {reasonLabel: 'Integer Instructions', description: 'Integer Multiply'},
+ IMUL32I: {reasonLabel: 'Integer Instructions', description: 'Integer Multiply'},
ISCADD: {reasonLabel: 'Integer Instructions', description: 'Scaled Integer Addition'},
+ ISCADD32I: {reasonLabel: 'Integer Instructions', description: 'Scaled Integer Addition'},
ISETP: {reasonLabel: 'Integer Instructions', description: 'Integer Compare And Set Predicate'},
LEA: {reasonLabel: 'Integer Instructions', description: 'LOAD Effective Address'},
LOP: {reasonLabel: 'Integer Instructions', description: 'Logic Operation'},
LOP3: {reasonLabel: 'Integer Instructions', description: 'Logic Operation'},
+ LOP32I: {reasonLabel: 'Integer Instructions', description: 'Logic Operation'},
POPC: {reasonLabel: 'Integer Instructions', description: 'Population count'},
SHF: {reasonLabel: 'Integer Instructions', description: 'Funnel Shift'},
SHL: {reasonLabel: 'Integer Instructions', description: 'Shift Left'},
SHR: {reasonLabel: 'Integer Instructions', description: 'Shift Right'},
VABSDIFF: {reasonLabel: 'Integer Instructions', description: 'Absolute Difference'},
VABSDIFF4: {reasonLabel: 'Integer Instructions', description: 'Absolute Difference'},
+ VHMNMX: {reasonLabel: 'Integer Instructions', description: 'SIMD FP16 3-Input Minimum/Maximum'},
+ VIADD: {reasonLabel: 'Integer Instructions', description: 'SIMD Integer Addition'},
+ VIADDMNMX: {
+ reasonLabel: 'Integer Instructions',
+ description: 'SIMD Integer Addition and Fused Min/Max Comparison',
+ },
+ VIMNMX: {reasonLabel: 'Integer Instructions', description: 'SIMD Integer Minimum/Maximum'},
+ VIMNMX3: {
+ reasonLabel: 'Integer Instructions',
+ description: 'SIMD Integer 3-Input Minimum/Maximum',
+ },
// --- Conversion Instructions ---
F2F: {
@@ -121,19 +162,25 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
reasonLabel: 'Conversion Instructions',
description: 'Floating Point To Integer Conversion',
},
+ F2IP: {
+ reasonLabel: 'Conversion Instructions',
+ description: 'FP32 Down-Convert to Integer and Pack',
+ },
+ FRND: {reasonLabel: 'Conversion Instructions', description: 'Round To Integer'},
I2F: {
reasonLabel: 'Conversion Instructions',
description: 'Integer To Floating Point Conversion',
},
+ I2FP: {reasonLabel: 'Conversion Instructions', description: 'Integer to FP32 Convert and Pack'},
I2I: {reasonLabel: 'Conversion Instructions', description: 'Integer To Integer Conversion'},
I2IP: {
reasonLabel: 'Conversion Instructions',
description: 'Integer To Integer Conversion and Packing',
},
- FRND: {reasonLabel: 'Conversion Instructions', description: 'Round To Integer'},
// --- Movement Instructions ---
MOV: {reasonLabel: 'Movement Instructions', description: 'Move'},
+ MOV32I: {reasonLabel: 'Movement Instructions', description: 'Move'},
MOVM: {
reasonLabel: 'Movement Instructions',
description: 'Move Matrix with Transposition or Expansion',
@@ -153,58 +200,177 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
R2P: {reasonLabel: 'Predicate Instructions', description: 'Move Register To Predicate Register'},
// --- Load/Store Instructions ---
+ ATOM: {reasonLabel: 'Load/Store Instructions', description: 'Atomic Operation on Generic Memory'},
+ ATOMG: {reasonLabel: 'Load/Store Instructions', description: 'Atomic Operation on Global Memory'},
+ ATOMS: {reasonLabel: 'Load/Store Instructions', description: 'Atomic Operation on Shared Memory'},
+ CCTL: {reasonLabel: 'Load/Store Instructions', description: 'Cache Control'},
+ CCTLL: {reasonLabel: 'Load/Store Instructions', description: 'Cache Control'},
+ CCTLT: {reasonLabel: 'Load/Store Instructions', description: 'Texture Cache Control'},
+ ERRBAR: {reasonLabel: 'Load/Store Instructions', description: 'Error Barrier'},
+ FENCE: {
+ reasonLabel: 'Load/Store Instructions',
+ description: 'Memory Visibility Guarantee for Shared or Global Memory',
+ },
LD: {reasonLabel: 'Load/Store Instructions', description: 'Load from generic Memory'},
LDC: {reasonLabel: 'Load/Store Instructions', description: 'Load Constant'},
LDG: {reasonLabel: 'Load/Store Instructions', description: 'Load from Global Memory'},
+ LDGDEPBAR: {
+ reasonLabel: 'Load/Store Instructions',
+ description: 'Global Load Dependency Barrier',
+ },
+ LDGMC: {reasonLabel: 'Load/Store Instructions', description: 'Reducing Load'},
+ LDGSTS: {
+ reasonLabel: 'Load/Store Instructions',
+ description: 'Asynchronous Global to Shared Memcopy',
+ },
LDL: {reasonLabel: 'Load/Store Instructions', description: 'Load within Local Memory Window'},
LDS: {reasonLabel: 'Load/Store Instructions', description: 'Load within Shared Memory Window'},
LDSM: {
reasonLabel: 'Load/Store Instructions',
description: 'Load Matrix from Shared Memory with Element Size Expansion',
},
- ST: {reasonLabel: 'Load/Store Instructions', description: 'Store to Generic Memory'},
- STG: {reasonLabel: 'Load/Store Instructions', description: 'Store to Global Memory'},
- STL: {reasonLabel: 'Load/Store Instructions', description: 'Store to Local Memory'},
- STS: {reasonLabel: 'Load/Store Instructions', description: 'Store to Shared Memory'},
MATCH: {
reasonLabel: 'Load/Store Instructions',
description: 'Match Register Values Across Thread Group',
},
+ MEMBAR: {reasonLabel: 'Load/Store Instructions', description: 'Memory Barrier'},
QSPC: {reasonLabel: 'Load/Store Instructions', description: 'Query Space'},
- ATOM: {reasonLabel: 'Load/Store Instructions', description: 'Atomic Operation on Generic Memory'},
- ATOMS: {reasonLabel: 'Load/Store Instructions', description: 'Atomic Operation on Shared Memory'},
- ATOMG: {reasonLabel: 'Load/Store Instructions', description: 'Atomic Operation on Global Memory'},
RED: {
reasonLabel: 'Load/Store Instructions',
description: 'Reduction Operation on Generic Memory',
},
- CCTL: {reasonLabel: 'Load/Store Instructions', description: 'Cache Control'},
- CCTLL: {reasonLabel: 'Load/Store Instructions', description: 'Cache Control'},
- ERRBAR: {reasonLabel: 'Load/Store Instructions', description: 'Error Barrier'},
- MEMBAR: {reasonLabel: 'Load/Store Instructions', description: 'Memory Barrier'},
- CCTLT: {reasonLabel: 'Load/Store Instructions', description: 'Texture Cache Control'},
+ REDAS: {
+ reasonLabel: 'Load/Store Instructions',
+ description:
+ 'Asynchronous Reduction on Distributed Shared Memory With Explicit Synchronization',
+ },
+ REDG: {
+ reasonLabel: 'Load/Store Instructions',
+ description: 'Reduction Operation on Generic Memory',
+ },
+ ST: {reasonLabel: 'Load/Store Instructions', description: 'Store to Generic Memory'},
+ STAS: {
+ reasonLabel: 'Load/Store Instructions',
+ description: 'Asynchronous Store to Distributed Shared Memory With Explicit Synchronization',
+ },
+ STG: {reasonLabel: 'Load/Store Instructions', description: 'Store to Global Memory'},
+ STL: {reasonLabel: 'Load/Store Instructions', description: 'Store to Local Memory'},
+ STS: {reasonLabel: 'Load/Store Instructions', description: 'Store to Shared Memory'},
+ STSM: {reasonLabel: 'Load/Store Instructions', description: 'Store Matrix to Shared Memory'},
+ SYNCS: {reasonLabel: 'Load/Store Instructions', description: 'Sync Unit'},
// --- Uniform Datapath Instructions ---
+ CREDUX: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Coupled Reduction of a Vector Register into a Uniform Register',
+ },
+ CS2UR: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Load a Value from Constant Memory into a Uniform Register',
+ },
+ LDCU: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Load a Value from Constant Memory into a Uniform Register',
+ },
R2UR: {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Move from Vector Register to a Uniform Register',
},
+ REDUX: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Reduction of a Vector Register into a Uniform Register',
+ },
S2UR: {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Move Special Register to Uniform Register',
},
UBMSK: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Bitfield Mask'},
UBREV: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Bit Reverse'},
+ UCGABAR_ARV: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'CGA Barrier Synchronization',
+ },
+ UCGABAR_WAIT: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'CGA Barrier Synchronization',
+ },
UCLEA: {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Load Effective Address for a Constant',
},
+ UF2F: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Float-to-Float Conversion',
+ },
+ UF2FP: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform FP32 Down-convert and Pack',
+ },
+ UF2I: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Float-to-Integer Conversion',
+ },
+ UF2IP: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform FP32 Down-Convert to Integer and Pack',
+ },
+ UFADD: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform FP32 Addition'},
+ UFFMA: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform FP32 Fused Multiply-Add',
+ },
UFLO: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Find Leading One'},
+ UFMNMX: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Floating-point Minimum/Maximum',
+ },
+ UFMUL: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform FP32 Multiply'},
+ UFRND: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Round to Integer'},
+ UFSEL: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Floating-Point Select',
+ },
+ UFSET: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Floating-Point Compare and Set',
+ },
+ UFSETP: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Floating-Point Compare and Set Predicate',
+ },
+ UGETNEXTWORKID: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Get Next Work ID',
+ },
+ UI2F: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Integer to Float conversion',
+ },
+ UI2FP: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Integer to FP32 Convert and Pack',
+ },
+ UI2I: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Saturating Integer-to-Integer Conversion',
+ },
+ UI2IP: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Dual Saturating Integer-to-Integer Conversion and Packing',
+ },
+ UIABS: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Integer Absolute Value',
+ },
UIADD3: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Integer Addition'},
UIMAD: {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Uniform Integer Multiplication',
},
+ UIMNMX: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Integer Minimum/Maximum',
+ },
UISETP: {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Integer Compare and Set Uniform Predicate',
@@ -217,8 +383,11 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Uniform Load Effective Address',
},
+ ULEPC: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Load Effective PC'},
ULOP: {reasonLabel: 'Uniform Datapath Instructions', description: 'Logic Operation'},
ULOP3: {reasonLabel: 'Uniform Datapath Instructions', description: 'Logic Operation'},
+ ULOP32I: {reasonLabel: 'Uniform Datapath Instructions', description: 'Logic Operation'},
+ UMEMSETS: {reasonLabel: 'Uniform Datapath Instructions', description: 'Initialize Shared Memory'},
UMOV: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Move'},
UP2UR: {
reasonLabel: 'Uniform Datapath Instructions',
@@ -238,11 +407,35 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Uniform Register to Uniform Predicate',
},
+ UREDGR: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Reduction on Global Memory with Release',
+ },
USEL: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Select'},
+ USETMAXREG: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Release, Deallocate and Allocate Registers',
+ },
USGXT: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Sign Extend'},
USHF: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Funnel Shift'},
USHL: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Left Shift'},
USHR: {reasonLabel: 'Uniform Datapath Instructions', description: 'Uniform Right Shift'},
+ USTGR: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform Store to Global Memory with Release',
+ },
+ UVIADD: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform SIMD Integer Addition',
+ },
+ UVIMNMX: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Uniform SIMD Integer Minimum/Maximum',
+ },
+ UVIRTCOUNT: {
+ reasonLabel: 'Uniform Datapath Instructions',
+ description: 'Virtual Resource Management',
+ },
VOTEU: {
reasonLabel: 'Uniform Datapath Instructions',
description: 'Voting across SIMD Thread Group with Results in Uniform Destination',
@@ -263,6 +456,14 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
SUST: {reasonLabel: 'Surface Instructions', description: 'Surface Store'},
// --- Control Instructions ---
+ ACQBULK: {
+ reasonLabel: 'Control Instructions',
+ description: 'Wait for Bulk Release Status Warp State',
+ },
+ ACQSHMINIT: {
+ reasonLabel: 'Control Instructions',
+ description: 'Wait for Shared Memory Initialization Release Status Warp State',
+ },
BMOV: {reasonLabel: 'Control Instructions', description: 'Move Convergence Barrier State'},
BPT: {reasonLabel: 'Control Instructions', description: 'BreakPoint/Trap'},
BRA: {reasonLabel: 'Control Instructions', description: 'Relative Branch'},
@@ -284,6 +485,9 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
description: 'Synchronize Threads on a Convergence Barrier',
},
CALL: {reasonLabel: 'Control Instructions', description: 'Call Function'},
+ CGAERRBAR: {reasonLabel: 'Control Instructions', description: 'CGA Error Barrier'},
+ ELECT: {reasonLabel: 'Control Instructions', description: 'Elect a Leader Thread'},
+ ENDCOLLECTIVE: {reasonLabel: 'Control Instructions', description: 'Reset the MCOLLECTIVE mask'},
EXIT: {reasonLabel: 'Control Instructions', description: 'Exit Program'},
JMP: {reasonLabel: 'Control Instructions', description: 'Absolute Jump'},
JMX: {reasonLabel: 'Control Instructions', description: 'Absolute Jump Indirect'},
@@ -293,6 +497,7 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
},
KILL: {reasonLabel: 'Control Instructions', description: 'Kill Thread'},
NANOSLEEP: {reasonLabel: 'Control Instructions', description: 'Suspend Execution'},
+ PREEXIT: {reasonLabel: 'Control Instructions', description: 'Dependent Task Launch Hint'},
RET: {reasonLabel: 'Control Instructions', description: 'Return From Subroutine'},
RPCMOV: {reasonLabel: 'Control Instructions', description: 'PC Register Move'},
RTT: {reasonLabel: 'Control Instructions', description: 'Return From Trap'},
@@ -325,6 +530,99 @@ export const SASS_INSTRUCTION_DESCRIPTIONS: Record = {
description: 'Set Local Memory Base Address',
},
VOTE: {reasonLabel: 'Miscellaneous Instructions', description: 'Vote Across SIMD Thread Group'},
+
+ // --- Warpgroup Instructions ---
+ BGMMA: {
+ reasonLabel: 'Warpgroup Instructions',
+ description: 'Bit Matrix Multiply and Accumulate Across Warps',
+ },
+ HGMMA: {
+ reasonLabel: 'Warpgroup Instructions',
+ description: 'Matrix Multiply and Accumulate Across a Warpgroup',
+ },
+ IGMMA: {
+ reasonLabel: 'Warpgroup Instructions',
+ description: 'Integer Matrix Multiply and Accumulate Across a Warpgroup',
+ },
+ QGMMA: {
+ reasonLabel: 'Warpgroup Instructions',
+ description: 'FP8 Matrix Multiply and Accumulate Across a Warpgroup',
+ },
+ WARPGROUP: {reasonLabel: 'Warpgroup Instructions', description: 'Warpgroup Synchronization'},
+ WARPGROUPSET: {reasonLabel: 'Warpgroup Instructions', description: 'Set Warpgroup Counters'},
+
+ // --- Tensor Memory Access Instructions ---
+ UBLKCP: {reasonLabel: 'Tensor Memory Access Instructions', description: 'Bulk Data Copy'},
+ UBLKPF: {reasonLabel: 'Tensor Memory Access Instructions', description: 'Bulk Data Prefetch'},
+ UBLKRED: {
+ reasonLabel: 'Tensor Memory Access Instructions',
+ description: 'Bulk Data Copy from Shared Memory with Reduction',
+ },
+ UTMACCTL: {reasonLabel: 'Tensor Memory Access Instructions', description: 'TMA Cache Control'},
+ UTMACMDFLUSH: {
+ reasonLabel: 'Tensor Memory Access Instructions',
+ description: 'TMA Command Flush',
+ },
+ UTMALDG: {
+ reasonLabel: 'Tensor Memory Access Instructions',
+ description: 'Tensor Load from Global to Shared Memory',
+ },
+ UTMAPF: {reasonLabel: 'Tensor Memory Access Instructions', description: 'Tensor Prefetch'},
+ UTMAREDG: {
+ reasonLabel: 'Tensor Memory Access Instructions',
+ description: 'Tensor Store from Shared to Global Memory with Reduction',
+ },
+ UTMASTG: {
+ reasonLabel: 'Tensor Memory Access Instructions',
+ description: 'Tensor Store from Shared to Global Memory',
+ },
+
+ // --- Tensor Core Memory Instructions ---
+ LDT: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Load Matrix from Tensor Memory to Register File',
+ },
+ LDTM: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Load Matrix from Tensor Memory to Register File',
+ },
+ STT: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Store Matrix to Tensor Memory from Register File',
+ },
+ STTM: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Store Matrix to Tensor Memory from Register File',
+ },
+ UTCATOMSWS: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Perform Atomic operation on SW State Register',
+ },
+ UTCBAR: {reasonLabel: 'Tensor Core Memory Instructions', description: 'Tensor Core Barrier'},
+ UTCCP: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Asynchonous data copy from Shared Memory to Tensor Memory',
+ },
+ UTCHMMA: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Uniform Matrix Multiply and Accumulate',
+ },
+ UTCIMMA: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Uniform Matrix Multiply and Accumulate',
+ },
+ UTCOMMA: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Uniform Matrix Multiply and Accumulate',
+ },
+ UTCQMMA: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Uniform Matrix Multiply and Accumulate',
+ },
+ UTCSHIFT: {
+ reasonLabel: 'Tensor Core Memory Instructions',
+ description: 'Shift elements in Tensor Memory',
+ },
};
// Ref: https://docs.nvidia.com/nsight-compute/ProfilingGuide/index.html#warp-stall-reasons