Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 153 additions & 2 deletions code/qcommon/vm_powerpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,26 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#define USE_ISA_2_07 0
#endif

// ISA 3.0 (POWER9): modsw, moduw instructions
// These replace the 3-instruction divw+mullw+sub sequence for modulo.
// ISA 3.0 (POWER9): modsw, moduw instructions, plus mtvsrws (splat) which
// enables a memory-free GPR -> single-precision FPR move (mov_sx_rx).
#if defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__)
#define USE_ISA_3_0 1
#else
#define USE_ISA_3_0 0
#endif

// ISA 3.1 (POWER10): prefixed instructions (paddi/pli, plxx/pstxx).
// 34-bit signed displacement/immediate in one logical instruction (8 bytes).
// Hardware constraint: prefix+suffix must not cross a 64-byte boundary.
// We emit each prefixed instruction as a stable 12-byte block (always three
// 4-byte words) so per-pass code size is deterministic regardless of local
// alignment — see emit_prefixed() for the layout.
#if defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__)
#define USE_ISA_3_1 1
#else
#define USE_ISA_3_1 0
#endif

#define NUM_PASSES 3 // INIT, EXPAND, FINAL + alloc + FINAL

#define PASS_INIT 0
Expand Down Expand Up @@ -675,6 +687,39 @@ static void VM_FreeBuffers( void )

#endif

// -- ISA 3.1 (POWER10) prefixed instructions --
//
// A prefixed instruction is a 4-byte prefix word followed by a 4-byte suffix
// instruction (8 bytes total, one logical instruction). The prefix carries the
// upper 18 bits of a 34-bit signed displacement/immediate; the suffix carries
// the lower 16 bits and the GPR/FPR selectors.
//
// Two prefix forms are used here:
// MLS (Modified Load/Store): suffix is an existing D-form instruction
// (lwz/stw/lbz/lhz/lha/stb/sth/lfs/lfd/stfs/stfd) or addi (paddi/pli).
// 8LS (8-byte Load/Store): suffix uses dedicated opcodes (57 for pld,
// 61 for pstd) with a 16-bit d field — distinct from the standalone
// DS-form ld/std which use opcodes 58/62.
//
// Hardware constraint: a prefix+suffix pair must NOT cross a 64-byte boundary.
// emit_prefixed() enforces this with a 12-byte stable layout (see below).
#if USE_ISA_3_1
// MLS-form prefix word. R=0 selects (RA|0)+d addressing; R=1 selects PC-relative.
#define PPC_PREFIX_MLS(R, d34) \
( 0x06000000u | (((R)&1u)<<20) | \
((((uint32_t)((int64_t)(d34) >> 16))) & 0x3FFFFu) )

// 8LS-form prefix word.
#define PPC_PREFIX_8LS(R, d34) \
( 0x04000000u | (((R)&1u)<<20) | \
((((uint32_t)((int64_t)(d34) >> 16))) & 0x3FFFFu) )

// Suffix for pld rt, d(ra) (8LS form, opcode 57).
#define PPC_PLD_SUFFIX(rt, ra, d34) PPC_D(57, rt, ra, (uint16_t)(d34))
// Suffix for pstd rs, d(ra) (8LS form, opcode 61).
#define PPC_PSTD_SUFFIX(rs, ra, d34) PPC_D(61, rs, ra, (uint16_t)(d34))
#endif

// -- Trap --
// trap (tw 31, 0, 0)
#define PPC_TRAP() PPC_X(31, 31, 0, 0, 4, 0)
Expand Down Expand Up @@ -707,6 +752,38 @@ static void emitAlign( int align )
}


#if USE_ISA_3_1
// Emit a prefixed instruction (POWER10).
//
// Hardware requires the 8-byte prefix+suffix pair not to cross a 64-byte
// boundary. We always emit a 12-byte block (three 4-byte words) so that
// instruction sizes stay deterministic across the multi-pass compile — a
// per-site alignment-dependent NOP would let alignment cascade between
// passes and break the existing jumpSizeChanged convergence logic.
//
// The 8-byte prefix+suffix lives at an 8-byte-aligned offset within the
// block, so it can never start at offset 60 mod 64 (the only problematic
// position for a 4-byte-aligned 8-byte instruction):
// compiledOfs & 4 == 0 => prefix, suffix, NOP
// compiledOfs & 4 == 4 => NOP, prefix, suffix
// Either way the prefix sits at an 8-byte-aligned offset and the block is
// 12 bytes. POWER10 elides leading/trailing NOPs in dispatch, so the pad
// has near-zero runtime cost.
static void emit_prefixed( uint32_t prefix, uint32_t suffix )
{
if ( ( compiledOfs & 4 ) == 0 ) {
emit( prefix );
emit( suffix );
emit( PPC_NOP() );
} else {
emit( PPC_NOP() );
emit( prefix );
emit( suffix );
}
}
#endif


static int getAlign( int align )
{
return PAD( compiledOfs, align ) - compiledOfs;
Expand Down Expand Up @@ -738,6 +815,19 @@ static void emit_MOVi64( int rt, uint64_t imm )
return;
}

#if USE_ISA_3_1
// fits in signed 34-bit: one pli (12 bytes incl. alignment pad) instead of
// the 5-instruction lis+ori+sldi+oris+ori sequence (20 bytes).
{
int64_t simm = (int64_t)imm;
if ( simm >= -(((int64_t)1) << 33) && simm < (((int64_t)1) << 33) ) {
emit_prefixed( PPC_PREFIX_MLS( 0, simm ),
PPC_ADDI( rt, R0, (uint16_t)simm ) );
return;
}
}
#endif

// full 64-bit
uint16_t w3 = (imm >> 48) & 0xFFFF;
uint16_t w2 = (imm >> 32) & 0xFFFF;
Expand Down Expand Up @@ -898,6 +988,12 @@ static void mov_rx_local( uint32_t reg, const uint32_t addr )
if ( (int16_t)addr == addr ) {
emit( PPC_ADDI( reg, rPSTACK, addr) );
} else {
#if USE_ISA_3_1
// addr (uint32_t) always fits in signed 34-bit; one paddi avoids the
// temp register and replaces the lis+ori+add sequence.
emit_prefixed( PPC_PREFIX_MLS( 0, (int32_t)addr ),
PPC_ADDI( reg, rPSTACK, (uint16_t)addr ) );
#else
if ( find_rx_const( addr ) ) {
uint32_t rx = alloc_rx_const( R6, addr ); // R6 = const addr
emit( PPC_ADD( reg, rPSTACK, rx ) ); // reg = pstack + R6
Expand All @@ -906,6 +1002,7 @@ static void mov_rx_local( uint32_t reg, const uint32_t addr )
mov_rx_imm32( reg, addr ); // reg = addr
emit( PPC_ADD( reg, rPSTACK, reg ) ); // reg = pStack + reg
}
#endif
}
}

Expand Down Expand Up @@ -1104,8 +1201,14 @@ static void emit_CheckJump( vm_t *vm, int reg, int proc_base, int proc_len )
if ( (int16_t)proc_base == proc_base ) {
emit( PPC_ADDI( R11, reg, -proc_base ) ); // r11 = reg - proc_base
} else {
#if USE_ISA_3_1
// One paddi handles the full int32 range without a temp register.
emit_prefixed( PPC_PREFIX_MLS( 0, -(int64_t)proc_base ),
PPC_ADDI( R11, reg, (uint16_t)(-proc_base) ) );
#else
mov_rx_imm32( R11, proc_base );
emit( PPC_SUB( R11, reg, R11 ) );
#endif
}
// check if r11 > proc_len (unsigned, so negative wraps to large)
mov_rx_imm32( R12, proc_len );
Expand Down Expand Up @@ -2053,10 +2156,16 @@ qboolean VM_Compile( vm_t *vm, vmHeader_t *header )
// short offset
emit( PPC_LFS( sx[0], var.addr, var.base ) ); // F0 = varBase[var.addr]
} else {
#if USE_ISA_3_1
// long offset via plfs (no temp register).
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_LFS( sx[0], (uint16_t)var.addr, var.base ) );
#else
// long offset
rx[0] = alloc_rx_const( R4, var.addr ); // R4 = var.addr
emit( PPC_LFSX( sx[0], rx[0], var.base ) ); // F0 = var.base[R4]
unmask_rx( rx[0] );
#endif
}
set_sx_var( sx[0], &var ); // update metadata
}
Expand Down Expand Up @@ -2111,6 +2220,23 @@ qboolean VM_Compile( vm_t *vm, vmHeader_t *header )
case OP_LOAD4: emit( PPC_LWZ( rx[0], var.addr, var.base ) ); var.size = 4; set_rx_ext( rx[0], Z_NONE ); break; // R3 = (dword)var.base[var.addr]
}
} else {
#if USE_ISA_3_1
// long offset via plbz/plhz/plwz (no temp register).
switch ( ci->op ) {
case OP_LOAD1:
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_LBZ( rx[0], (uint16_t)var.addr, var.base ) );
var.size = 1; set_rx_ext( rx[0], Z_EXT8 ); break;
case OP_LOAD2:
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_LHZ( rx[0], (uint16_t)var.addr, var.base ) );
var.size = 2; set_rx_ext( rx[0], Z_EXT16 ); break;
case OP_LOAD4:
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_LWZ( rx[0], (uint16_t)var.addr, var.base ) );
var.size = 4; set_rx_ext( rx[0], Z_NONE ); break;
}
#else
// long offset, use indexed form
rx[1] = alloc_rx_const( R4, var.addr ); // R4 = var.addr
switch ( ci->op ) {
Expand All @@ -2119,6 +2245,7 @@ qboolean VM_Compile( vm_t *vm, vmHeader_t *header )
case OP_LOAD4: emit( PPC_LWZX( rx[0], rx[1], var.base ) ); var.size = 4; set_rx_ext( rx[0], Z_NONE ); break; // R3 = var.base[R4] (word)
}
unmask_rx( rx[1] );
#endif
}
set_rx_var( rx[0], &var ); // update metadata for destination register
}
Expand Down Expand Up @@ -2157,10 +2284,16 @@ qboolean VM_Compile( vm_t *vm, vmHeader_t *header )
// short offset
emit( PPC_STFS( sx[0], var.addr, var.base ) ); // var.base[var.addr] = F0
} else {
#if USE_ISA_3_1
// long offset via pstfs (no temp register).
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_STFS( sx[0], (uint16_t)var.addr, var.base ) );
#else
// long offset
rx[0] = alloc_rx_const( R4, var.addr ); // R4 = var.addr
emit( PPC_STFSX( sx[0], rx[0], var.base ) ); // var.base[R4] = F0
unmask_rx( rx[0] );
#endif
}
wipe_var_range( &var ); // clear mappings for affected area
set_sx_var( sx[0], &var ); // update metadata
Expand Down Expand Up @@ -2189,6 +2322,23 @@ qboolean VM_Compile( vm_t *vm, vmHeader_t *header )
default: emit( PPC_STW( rx[0], var.addr, var.base ) ); var.size = 4; break; // (word) var.base[var.addr] = R3
}
} else {
#if USE_ISA_3_1
// long offset via pstb/psth/pstw (no temp register).
switch ( ci->op ) {
case OP_STORE1:
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_STB( rx[0], (uint16_t)var.addr, var.base ) );
var.size = 1; break;
case OP_STORE2:
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_STH( rx[0], (uint16_t)var.addr, var.base ) );
var.size = 2; break;
default:
emit_prefixed( PPC_PREFIX_MLS( 0, var.addr ),
PPC_STW( rx[0], (uint16_t)var.addr, var.base ) );
var.size = 4; break;
}
#else
// long offset
rx[1] = alloc_rx_const( R4, var.addr ); // R4 = var.addr
switch ( ci->op ) {
Expand All @@ -2197,6 +2347,7 @@ qboolean VM_Compile( vm_t *vm, vmHeader_t *header )
default: emit( PPC_STWX( rx[0], rx[1], var.base ) ); var.size = 4; break; // (word) var.base[R4] = R3
}
unmask_rx( rx[1] );
#endif
}
wipe_var_range( &var ); // erase mappings for written memory area
set_rx_var( rx[0], &var ); // update metadata for memory
Expand Down
Loading