mirror of
https://github.com/xenia-project/xenia.git
synced 2025-12-06 07:12:03 +01:00
Pack local_slot and constant in hir::Value. Instead of loading membase at the start of every function, just load it in HostToGuestThunk. vzeroupper in GuestToHostThunk before calling the host function, and in HostToGuestThunk after calling the function, to prevent AVX dirty state slowdowns. In the future, check if the CPU implements AVX as 128x2 and skip if so (https://john-h-k.github.io/VexTransitionPenalties.html). Remove useless save/restore of the ctx pointer; nothing modifies it and it prevents CPUs from doing cross-function memory renaming (https://www.agner.org/forum/viewtopic.php?t=41). Could not remove the space on the stack because of alignment issues; instead turned it into GUEST_SCRATCH64, which is a temporary that sequences may use. Reorder OpcodeInfo so that name is at offset 0, remove name and add a GetOpcodeName function (name is only used for debug code; we are separating frequently accessed data and rarely accessed data). Add VECTOR_DENORMFLUSH opcode for handling output to DOT_PRODUCT and other opcodes that implicitly force denormal inputs/outputs to zero; will eventually use it for implementing NJM. Rewrite sequences for LOAD_VECTOR_SHL/SHR. The mask with 0xf in it was pointless, as all InstrEmit_ functions that create the load shift instructions do that in HIR. The tables are only used for nonzero constant inputs now, which are probably pretty rare. Instead of doing a shift and lookup, a base value is used for both in the constant table and adding/subtracting of the input is done. Reuse the result of LoadVectorShl/Shr in InstrEmit_stvlx_, InstrEmit_stvrx_. We were previously calculating it twice, which was contributing to the final sequences' fatness.
Use OPCODE_SELECT instead of the sequence of or, andnot, and that it was using for merging. Add the proper unconditional denormal input flushing behavior to vfmadd; add it also to vfmsub (making the assumption it has the same behavior). Remove constant propagation for DOT_PRODUCT_3/4. DOT_PRODUCT_3/4 now returns a vector with all four elements set to the result (what we were doing before, truncating to float32 and then splatting, didn't make any sense). Add much more correct versions of DOT_PRODUCT_3/4, matching the Xbox 360's to 1 bit. Still needs work to be a perfect emulation. Add constant folding for OPCODE_SELECT, OPCODE_INSERT, OPCODE_PERMUTE, OPCODE_SWIZZLE. Remove constant folding for DOT_PRODUCT. Removed the multibyte nop code I committed earlier; it doesn't help us much because nops are only used for debug stuff, and it's ugly and wouldn't survive in a PR to main. Check for AVX512BMI, use vpermb to shuffle if supported.
137 lines
5.1 KiB
C++
137 lines
5.1 KiB
C++
/**
|
|
******************************************************************************
|
|
* Xenia : Xbox 360 Emulator Research Project *
|
|
******************************************************************************
|
|
* Copyright 2015 Ben Vanik. All rights reserved. *
|
|
* Released under the BSD license - see LICENSE in the root for more details. *
|
|
******************************************************************************
|
|
*/
|
|
|
|
#ifndef XENIA_CPU_BACKEND_X64_X64_STACK_LAYOUT_H_
|
|
#define XENIA_CPU_BACKEND_X64_X64_STACK_LAYOUT_H_
|
|
|
|
#include "xenia/base/vec128.h"
|
|
#include "xenia/cpu/backend/x64/x64_backend.h"
|
|
#include "xenia/cpu/backend/x64/x64_emitter.h"
|
|
|
|
namespace xe {
|
|
namespace cpu {
|
|
namespace backend {
|
|
namespace x64 {
|
|
|
|
class StackLayout {
|
|
public:
|
|
/**
|
|
* Stack Layout
|
|
* ----------------------------
|
|
* NOTE: stack must always be 16b aligned.
|
|
*
|
|
* Thunk stack:
|
|
* Non-Volatile Volatile
|
|
* +------------------+------------------+
|
|
* | arg temp, 3 * 8 | arg temp, 3 * 8 | rsp + 0x000
|
|
* | | |
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | rbx | (unused) | rsp + 0x018
|
|
* +------------------+------------------+
|
|
* | rbp | rcx | rsp + 0x020
|
|
* +------------------+------------------+
|
|
* | rcx (Win32) | rdx | rsp + 0x028
|
|
* +------------------+------------------+
|
|
* | rsi (Win32) | rsi (Linux) | rsp + 0x030
|
|
* +------------------+------------------+
|
|
* | rdi (Win32) | rdi (Linux) | rsp + 0x038
|
|
* +------------------+------------------+
|
|
* | r12 | r8 | rsp + 0x040
|
|
* +------------------+------------------+
|
|
* | r13 | r9 | rsp + 0x048
|
|
* +------------------+------------------+
|
|
* | r14 | r10 | rsp + 0x050
|
|
* +------------------+------------------+
|
|
* | r15 | r11 | rsp + 0x058
|
|
* +------------------+------------------+
|
|
* | xmm6 (Win32) | (unused) | rsp + 0x060
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm7 (Win32) | xmm1 | rsp + 0x070
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm8 (Win32) | xmm2 | rsp + 0x080
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm9 (Win32) | xmm3 | rsp + 0x090
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm10 (Win32) | xmm4 | rsp + 0x0A0
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm11 (Win32) | xmm5 | rsp + 0x0B0
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm12 (Win32) | (unused) | rsp + 0x0C0
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm13 (Win32) | (unused) | rsp + 0x0D0
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm14 (Win32) | (unused) | rsp + 0x0E0
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | xmm15 (Win32) | (unused) | rsp + 0x0F0
|
|
* | | |
|
|
* +------------------+------------------+
|
|
* | (return address) | (return address) | rsp + 0x100
|
|
* +------------------+------------------+
|
|
* | (rcx home) | (rcx home) | rsp + 0x108
|
|
* +------------------+------------------+
|
|
* | (rdx home) | (rdx home) | rsp + 0x110
|
|
* +------------------+------------------+
|
|
*/
|
|
XEPACKEDSTRUCT(Thunk, {
|
|
uint64_t arg_temp[3];
|
|
uint64_t r[9];
|
|
vec128_t xmm[10];
|
|
});
|
|
static_assert(sizeof(Thunk) % 16 == 0,
|
|
"sizeof(Thunk) must be a multiple of 16!");
|
|
static const size_t THUNK_STACK_SIZE = sizeof(Thunk) + 8;
|
|
|
|
/**
|
|
*
|
|
*
|
|
* Guest stack:
|
|
* +------------------+
|
|
* | arg temp, 3 * 8 | rsp + 0
|
|
* | |
|
|
* | |
|
|
* +------------------+
|
|
* | scratch, 48b | rsp + 32
|
|
* | |
|
|
* +------------------+
|
|
* | rcx / context | rsp + 80
|
|
* +------------------+
|
|
* | guest ret addr | rsp + 88
|
|
* +------------------+
|
|
* | call ret addr | rsp + 96
|
|
* +------------------+
|
|
* ... locals ...
|
|
* +------------------+
|
|
* | (return address) |
|
|
* +------------------+
|
|
*
|
|
*/
|
|
static const size_t GUEST_STACK_SIZE = 104;
|
|
//was GUEST_CTX_HOME, can't remove because that'd throw stack alignment off. instead, can be used as a temporary in sequences
|
|
static const size_t GUEST_SCRATCH64 = 80;
|
|
static const size_t GUEST_RET_ADDR = 88;
|
|
static const size_t GUEST_CALL_RET_ADDR = 96;
|
|
};
|
|
|
|
} // namespace x64
|
|
} // namespace backend
|
|
} // namespace cpu
|
|
} // namespace xe
|
|
|
|
#endif // XENIA_CPU_BACKEND_X64_X64_STACK_LAYOUT_H_
|