xenia/src/xenia/cpu/backend/x64/x64_emitter.cc

/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2019 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/cpu/backend/x64/x64_emitter.h"
#include <stddef.h>
#include <climits>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/atomic.h"
#include "xenia/base/debugging.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "xenia/base/vec128.h"
#include "xenia/cpu/backend/x64/x64_backend.h"
#include "xenia/cpu/backend/x64/x64_code_cache.h"
#include "xenia/cpu/backend/x64/x64_function.h"
#include "xenia/cpu/backend/x64/x64_sequences.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/cpu_flags.h"
#include "xenia/cpu/function.h"
#include "xenia/cpu/function_debug_info.h"
#include "xenia/cpu/processor.h"
#include "xenia/cpu/symbol.h"
#include "xenia/cpu/thread_state.h"
DEFINE_bool(debugprint_trap_log, false,
"Log debugprint traps to the active debugger", "CPU");
DEFINE_bool(ignore_undefined_externs, true,
"Don't exit when an undefined extern is called.", "CPU");
DEFINE_bool(emit_source_annotations, false,
"Add extra movs and nops to make disassembly easier to read.",
"CPU");
namespace xe {
namespace cpu {
namespace backend {
namespace x64 {
using xe::cpu::hir::HIRBuilder;
using xe::cpu::hir::Instr;
static const size_t kMaxCodeSize = 1 * 1024 * 1024;
static const size_t kStashOffset = 32;
// static const size_t kStashOffsetHigh = 32 + 32;
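// kStashOffset is the offset from rsp, within the guest stack frame, of a
// scratch area used by StashXmm/StashConstantXmm and LoadConstantXmm to
// spill vector values; each stash slot is 16 bytes wide.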
const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = {
Xbyak::Operand::RBX, Xbyak::Operand::R10, Xbyak::Operand::R11,
Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14,
Xbyak::Operand::R15,
};
const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = {
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
};
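// Note (assumption from usage, not spelled out here): xmm0-xmm3 are left out
// of the map above so they stay free as volatile scratch registers for
// emitted sequences (e.g. e.xmm0 in x64_sequences.cc).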
X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
: CodeGenerator(kMaxCodeSize, Xbyak::AutoGrow, allocator),
processor_(backend->processor()),
backend_(backend),
code_cache_(backend->code_cache()),
allocator_(allocator) {
if (cvars::use_haswell_instructions) {
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tAVX2) ? kX64EmitAVX2 : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tFMA) ? kX64EmitFMA : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tLZCNT) ? kX64EmitLZCNT : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tBMI2) ? kX64EmitBMI2 : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tF16C) ? kX64EmitF16C : 0;
feature_flags_ |= cpu_.has(Xbyak::util::Cpu::tMOVBE) ? kX64EmitMovbe : 0;
}
if (!cpu_.has(Xbyak::util::Cpu::tAVX)) {
xe::FatalError(
"Your CPU does not support AVX, which is required by Xenia. See the "
"FAQ for system requirements at https://xenia.jp");
return;
}
}
X64Emitter::~X64Emitter() = default;
bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
uint32_t debug_info_flags, FunctionDebugInfo* debug_info,
void** out_code_address, size_t* out_code_size,
std::vector<SourceMapEntry>* out_source_map) {
SCOPE_profile_cpu_f("cpu");
// Reset.
debug_info_ = debug_info;
debug_info_flags_ = debug_info_flags;
trace_data_ = &function->trace_data();
source_map_arena_.Reset();
// Fill the generator with code.
EmitFunctionInfo func_info = {};
if (!Emit(builder, func_info)) {
return false;
}
// Copy the final code to the cache and relocate it.
*out_code_size = getSize();
*out_code_address = Emplace(func_info, function);
// Stash source map.
source_map_arena_.CloneContents(out_source_map);
return true;
}
void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function) {
// To avoid changing xbyak, we do a switcharoo here.
// top_ points to the Xbyak buffer, and since we are in AutoGrow mode
// it has pending relocations. We copy the top_ to our buffer, swap the
// pointer, relocate, then return the original scratch pointer for use.
uint8_t* old_address = top_;
void* new_address;
assert_true(func_info.code_size.total == size_);
if (function) {
new_address = code_cache_->PlaceGuestCode(function->address(), top_,
func_info, function);
} else {
new_address = code_cache_->PlaceHostCode(0, top_, func_info);
}
top_ = reinterpret_cast<uint8_t*>(new_address);
ready();
top_ = old_address;
reset();
return new_address;
}
bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
Xbyak::Label epilog_label;
epilog_label_ = &epilog_label;
// Calculate stack size. We need to align things to their natural sizes.
// This could be much better (sort by type/etc).
auto locals = builder->locals();
size_t stack_offset = StackLayout::GUEST_STACK_SIZE;
for (auto it = locals.begin(); it != locals.end(); ++it) {
auto slot = *it;
size_t type_size = GetTypeSize(slot->type);
// Align to natural size.
stack_offset = xe::align(stack_offset, type_size);
slot->set_constant((uint32_t)stack_offset);
stack_offset += type_size;
}
// Ensure 16b alignment.
stack_offset -= StackLayout::GUEST_STACK_SIZE;
stack_offset = xe::align(stack_offset, static_cast<size_t>(16));
struct _code_offsets {
size_t prolog;
size_t prolog_stack_alloc;
size_t body;
size_t epilog;
size_t tail;
} code_offsets = {};
code_offsets.prolog = getSize();
// Function prolog.
// Must be 16b aligned.
// Windows is very strict about the form of this and the epilog:
// https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=vs-2017
// IMPORTANT: any changes to the prolog must be kept in sync with
// X64CodeCache, which dynamically generates exception information.
// Adding or changing anything here must be matched!
const size_t stack_size = StackLayout::GUEST_STACK_SIZE + stack_offset;
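// On entry rsp is 8 bytes past 16-byte alignment (the caller's call pushed a
// return address), so requiring (stack_size + 8) % 16 == 0 ensures the body
// runs with the 16-byte stack alignment the Win64 ABI expects.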
assert_true((stack_size + 8) % 16 == 0);
func_info.stack_size = stack_size;
stack_size_ = stack_size;
sub(rsp, (uint32_t)stack_size);
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);
// Safe now to do some tracing.
if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) {
// We require 32-bit addresses.
assert_true(uint64_t(trace_data_->header()) < UINT_MAX);
auto trace_header = trace_data_->header();
// Call count.
lock();
inc(qword[low_address(&trace_header->function_call_count)]);
// Get call history slot.
static_assert(FunctionTraceData::kFunctionCallerHistoryCount == 4,
"bitmask depends on count");
mov(rax, qword[low_address(&trace_header->function_call_count)]);
and_(rax, 0b00000011);
// Record call history value into slot (guest addr in RDX).
mov(dword[Xbyak::RegExp(uint32_t(uint64_t(
low_address(&trace_header->function_caller_history)))) +
rax * 4],
edx);
// Calling thread. Load ax with thread ID.
EmitGetCurrentThreadId();
lock();
bts(qword[low_address(&trace_header->function_thread_use)], rax);
}
// Load membase.
mov(GetMembaseReg(),
qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]);
// Body.
auto block = builder->first_block();
while (block) {
// Mark block labels.
auto label = block->label_head;
while (label) {
L(label->name);
label = label->next;
}
// Process instructions.
const Instr* instr = block->instr_head;
while (instr) {
const Instr* new_tail = instr;
if (!SelectSequence(this, instr, &new_tail)) {
// No sequence found!
// NOTE: If you encounter this after adding a new instruction, do a full
// rebuild!
assert_always();
XELOGE("Unable to process HIR opcode %s", instr->opcode->name);
break;
}
instr = new_tail;
}
block = block->next;
}
// Function epilog.
L(epilog_label);
epilog_label_ = nullptr;
EmitTraceUserCallReturn();
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
code_offsets.epilog = getSize();
add(rsp, (uint32_t)stack_size);
ret();
code_offsets.tail = getSize();
if (cvars::emit_source_annotations) {
nop();
nop();
nop();
nop();
nop();
}
assert_zero(code_offsets.prolog);
func_info.code_size.total = getSize();
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
func_info.code_size.tail = getSize() - code_offsets.tail;
func_info.prolog_stack_alloc_offset =
code_offsets.prolog_stack_alloc - code_offsets.prolog;
return true;
}
void X64Emitter::MarkSourceOffset(const Instr* i) {
auto entry = source_map_arena_.Alloc<SourceMapEntry>();
entry->guest_address = static_cast<uint32_t>(i->src1.offset);
entry->hir_offset = uint32_t(i->block->ordinal << 16) | i->ordinal;
entry->code_offset = static_cast<uint32_t>(getSize());
if (cvars::emit_source_annotations) {
nop();
nop();
mov(eax, entry->guest_address);
nop();
nop();
}
if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctionCoverage) {
uint32_t instruction_index =
(entry->guest_address - trace_data_->start_address()) / 4;
lock();
inc(qword[low_address(trace_data_->instruction_execute_counts() +
instruction_index * 8)]);
}
}
void X64Emitter::EmitGetCurrentThreadId() {
// rsi must point to context. We could fetch from the stack if needed.
mov(ax, word[GetContextReg() + offsetof(ppc::PPCContext, thread_id)]);
}
void X64Emitter::EmitTraceUserCallReturn() {}
void X64Emitter::DebugBreak() {
// TODO(benvanik): notify debugger.
db(0xCC);
}
uint64_t TrapDebugPrint(void* raw_context, uint64_t address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
uint32_t str_ptr = uint32_t(thread_state->context()->r[3]);
// uint16_t str_len = uint16_t(thread_state->context()->r[4]);
auto str = thread_state->memory()->TranslateVirtual<const char*>(str_ptr);
// TODO(benvanik): truncate to length?
XELOGD("(DebugPrint) %s", str);
if (cvars::debugprint_trap_log) {
debugging::DebugPrint("(DebugPrint) %s", str);
}
return 0;
}
uint64_t TrapDebugBreak(void* raw_context, uint64_t address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
XELOGE("tw/td forced trap hit! This should be a crash!");
if (cvars::break_on_debugbreak) {
xe::debugging::Break();
}
return 0;
}
void X64Emitter::Trap(uint16_t trap_type) {
switch (trap_type) {
case 20:
case 26:
// 0x0FE00014 is a 'debug print' where r3 = buffer, r4 = length.
CallNative(TrapDebugPrint, 0);
break;
case 0:
case 22:
// Always trap?
// TODO(benvanik): post software interrupt to debugger.
CallNative(TrapDebugBreak, 0);
break;
case 25:
// ?
break;
default:
XELOGW("Unknown trap type %d", trap_type);
db(0xCC);
break;
}
}
void X64Emitter::UnimplementedInstr(const hir::Instr* i) {
// TODO(benvanik): notify debugger.
db(0xCC);
assert_always();
}
// This is used by the X64ThunkEmitter's ResolveFunctionThunk.
extern "C" uint64_t ResolveFunction(void* raw_context,
uint64_t target_address) {
auto thread_state = *reinterpret_cast<ThreadState**>(raw_context);
// TODO(benvanik): required?
assert_not_zero(target_address);
auto fn =
thread_state->processor()->ResolveFunction((uint32_t)target_address);
assert_not_null(fn);
auto x64_fn = static_cast<X64Function*>(fn);
uint64_t addr = reinterpret_cast<uint64_t>(x64_fn->machine_code());
return addr;
}
void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) {
assert_not_null(function);
auto fn = static_cast<X64Function*>(function);
// Resolve address to the function to call and store in rax.
if (fn->machine_code()) {
// TODO(benvanik): is it worth it to do this? It removes the need for
// a ResolveFunction call, but makes the table less useful.
assert_zero(uint64_t(fn->machine_code()) & 0xFFFFFFFF00000000);
mov(eax, uint32_t(uint64_t(fn->machine_code())));
} else if (code_cache_->has_indirection_table()) {
// Load the pointer to the indirection table maintained in X64CodeCache.
// The target dword will either contain the address of the generated code
// or a thunk to ResolveFunction.
mov(ebx, function->address());
mov(eax, dword[ebx]);
} else {
// Old-style resolve.
// Not too important because the indirection table is almost always available.
// TODO: Overwrite the call-site with a straight call.
CallNative(&ResolveFunction, function->address());
}
// Actually jump/call to rax.
if (instr->flags & hir::CALL_TAIL) {
// Since we skip the prolog we need to mark the return here.
EmitTraceUserCallReturn();
// Pass the caller's return address over.
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size()));
jmp(rax);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
}
void X64Emitter::CallIndirect(const hir::Instr* instr,
const Xbyak::Reg64& reg) {
// Check if return.
if (instr->flags & hir::CALL_POSSIBLE_RETURN) {
cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]);
je(epilog_label(), CodeGenerator::T_NEAR);
}
// Load the pointer to the indirection table maintained in X64CodeCache.
// The target dword will either contain the address of the generated code
// or a thunk to ResolveFunction.
if (code_cache_->has_indirection_table()) {
if (reg.cvt32() != ebx) {
mov(ebx, reg.cvt32());
}
mov(eax, dword[ebx]);
} else {
// Old-style resolve.
// Not too important because the indirection table is almost always available.
mov(edx, reg.cvt32());
mov(rax, reinterpret_cast<uint64_t>(ResolveFunction));
mov(rcx, GetContextReg());
call(rax);
}
// Actually jump/call to rax.
if (instr->flags & hir::CALL_TAIL) {
// Since we skip the prolog we need to mark the return here.
EmitTraceUserCallReturn();
// Pass the caller's return address over.
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]);
add(rsp, static_cast<uint32_t>(stack_size()));
jmp(rax);
} else {
// Return address is from the previous SET_RETURN_ADDRESS.
mov(rcx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]);
call(rax);
}
}
uint64_t UndefinedCallExtern(void* raw_context, uint64_t function_ptr) {
auto function = reinterpret_cast<Function*>(function_ptr);
if (!cvars::ignore_undefined_externs) {
xe::FatalError("undefined extern call to %.8X %s", function->address(),
function->name().c_str());
} else {
XELOGE("undefined extern call to %.8X %s", function->address(),
function->name().c_str());
}
return 0;
}
void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) {
bool undefined = true;
if (function->behavior() == Function::Behavior::kBuiltin) {
auto builtin_function = static_cast<const BuiltinFunction*>(function);
if (builtin_function->handler()) {
undefined = false;
// rcx = target function
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(builtin_function->handler()));
mov(rdx, reinterpret_cast<uint64_t>(builtin_function->arg0()));
mov(r8, reinterpret_cast<uint64_t>(builtin_function->arg1()));
call(rax);
// rax = host return
}
} else if (function->behavior() == Function::Behavior::kExtern) {
auto extern_function = static_cast<const GuestFunction*>(function);
if (extern_function->extern_handler()) {
undefined = false;
// rcx = target function
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(extern_function->extern_handler()));
mov(rdx,
qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]);
call(rax);
// rax = host return
}
}
if (undefined) {
CallNative(UndefinedCallExtern, reinterpret_cast<uint64_t>(function));
}
}
void X64Emitter::CallNative(void* fn) { CallNativeSafe(fn); }
void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context)) {
CallNativeSafe(reinterpret_cast<void*>(fn));
}
void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0)) {
CallNativeSafe(reinterpret_cast<void*>(fn));
}
void X64Emitter::CallNative(uint64_t (*fn)(void* raw_context, uint64_t arg0),
uint64_t arg0) {
mov(GetNativeParam(0), arg0);
CallNativeSafe(reinterpret_cast<void*>(fn));
}
void X64Emitter::CallNativeSafe(void* fn) {
// rcx = target function
// rdx = arg0
// r8 = arg1
// r9 = arg2
auto thunk = backend()->guest_to_host_thunk();
mov(rax, reinterpret_cast<uint64_t>(thunk));
mov(rcx, reinterpret_cast<uint64_t>(fn));
call(rax);
// rax = host return
}
void X64Emitter::SetReturnAddress(uint64_t value) {
mov(rax, value);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax);
}
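// Arguments to native calls ride in the Win64 parameter registers left over
// after rcx, which CallNativeSafe uses for the target function: params 0..2
// map to rdx, r8, r9.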
Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
if (param == 0)
return rdx;
else if (param == 1)
return r8;
else if (param == 2)
return r9;
assert_always();
return r9;
}
// Important: If you change these, you must update the thunks in x64_backend.cc!
Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }
void X64Emitter::ReloadContext() {
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
}
void X64Emitter::ReloadMembase() {
mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase
}
// Len  Assembly                                     Byte Sequence
// ============================================================================
// 1b   NOP                                          90H
// 2b   66 NOP                                       66 90H
// 3b   NOP DWORD ptr [EAX]                          0F 1F 00H
// 4b   NOP DWORD ptr [EAX + 00H]                    0F 1F 40 00H
// 5b   NOP DWORD ptr [EAX + EAX*1 + 00H]            0F 1F 44 00 00H
// 6b   66 NOP DWORD ptr [EAX + EAX*1 + 00H]         66 0F 1F 44 00 00H
// 7b   NOP DWORD ptr [EAX + 00000000H]              0F 1F 80 00 00 00 00H
// 8b   NOP DWORD ptr [EAX + EAX*1 + 00000000H]      0F 1F 84 00 00 00 00 00H
// 9b   66 NOP DWORD ptr [EAX + EAX*1 + 00000000H]   66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
// TODO(benvanik): fat nop
for (size_t i = 0; i < length; ++i) {
db(0x90);
}
}
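// True when v is representable as a sign-extended 32-bit immediate. For
// example, 0x000000007FFFFFFF and 0xFFFFFFFF80000000 qualify, while
// 0x0000000080000000 does not (it would sign-extend to 0xFFFFFFFF80000000).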
bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) {
if ((v & ~0x7FFFFFFF) == 0) {
// Fits in 31 bits (positive), so a normal mov will do.
return true;
} else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
// High 33 bits all set: a negative number that sign-extends from 32 bits.
return true;
}
return false;
}
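// Stores a 64-bit immediate to [addr] without a scratch register. x64 has no
// mov m64, imm64 encoding, and a 32-bit immediate store sign-extends, so
// values that fit neither form are split into two 32-bit stores.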
void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) {
if ((v & ~0x7FFFFFFF) == 0) {
// Fits in 31 bits (positive), so a single sign-extended imm32 mov works.
mov(qword[addr], v);
} else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) {
// High 33 bits all set: a negative number that sign-extends from 32 bits.
mov(qword[addr], v);
} else if (!(v >> 32)) {
// All high bits are zero. It'd be nice if we had a way to load a 32-bit
// immediate without sign extending!
// TODO(benvanik): this is super common, find a better way.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], 0);
} else {
// A full 64-bit value that needs two 32-bit movs.
mov(dword[addr], static_cast<uint32_t>(v));
mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
}
}
static const vec128_t xmm_consts[] = {
/* XMMZero */ vec128f(0.0f),
/* XMMOne */ vec128f(1.0f),
/* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
/* XMMFFFF */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMMaskX16Y16 */
vec128i(0x0000FFFFu, 0xFFFF0000u, 0x00000000u, 0x00000000u),
/* XMMFlipX16Y16 */
vec128i(0x00008000u, 0x00000000u, 0x00000000u, 0x00000000u),
/* XMMFixX16Y16 */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f),
/* XMMNormalizeX16Y16 */
vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f),
/* XMM0001 */ vec128f(0.0f, 0.0f, 0.0f, 1.0f),
/* XMM3301 */ vec128f(3.0f, 3.0f, 0.0f, 1.0f),
/* XMM3331 */ vec128f(3.0f, 3.0f, 3.0f, 1.0f),
/* XMM3333 */ vec128f(3.0f, 3.0f, 3.0f, 3.0f),
/* XMMSignMaskPS */
vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
/* XMMSignMaskPD */
vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u),
/* XMMAbsMaskPS */
vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu),
/* XMMAbsMaskPD */
vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu),
/* XMMByteSwapMask */
vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
/* XMMByteOrderMask */
vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
/* XMMPermuteControl15 */ vec128b(15),
/* XMMPermuteByteMask */ vec128b(0x1F),
/* XMMPackD3DCOLORSat */ vec128i(0x404000FFu),
/* XMMPackD3DCOLOR */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u),
/* XMMUnpackD3DCOLOR */
vec128i(0xFFFFFF0Eu, 0xFFFFFF0Du, 0xFFFFFF0Cu, 0xFFFFFF0Fu),
/* XMMPackFLOAT16_2 */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000302u),
/* XMMUnpackFLOAT16_2 */
vec128i(0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMPackFLOAT16_4 */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000302u, 0x05040706u),
/* XMMUnpackFLOAT16_4 */
vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMPackSHORT_Min */ vec128i(0x403F8001u),
/* XMMPackSHORT_Max */ vec128i(0x40407FFFu),
/* XMMPackSHORT_2 */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u),
/* XMMPackSHORT_4 */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu),
/* XMMUnpackSHORT_2 */
vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMUnpackSHORT_4 */
vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu),
/* XMMUnpackSHORT_Overflow */ vec128i(0x403F8000u),
/* XMMPackUINT_2101010_MinUnpacked */
vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u),
/* XMMPackUINT_2101010_MaxUnpacked */
vec128i(0x404001FFu, 0x404001FFu, 0x404001FFu, 0x40400003u),
/* XMMPackUINT_2101010_MaskUnpacked */
vec128i(0x3FFu, 0x3FFu, 0x3FFu, 0x3u),
/* XMMPackUINT_2101010_MaskPacked */
vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30),
/* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30),
/* XMMUnpackUINT_2101010_Overflow */ vec128i(0x403FFE00u),
/* XMMPackULONG_4202020_MinUnpacked */
vec128i(0x40380001u, 0x40380001u, 0x40380001u, 0x40400000u),
/* XMMPackULONG_4202020_MaxUnpacked */
vec128i(0x4047FFFFu, 0x4047FFFFu, 0x4047FFFFu, 0x4040000Fu),
/* XMMPackULONG_4202020_MaskUnpacked */
vec128i(0xFFFFFu, 0xFFFFFu, 0xFFFFFu, 0xFu),
/* XMMPackULONG_4202020_PermuteXZ */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0A0908FFu, 0xFF020100u),
/* XMMPackULONG_4202020_PermuteYW */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0CFFFF06u, 0x0504FFFFu),
/* XMMUnpackULONG_4202020_Permute */
vec128i(0xFF0E0D0Cu, 0xFF0B0A09u, 0xFF080F0Eu, 0xFFFFFF0Bu),
/* XMMUnpackULONG_4202020_Overflow */ vec128i(0x40380000u),
/* XMMOneOver255 */ vec128f(1.0f / 255.0f),
/* XMMMaskEvenPI16 */
vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu),
/* XMMShiftMaskEvenPI16 */
vec128i(0x0000000Fu, 0x0000000Fu, 0x0000000Fu, 0x0000000Fu),
/* XMMShiftMaskPS */
vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu),
/* XMMShiftByteMask */
vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu),
/* XMMSwapWordMask */
vec128i(0x03030303u, 0x03030303u, 0x03030303u, 0x03030303u),
/* XMMUnsignedDwordMax */
vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u),
/* XMM255 */ vec128f(255.0f),
/* XMMPI32 */ vec128i(32),
/* XMMSignMaskI8 */
vec128i(0x80808080u, 0x80808080u, 0x80808080u, 0x80808080u),
/* XMMSignMaskI16 */
vec128i(0x80008000u, 0x80008000u, 0x80008000u, 0x80008000u),
/* XMMSignMaskI32 */
vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
/* XMMSignMaskF32 */
vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
/* XMMShortMinPS */ vec128f(SHRT_MIN),
/* XMMShortMaxPS */ vec128f(SHRT_MAX),
/* XMMIntMin */ vec128i(INT_MIN),
/* XMMIntMax */ vec128i(INT_MAX),
/* XMMIntMaxPD */ vec128d(INT_MAX),
/* XMMPosIntMinPS */ vec128f((float)0x80000000u),
/* XMMQNaN */ vec128i(0x7FC00000u),
};
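// The table above must stay in XmmConst enum order: GetXmmConstPtr addresses
// entry id as emitter_data() + sizeof(vec128_t) * id.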
// First location to try and place constants.
static const uintptr_t kConstDataLocation = 0x20000000;
static const uintptr_t kConstDataSize = sizeof(xmm_consts);
// Increment the location by this amount for every allocation failure.
static const uintptr_t kConstDataIncrement = 0x00001000;
// This function places constant data that is used by the emitter later on.
// It is called only once, and the data is shared by all emitter instances.
//
// TODO(DrChat): This should be placed in the code cache with the code, but
// doing so requires RIP-relative addressing, which is difficult to support
// given the current setup.
uintptr_t X64Emitter::PlaceConstData() {
uint8_t* ptr = reinterpret_cast<uint8_t*>(kConstDataLocation);
void* mem = nullptr;
while (!mem) {
mem = memory::AllocFixed(
ptr, xe::round_up(kConstDataSize, memory::page_size()),
memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite);
ptr += kConstDataIncrement;
}
// The resulting pointer must fit in 31 bits so it can be encoded as a
// sign-extended 32-bit displacement.
assert_zero(reinterpret_cast<uintptr_t>(mem) & ~0x7FFFFFFF);
std::memcpy(mem, xmm_consts, sizeof(xmm_consts));
memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr);
return reinterpret_cast<uintptr_t>(mem);
}
void X64Emitter::FreeConstData(uintptr_t data) {
memory::DeallocFixed(reinterpret_cast<void*>(data), 0,
memory::DeallocationType::kRelease);
}
Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
// Load through the fixed constant table set up by PlaceConstData.
// The pointer must not have the sign bit set, as 32-bit displacements are
// sign-extended.
return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
sizeof(vec128_t) * id)];
}
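// Illustrative (hypothetical) call site; real users live in x64_sequences.cc:
//   e.vandps(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));  // fabs ps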
// Implies possible StashXmm(0, ...)! Constants other than all-zeros/all-ones
// are materialized through the stash slot at kStashOffset, clobbering
// anything previously stashed at index 0.
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
// https://www.agner.org/optimize/optimizing_assembly.pdf
// 13.4 Generating constants
if (!v.low && !v.high) {
// 0000...
vpxor(dest, dest);
} else if (v.low == ~0ull && v.high == ~0ull) {
// 1111...
vpcmpeqb(dest, dest);
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
MovMem64(rsp + kStashOffset, v.low);
MovMem64(rsp + kStashOffset + 8, v.high);
vmovdqa(dest, ptr[rsp + kStashOffset]);
}
}
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
union {
float f;
uint32_t i;
} x = {v};
if (!v) {
// 0
vpxor(dest, dest);
} else if (x.i == ~0U) {
// 1111...
vpcmpeqb(dest, dest);
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
mov(eax, x.i);
vmovd(dest, eax);
}
}
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
union {
double d;
uint64_t i;
} x = {v};
if (!v) {
// 0
vpxor(dest, dest);
} else if (x.i == ~0ULL) {
// 1111...
vpcmpeqb(dest, dest);
} else {
// TODO(benvanik): see what other common values are.
// TODO(benvanik): build constant table - 99% are reused.
mov(rax, x.i);
vmovq(dest, rax);
}
}
Xbyak::Address X64Emitter::StashXmm(int index, const Xbyak::Xmm& r) {
auto addr = ptr[rsp + kStashOffset + (index * 16)];
vmovups(addr, r);
return addr;
}
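// The StashConstantXmm variants below write a constant directly into the
// requested stash slot with MovMem64, never touching an xmm register. This
// avoids the LoadConstantXmm detour through stash slot 0, which could clobber
// an operand already stashed there (the src1 corruption formerly seen in
// EmulateVectorRotateLeft calls).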
Xbyak::Address X64Emitter::StashConstantXmm(int index, float v) {
union {
float f;
uint32_t i;
} x = {v};
auto addr = rsp + kStashOffset + (index * 16);
MovMem64(addr, x.i);
MovMem64(addr + 8, 0);
return ptr[addr];
}
Xbyak::Address X64Emitter::StashConstantXmm(int index, double v) {
union {
double d;
uint64_t i;
} x = {v};
auto addr = rsp + kStashOffset + (index * 16);
MovMem64(addr, x.i);
MovMem64(addr + 8, 0);
return ptr[addr];
}
Xbyak::Address X64Emitter::StashConstantXmm(int index, const vec128_t& v) {
auto addr = rsp + kStashOffset + (index * 16);
MovMem64(addr, v.low);
MovMem64(addr + 8, v.high);
return ptr[addr];
}
} // namespace x64
} // namespace backend
} // namespace cpu
} // namespace xe